solana/ci/testnet-deploy.sh

166 lines
4.3 KiB
Bash
Raw Normal View History

#!/bin/bash
#
# Deploys the Solana software running on the testnet full nodes
#
# This script must be run by a user/machine that has successfully authenticated
# with GCP and has sufficient permission.
#
# Required environment:
#   SOLANA_METRICS_CONFIG - must be non-empty; forwarded to every node as the
#                           snap's metrics-config setting
#

# Enable errexit here rather than on the shebang line so it is still in effect
# when the script is started as `bash ci/testnet-deploy.sh`.
set -e

# Run from the repository root (the parent of this script's directory)
cd "$(dirname "$0")/.."

# TODO: Switch over to rolling updates
ROLLING_UPDATE=false
#ROLLING_UPDATE=true

if [[ -z $SOLANA_METRICS_CONFIG ]]; then
  echo "Error: SOLANA_METRICS_CONFIG environment variable is unset" >&2
  exit 1
fi
2018-07-16 09:42:54 -07:00
# Default to edge channel.  To select the beta channel:
#   export SOLANA_SNAP_CHANNEL=beta
: "${SOLANA_SNAP_CHANNEL:=edge}"

# Map the snap channel to the testnet it feeds:
#   publicUrl - public DNS name of the testnet
#   publicIp  - leader address handed to validators as leader-address; an empty
#               value is used on purpose for beta ("Use default value")
case "$SOLANA_SNAP_CHANNEL" in
edge)
  publicUrl=master.testnet.solana.com
  publicIp=$(dig +short "$publicUrl" | head -n1)
  # The pipeline's status is that of |head|, so a failed lookup would otherwise
  # go unnoticed and the edge validators would be configured with an empty
  # leader address.  Stop here instead.
  if [[ -z $publicIp ]]; then
    echo "Error: unable to resolve $publicUrl" >&2
    exit 1
  fi
  ;;
beta)
  publicUrl=testnet.solana.com
  publicIp="" # Use default value
  ;;
*)
  echo "Error: Unknown SOLANA_SNAP_CHANNEL=$SOLANA_SNAP_CHANNEL" >&2
  exit 1
  ;;
esac
2018-07-16 11:01:07 -07:00
# Instance names are derived from the public URL with every '.' replaced by '-'
resourcePrefix=${publicUrl//./-}
validatorNamePrefix=$resourcePrefix-validator-

# |vmlist| holds "name:zone" entries.  Leader is hard coded as the first entry
vmlist=("$resourcePrefix":us-west1-b)

echo "--- Available validators for $publicUrl"
filter="name~^$validatorNamePrefix"

# Human-readable listing for the build log
gcloud compute instances list --filter="$filter"

# Machine-readable listing: append every RUNNING validator, warn about the rest
while read -r vmName vmZone status; do
  case $status in
  RUNNING)
    vmlist+=("$vmName:$vmZone")
    ;;
  *)
    echo "Warning: $vmName is not RUNNING, ignoring it."
    ;;
  esac
done < <(gcloud compute instances list --filter="$filter" --format 'value(name,zone,status)')
2018-07-12 20:08:35 -07:00
2018-07-17 19:39:43 -07:00
#######################################
# Waits for a background deployment job, then prints its captured log.
# Arguments: $1 - pid of a background job started by this shell; its output
#                 is expected in log-<pid>.txt in the current directory
# Outputs:   the log file contents, followed by "^^^ +++" if the job failed
#            (presumably a marker for the CI log viewer - confirm)
# Returns:   normally when the job succeeded; otherwise terminates the calling
#            shell with exit status 1
#######################################
wait_for_node() {
  local pid=$1
  local rc=0

  wait "$pid" || rc=$?

  # Always show the log, whether the job succeeded or not
  cat "log-$pid.txt"

  if ((rc != 0)); then
    echo "^^^ +++"
    exit 1
  fi
}
2018-07-18 20:52:14 -07:00
if ! $ROLLING_UPDATE; then
  # Non-rolling deployment: contact every node (leader first) in parallel
  # before any node is refreshed, then wait for all of those ssh sessions.
  #
  # NOTE(review): the remote command is wrapped in `echo`, so each node only
  # prints "sudo snap remove solana" and nothing is actually removed.  This
  # looks like a leftover dry-run guard - confirm before dropping the `echo`.
  total=${#vmlist[*]}
  for ((i = 0; i < total; i++)); do
    entry=${vmlist[i]}
    ordinal=$((i + 1))
    name=${entry%:*}
    zone=${entry#*:}

    echo "--- Shutting down $name in zone $zone ($ordinal/$total)"
    gcloud compute ssh "$name" --zone "$zone" \
      --ssh-flag="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \
      --command="echo sudo snap remove solana" &

    # Slow down deployment to avoid triggering GCP login
    # quota limits (each |ssh| counts as a login)
    if ((ordinal % 5 == 0)); then
      sleep 3
    fi
  done

  # Barrier: let every background ssh finish (their exit codes are ignored)
  wait
fi
2018-07-17 19:39:43 -07:00
2018-07-16 09:42:54 -07:00
# Refresh every node in |vmlist|.  The first entry is the leader and is
# refreshed synchronously; the remaining entries (validators) are refreshed in
# parallel background jobs and collected at the end.  Each job's output is
# captured in log-<pid>.txt so wait_for_node can print it once the job is done.
echo "--- Refreshing leader for $publicUrl"
leader=true
pids=()
count=1
for info in "${vmlist[@]}"; do
  nodePosition="($count/${#vmlist[*]})"
  vmName=${info%:*}
  vmZone=${info#*:}
  echo "Starting refresh for $vmName $nodePosition"

  # Open the log in this shell *before* forking.  A redirection attached to a
  # background subshell is performed by the child after the fork, so the |mv|
  # below could otherwise run before the file exists and abort the deploy.
  exec 3> "log-$vmName.txt"
  (
    SECONDS=0
    echo "--- $vmName in zone $vmZone $nodePosition"

    # Snap settings shared by the leader and the validators
    commonNodeConfig="\
      rust-log=$RUST_LOG \
      default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \
      metrics-config=$SOLANA_METRICS_CONFIG \
    "
    if $leader; then
      nodeConfig="mode=leader+drone $commonNodeConfig"
      if [[ -n $SOLANA_CUDA ]]; then
        nodeConfig="$nodeConfig enable-cuda=1"
      fi
    else
      nodeConfig="mode=validator leader-address=$publicIp $commonNodeConfig"
    fi

    set -x
    # $(date), $RANDOM, $SOLANA_SNAP_CHANNEL and $nodeConfig are expanded
    # locally; only the backslash-escaped \$logmarker is expanded on the node.
    #
    # \$logmarker must be quoted for |logger|: the default |date| output pads
    # single-digit days with an extra space, and unquoted word splitting would
    # collapse it, so the marker written to syslog would no longer match the
    # pattern given to |grep| and the job would fail under the remote |set -e|.
    gcloud compute ssh "$vmName" --zone "$vmZone" \
      --ssh-flag="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -t" \
      --command="\
        set -ex; \
        logmarker='solana deploy $(date)/$RANDOM'; \
        sudo snap remove solana; \
        logger \"\$logmarker\"; \
        sudo snap install solana --$SOLANA_SNAP_CHANNEL --devmode; \
        sudo snap set solana $nodeConfig; \
        snap info solana; \
        echo Slight delay to get more syslog output; \
        sleep 2; \
        sudo grep -Pzo \"\$logmarker(.|\\n)*\" /var/log/syslog \
      "
    echo "Succeeded in ${SECONDS} seconds"
  ) >&3 2>&1 3>&- &
  pid=$!
  exec 3>&-

  # Rename log file so it can be discovered later by $pid
  mv "log-$vmName.txt" "log-$pid.txt"

  if $leader; then
    echo Waiting for leader...
    # Wait for the leader to initialize before starting the validators
    # TODO: Remove this limitation eventually.
    wait_for_node "$pid"
    echo "--- Refreshing validators"
  else
    # Slow down deployment to ~20 machines a minute to avoid triggering GCP login
    # quota limits (each |ssh| counts as a login)
    sleep 3
    pids+=("$pid")
  fi
  leader=false
  count=$((count + 1))
done

# Collect the validators in launch order; the first failure aborts the script
echo --- Waiting for validators
for pid in "${pids[@]}"; do
  wait_for_node "$pid"
done
2018-07-16 09:42:54 -07:00
# Finish by running the sanity script against the deployed testnet, with
# USE_SNAP=1 set in its environment
printf -- '--- %s sanity test\n' "$publicUrl"
USE_SNAP=1 ci/testnet-sanity.sh "$publicUrl"

exit 0