solana/ci/testnet-deploy.sh

475 lines
13 KiB
Bash
Raw Normal View History

#!/bin/bash -e
2018-07-12 19:47:07 -07:00
#
2018-07-16 12:05:48 -07:00
# Deploys the Solana software running on the testnet full nodes
2018-07-12 19:47:07 -07:00
#
# This script must be run by a user/machine that has successfully authenticated
# with GCP and has sufficient permission.
#
here=$(dirname "$0")
metrics_write_datapoint="$here"/../scripts/metrics-write-datapoint.sh
2018-07-16 12:05:48 -07:00
2018-07-18 20:52:14 -07:00
# TODO: Switch over to rolling updates
ROLLING_UPDATE=false
#ROLLING_UPDATE=true
2018-07-12 19:47:07 -07:00
if [[ -z $SOLANA_METRICS_CONFIG ]]; then
echo Error: SOLANA_METRICS_CONFIG environment variable is unset
exit 1
fi
2018-07-16 09:42:54 -07:00
# Default to edge channel. To select the beta channel:
# export SOLANA_SNAP_CHANNEL=beta
2018-07-12 19:47:07 -07:00
if [[ -z $SOLANA_SNAP_CHANNEL ]]; then
2018-07-16 09:42:54 -07:00
SOLANA_SNAP_CHANNEL=edge
2018-07-12 19:47:07 -07:00
fi
# Select default network URL based on SOLANA_SNAP_CHANNEL if SOLANA_NET_ENTRYPOINT is
2018-07-20 08:50:08 -07:00
# unspecified
if [[ -z $SOLANA_NET_ENTRYPOINT ]]; then
2018-07-20 08:50:08 -07:00
case $SOLANA_SNAP_CHANNEL in
edge)
SOLANA_NET_ENTRYPOINT=master.testnet.solana.com
unset SOLANA_NET_NAME
2018-07-20 08:50:08 -07:00
;;
beta)
SOLANA_NET_ENTRYPOINT=testnet.solana.com
unset SOLANA_NET_NAME
2018-07-20 08:50:08 -07:00
;;
*)
echo Error: Unknown SOLANA_SNAP_CHANNEL=$SOLANA_SNAP_CHANNEL
exit 1
;;
esac
fi
if [[ -z $SOLANA_NET_NAME ]]; then
SOLANA_NET_NAME=${SOLANA_NET_ENTRYPOINT//./-}
fi
: ${SOLANA_NET_NAME:?$SOLANA_NET_ENTRYPOINT}
2018-08-17 21:59:27 -07:00
netBasename=${SOLANA_NET_NAME/-testnet-solana-com/}
if [[ $netBasename != testnet ]]; then
netBasename="testnet-$netBasename"
fi
# Figure installation command
SNAP_INSTALL_CMD="\
for i in {1..3}; do \
sudo snap install solana --$SOLANA_SNAP_CHANNEL --devmode && break;
sleep 1; \
done \
"
LOCAL_SNAP=$1
if [[ -n $LOCAL_SNAP ]]; then
if [[ ! -f $LOCAL_SNAP ]]; then
echo "Error: $LOCAL_SNAP is not a file"
exit 1
fi
SNAP_INSTALL_CMD="sudo snap install ~/solana_local.snap --devmode --dangerous"
fi
SNAP_INSTALL_CMD="sudo snap remove solana; $SNAP_INSTALL_CMD"
2018-08-07 16:16:11 -07:00
EARLYOOM_INSTALL_CMD="\
2018-08-16 20:14:54 -07:00
wget --retry-connrefused --waitretry=1 \
--read-timeout=20 --timeout=15 --tries=5 \
-O install-earlyoom.sh \
https://raw.githubusercontent.com/solana-labs/solana/master/ci/install-earlyoom.sh; \
2018-08-07 16:16:11 -07:00
bash install-earlyoom.sh \
"
SNAP_INSTALL_CMD="$EARLYOOM_INSTALL_CMD; $SNAP_INSTALL_CMD"
# `export SKIP_INSTALL=1` to reset the network without reinstalling the snap
if [[ -n $SKIP_INSTALL ]]; then
SNAP_INSTALL_CMD="echo Install skipped"
fi
echo "+++ Configuration for $netBasename"
publicUrl="$SOLANA_NET_ENTRYPOINT"
2018-07-20 08:50:08 -07:00
if [[ $publicUrl = testnet.solana.com ]]; then
2018-07-18 08:21:48 -07:00
publicIp="" # Use default value
2018-07-20 08:50:08 -07:00
else
publicIp=$(dig +short $publicUrl | head -n1)
fi
echo "Network name: $SOLANA_NET_NAME"
echo "Network entry point URL: $publicUrl ($publicIp)"
2018-07-20 08:50:08 -07:00
echo "Snap channel: $SOLANA_SNAP_CHANNEL"
echo "Install command: $SNAP_INSTALL_CMD"
2018-08-08 08:32:23 -07:00
echo "Setup args: $SOLANA_SETUP_ARGS"
[[ -z $LOCAL_SNAP ]] || echo "Local snap: $LOCAL_SNAP"
2018-07-20 08:50:08 -07:00
vmlist=() # Each array element is formatted as "class:vmName:vmZone:vmPublicIp"
vm_exec() {
declare vmName=$1
declare vmZone=$2
declare vmPublicIp=$3
declare message=$4
declare cmd=$5
echo "--- $message $vmName in zone $vmZone ($vmPublicIp)"
ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
testnet-deploy@"$vmPublicIp" "$cmd"
}
#
2018-07-28 11:37:39 -07:00
# vm_foreach [cmd] [extra args to cmd]
# where
2018-07-28 11:37:39 -07:00
# cmd - the command to execute on each VM
# The command will receive three fixed arguments, followed by any
# additionl arguments supplied to vm_foreach:
# vmName - GCP name of the VM
# vmZone - The GCP zone the VM is located in
# vmPublicIp - The public IP address of this VM
2018-07-28 11:37:39 -07:00
# vmClass - The 'class' of this VM
# count - Monotonically increasing count for each
# invocation of cmd, starting at 1
2018-07-28 11:37:39 -07:00
# ... - Extra args to cmd..
#
#
2018-07-28 11:37:39 -07:00
vm_foreach() {
declare cmd=$1
shift
declare count=1
for info in "${vmlist[@]}"; do
declare vmClass vmName vmZone vmPublicIp
IFS=: read -r vmClass vmName vmZone vmPublicIp < <(echo "$info")
eval "$cmd" "$vmName" "$vmZone" "$vmPublicIp" "$vmClass" "$count" "$@"
2018-07-28 11:37:39 -07:00
count=$((count + 1))
done
}
2018-07-28 11:37:39 -07:00
#
# vm_foreach_in_class [class] [cmd]
# where
# class - the desired VM class to operate on
# cmd - the command to execute on each VM in the desired class.
# The command will receive three arguments:
# vmName - GCP name of the VM
# vmZone - The GCP zone the VM is located in
# vmPublicIp - The public IP address of this VM
2018-07-28 11:37:39 -07:00
# count - Monotonically increasing count for each
# invocation of cmd, starting at 1
#
#
_run_cmd_if_class() {
declare vmName=$1
declare vmZone=$2
declare vmPublicIp=$3
declare vmClass=$4
declare count=$5
declare class=$6
declare cmd=$7
2018-07-28 11:37:39 -07:00
if [[ $class = "$vmClass" ]]; then
eval "$cmd" "$vmName" "$vmZone" "$vmPublicIp" "$count"
2018-07-28 11:37:39 -07:00
fi
}
vm_foreach_in_class() {
declare class=$1
declare cmd=$2
vm_foreach _run_cmd_if_class "$1" "$2"
}
2018-07-20 08:50:08 -07:00
2018-07-28 11:37:39 -07:00
#
# Load all VMs matching the specified filter and tag them with the specified
# class into the `vmlist` array.
2018-07-20 08:50:08 -07:00
findVms() {
declare class="$1"
declare filter="$2"
2018-07-20 08:50:08 -07:00
gcloud compute instances list --filter="$filter"
while read -r vmName vmZone vmPublicIp status; do
2018-07-20 08:50:08 -07:00
if [[ $status != RUNNING ]]; then
echo "Warning: $vmName is not RUNNING, ignoring it."
continue
fi
vmlist+=("$class:$vmName:$vmZone:$vmPublicIp")
done < <(gcloud compute instances list \
--filter="$filter" \
--format 'value(name,zone,networkInterfaces[0].accessConfigs[0].natIP,status)')
2018-07-20 08:50:08 -07:00
}
2018-07-12 20:08:35 -07:00
wait_for_pids() {
echo "--- Waiting for $*"
for pid in "${pids[@]}"; do
declare ok=true
wait "$pid" || ok=false
cat "log-$pid.txt"
if ! $ok; then
echo ^^^ +++
exit 1
fi
rm "log-$pid.txt"
done
}
delete_unreachable_validators() {
declare vmName=$1
declare vmZone=$2
declare vmPublicIp=$3
touch "log-$vmName.txt"
(
SECONDS=0
if ! vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Checking $vmName" uptime; then
echo "^^^ +++"
# Validators are managed by a Compute Engine Instance Group, so deleting
# one will just cause a new one to be spawned.
echo "Warning: $vmName is unreachable, deleting it"
gcloud compute instances delete "$vmName" --zone "$vmZone"
fi
echo "validator checked in ${SECONDS} seconds"
) >> "log-$vmName.txt" 2>&1 &
declare pid=$!
# Rename log file so it can be discovered later by $pid
mv "log-$vmName.txt" "log-$pid.txt"
pids+=("$pid")
}
echo "Validator nodes (unverified):"
findVms validator "name~^$SOLANA_NET_NAME-validator-"
pids=()
vm_foreach_in_class validator delete_unreachable_validators
wait_for_pids validator sanity check
vmlist=()
2018-07-20 08:50:08 -07:00
echo "Leader node:"
findVms leader "name=$SOLANA_NET_NAME"
2018-07-20 08:50:08 -07:00
[[ ${#vmlist[@]} = 1 ]] || {
echo "Unable to find $SOLANA_NET_NAME"
2018-07-20 08:50:08 -07:00
exit 1
}
echo "Client node(s):"
findVms client "name~^$SOLANA_NET_NAME-client"
2018-07-20 08:50:08 -07:00
echo "Validator nodes:"
findVms validator "name~^$SOLANA_NET_NAME-validator-"
2018-07-17 19:39:43 -07:00
fullnode_count=0
inc_fullnode_count() {
fullnode_count=$((fullnode_count + 1))
}
vm_foreach_in_class leader inc_fullnode_count
vm_foreach_in_class validator inc_fullnode_count
# Add "network stopping" datapoint
$metrics_write_datapoint "testnet-deploy,name=$netBasename stop=1"
client_start() {
declare vmName=$1
declare vmZone=$2
declare vmPublicIp=$3
declare count=$4
vm_exec "$vmName" "$vmZone" "$vmPublicIp" \
"Starting client $count:" \
"\
set -x;
snap info solana; \
sudo snap get solana; \
2018-07-24 09:38:19 -07:00
threadCount=\$(nproc); \
2018-08-16 10:21:42 -07:00
if [[ \$threadCount -gt 4 ]]; then threadCount=4; fi; \
2018-07-25 16:38:07 -07:00
tmux kill-session -t solana; \
tmux new -s solana -d \" \
set -x; \
sudo rm /tmp/solana.log; \
while : ; do \
/snap/bin/solana.bench-tps $SOLANA_NET_ENTRYPOINT $fullnode_count --loop -s 600 --sustained -t \$threadCount 2>&1 | tee -a /tmp/solana.log; \
echo 'https://metrics.solana.com:8086/write?db=${INFLUX_DATABASE}&u=${INFLUX_USERNAME}&p=${INFLUX_PASSWORD}' \
| xargs curl --max-time 5 -XPOST --data-binary 'testnet-deploy,name=$netBasename clientexit=1'; \
echo Error: bench-tps should never exit | tee -a /tmp/solana.log; \
2018-08-16 09:25:28 -07:00
done; \
bash \
\"; \
sleep 2; \
tmux capture-pane -t solana -p -S -100; \
tail /tmp/solana.log; \
"
}
client_stop() {
declare vmName=$1
declare vmZone=$2
declare vmPublicIp=$3
declare count=$4
touch "log-$vmName.txt"
(
SECONDS=0
vm_exec "$vmName" "$vmZone" "$vmPublicIp" \
"Stopping client $vmName ($count):" \
"\
set -x;
tmux list-sessions; \
tmux capture-pane -t solana -p; \
tmux kill-session -t solana; \
$SNAP_INSTALL_CMD; \
sudo snap set solana metrics-config=$SOLANA_METRICS_CONFIG \
rust-log=$RUST_LOG \
default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \
; \
"
echo "Client stopped in ${SECONDS} seconds"
) >> "log-$vmName.txt" 2>&1 &
declare pid=$!
# Rename log file so it can be discovered later by $pid
mv "log-$vmName.txt" "log-$pid.txt"
pids+=("$pid")
}
fullnode_start() {
declare class=$1
declare vmName=$2
declare vmZone=$3
declare vmPublicIp=$4
declare count=$5
2018-07-12 19:47:07 -07:00
touch "log-$vmName.txt"
2018-07-12 19:47:07 -07:00
(
SECONDS=0
commonNodeConfig="\
rust-log=$RUST_LOG \
default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \
metrics-config=$SOLANA_METRICS_CONFIG \
2018-08-08 08:32:23 -07:00
setup-args=$SOLANA_SETUP_ARGS \
"
if [[ $class = leader ]]; then
nodeConfig="mode=leader+drone $commonNodeConfig"
2018-07-16 16:19:49 -07:00
if [[ -n $SOLANA_CUDA ]]; then
nodeConfig="$nodeConfig enable-cuda=1"
fi
else
nodeConfig="mode=validator leader-address=$publicIp $commonNodeConfig"
fi
vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Starting $class $count:" \
"\
set -ex; \
logmarker='solana deploy $(date)/$RANDOM'; \
logger \"\$logmarker\"; \
$SNAP_INSTALL_CMD; \
sudo snap set solana $nodeConfig; \
snap info solana; \
sudo snap get solana; \
echo Slight delay to get more syslog output; \
sleep 2; \
sudo grep -Pzo \"\$logmarker(.|\\n)*\" /var/log/syslog \
"
echo "Succeeded in ${SECONDS} seconds"
) >> "log-$vmName.txt" 2>&1 &
declare pid=$!
2018-07-17 19:39:43 -07:00
# Rename log file so it can be discovered later by $pid
mv "log-$vmName.txt" "log-$pid.txt"
pids+=("$pid")
}
leader_start() {
fullnode_start leader "$@"
}
validator_start() {
fullnode_start validator "$@"
}
fullnode_stop() {
declare vmName=$1
declare vmZone=$2
declare vmPublicIp=$3
declare count=$4
touch "log-$vmName.txt"
(
SECONDS=0
# Try to ping the machine first. When a machine (validator) is restarted,
# there can be a delay between when the instance is reported as RUNNING and when
# it's reachable over the network
timeout 30s bash -c "set -o pipefail; until ping -c 3 $vmPublicIp | tr - _; do echo .; done"
vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Shutting down" "\
if snap list solana; then \
sudo snap set solana mode=; \
fi"
echo "Succeeded in ${SECONDS} seconds"
) >> "log-$vmName.txt" 2>&1 &
declare pid=$!
# Rename log file so it can be discovered later by $pid
mv "log-$vmName.txt" "log-$pid.txt"
pids+=("$pid")
}
if [[ -n $LOCAL_SNAP ]]; then
echo "--- Transferring $LOCAL_SNAP to node(s)"
transfer_local_snap() {
declare vmName=$1
declare vmZone=$2
declare vmPublicIp=$3
declare vmClass=$4
declare count=$5
echo "--- $vmName in zone $vmZone ($count)"
SECONDS=0
scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
"$LOCAL_SNAP" testnet-deploy@"$vmPublicIp":solana_local.snap
echo "Succeeded in ${SECONDS} seconds"
}
vm_foreach transfer_local_snap
fi
2018-07-12 19:47:07 -07:00
echo "--- Stopping client node(s)"
pids=()
vm_foreach_in_class client client_stop
client_stop_pids=("${pids[@]}")
if ! $ROLLING_UPDATE; then
pids=()
echo "--- Shutting down all full nodes"
vm_foreach_in_class leader fullnode_stop
vm_foreach_in_class validator fullnode_stop
wait_for_pids fullnode shutdown
fi
pids=()
echo --- Starting leader node
vm_foreach_in_class leader leader_start
wait_for_pids leader
pids=()
echo --- Starting validator nodes
vm_foreach_in_class validator validator_start
wait_for_pids validators
2018-07-12 20:08:35 -07:00
2018-07-16 09:42:54 -07:00
echo "--- $publicUrl sanity test"
2018-07-28 12:32:29 -07:00
if [[ -z $CI ]]; then
# TODO: ssh into a node and run testnet-sanity.sh there. It's not safe to
# assume the correct Snap is installed on the current non-CI machine
echo Skipped for non-CI deploy
snapVersion=unknown
2018-07-28 12:32:29 -07:00
else
(
set -x
USE_SNAP=1 ci/testnet-sanity.sh $publicUrl $fullnode_count
)
IFS=\ read -r _ snapVersion _ < <(snap info solana | grep "^installed:")
2018-08-08 22:45:39 -07:00
snapVersion=${snapVersion/0+git./}
2018-07-28 12:32:29 -07:00
fi
pids=("${client_stop_pids[@]}")
wait_for_pids client shutdown
vm_foreach_in_class client client_start
# Add "network started" datapoint
$metrics_write_datapoint "testnet-deploy,name=$netBasename start=1,version=\"$snapVersion\""
2018-07-12 19:47:07 -07:00
exit 0