Delete unreachable validators to cause a fresh one to be spawned

This commit is contained in:
Michael Vines 2018-08-02 20:39:19 -07:00
parent a7204d5353
commit a9240a42bf
1 changed files with 70 additions and 46 deletions

View File

@ -73,6 +73,18 @@ leaderName=${publicUrl//./-}
vmlist=() # Each array element is formatted as "class:vmName:vmZone:vmPublicIp"
# Run a command on a VM over ssh as the testnet-deploy user.
#
# Arguments:
#   $1 - VM name (only shown in the banner line)
#   $2 - VM zone (only shown in the banner line)
#   $3 - VM public IP address; this is the ssh destination
#   $4 - text placed at the start of the banner line
#   $5 - command string handed to ssh
# Outputs: a "--- ..." banner line followed by whatever ssh prints
# Returns: the exit status of ssh
vm_exec() {
  local name=$1
  local zone=$2
  local ip=$3
  local banner=$4
  local remoteCmd=$5

  # BatchMode disables interactive prompts and ConnectTimeout bounds the
  # connection attempt, so an unresponsive VM fails instead of hanging.
  local -a sshOptions=(
    -o BatchMode=yes
    -o ConnectTimeout=5
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
  )

  echo "--- $banner $name in zone $zone ($ip)"
  ssh "${sshOptions[@]}" testnet-deploy@"$ip" "$remoteCmd"
}
#
# vm_foreach [cmd] [extra args to cmd]
# where
@ -152,6 +164,53 @@ findVms() {
--format 'value(name,zone,networkInterfaces[0].accessConfigs[0].natIP,status)')
}
# Wait for every background job listed in the global `pids` array and print
# the log each one captured.
#
# Globals:   pids (read) - pids of background jobs; each job's output is
#            expected in "log-<pid>.txt" in the current directory
# Arguments: free-form description, used only in the banner line
# Outputs:   the banner, then each job's log in `pids` order
# Exits:     terminates the shell with status 1 at the first job that failed,
#            after printing its log; that log file is left on disk
wait_for_pids() {
  # Keep the loop variable local so the caller's own `pid` is not clobbered.
  local pid
  echo "--- Waiting for $*"
  for pid in "${pids[@]}"; do
    if wait "$pid"; then
      cat "log-$pid.txt"
      rm "log-$pid.txt"
    else
      cat "log-$pid.txt"
      echo "^^^ +++"
      exit 1
    fi
  done
}
# Probe one validator VM in the background and delete it if the probe fails.
#
# Globals:   pids (appended) - receives the pid of the background probe
# Arguments: $1 - VM name
#            $2 - VM zone
#            $3 - VM public IP address
# Outputs:   nothing directly; the probe's stdout/stderr are captured in
#            "log-<pid>.txt" in the current directory
delete_unreachable_validators() {
  declare vmName=$1
  declare vmZone=$2
  declare vmPublicIp=$3

  # Open the log in *this* shell (via the brace group) so the background
  # subshell inherits an already-open descriptor. A redirection attached to
  # the backgrounded subshell itself is performed by the child after the fork,
  # which lets the rename below win the race: the child would then create a
  # brand new "log-$vmName.txt" and the renamed log would stay empty.
  {
    (
      SECONDS=0
      if ! vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Checking $vmName" uptime; then
        echo "^^^ +++"
        # Validators are managed by a Compute Engine Instance Group, so deleting
        # one will just cause a new one to be spawned.
        echo "Warning: $vmName is unreachable, deleting it"
        gcloud compute instances delete "$vmName" --zone "$vmZone"
      fi
      echo "Checked in ${SECONDS} seconds"
    ) &
  } >> "log-$vmName.txt" 2>&1
  declare pid=$!

  # Rename log file so it can be discovered later by $pid. The job keeps
  # writing through its open descriptor, which is unaffected by the rename.
  mv "log-$vmName.txt" "log-$pid.txt"
  pids+=("$pid")
}
# Sanity-check the validators first: list them, probe each one in the
# background, then wait for every probe to finish.
echo "Validator nodes (unverified):"
findVms validator "name~^$leaderName-validator-"
# Start from an empty pid list; delete_unreachable_validators appends the pid
# of each background probe and wait_for_pids reads the list.
pids=()
vm_foreach_in_class validator delete_unreachable_validators
wait_for_pids validator sanity check
# NOTE(review): presumably findVms fills vmlist, so it is emptied here to drop
# the unverified validator entries before the leader lookup - confirm.
vmlist=()
echo "Leader node:"
findVms leader "name=$leaderName"
[[ ${#vmlist[@]} = 1 ]] || {
@ -176,28 +235,13 @@ vm_foreach_in_class validator inc_fullnode_count
# Drop everything from the first "." onward, leaving the leading label of
# SOLANA_NET_URL as the network name.
netName=${SOLANA_NET_URL/.*/}
# Write a "stop=1" datapoint tagged with that network name.
"$here"/metrics_write_datapoint.sh "testnet-deploy,name=$netName stop=1"
# Run a command on a VM over ssh as the testnet-deploy user, tracing the ssh
# invocation itself.
#
# Arguments:
#   $1 - VM name (only shown in the banner line)
#   $2 - VM zone (only shown in the banner line)
#   $3 - VM public IP address; this is the ssh destination
#   $4 - text placed at the start of the banner line
#   $5 - command string handed to ssh
# Outputs: a "--- ..." banner line followed by whatever ssh prints
# Returns: the exit status of ssh
gcp_vm_exec() {
  local name=$1
  local zone=$2
  local ip=$3
  local banner=$4
  local remoteCmd=$5

  local -a sshArgs=(
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    testnet-deploy@"$ip"
    "$remoteCmd"
  )

  echo "--- $banner $name in zone $zone ($ip)"
  # The subshell confines xtrace to the ssh call so it does not stay enabled
  # in the caller.
  (
    set -x
    ssh "${sshArgs[@]}"
  )
}
client_start() {
declare vmName=$1
declare vmZone=$2
declare vmPublicIp=$3
declare count=$4
gcp_vm_exec "$vmName" "$vmZone" "$vmPublicIp" \
vm_exec "$vmName" "$vmZone" "$vmPublicIp" \
"Starting client $count:" \
"\
set -x;
@ -227,9 +271,10 @@ client_stop() {
declare vmPublicIp=$3
declare count=$4
touch "log-$vmName.txt"
(
SECONDS=0
gcp_vm_exec "$vmName" "$vmZone" "$vmPublicIp" \
vm_exec "$vmName" "$vmZone" "$vmPublicIp" \
"Stopping client $vmName ($count):" \
"\
set -x;
@ -243,13 +288,10 @@ client_stop() {
; \
"
echo "Client stopped in ${SECONDS} seconds"
) > "log-$vmName.txt" 2>&1 &
) >> "log-$vmName.txt" 2>&1 &
declare pid=$!
# Rename log file so it can be discovered later by $pid
while [[ ! -f "log-$vmName.txt" ]]; do
sleep 1
done
mv "log-$vmName.txt" "log-$pid.txt"
pids+=("$pid")
}
@ -261,6 +303,7 @@ fullnode_start() {
declare vmPublicIp=$4
declare count=$5
touch "log-$vmName.txt"
(
SECONDS=0
commonNodeConfig="\
@ -277,7 +320,7 @@ fullnode_start() {
nodeConfig="mode=validator leader-address=$publicIp $commonNodeConfig"
fi
gcp_vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Starting $class $count:" \
vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Starting $class $count:" \
"\
set -ex; \
logmarker='solana deploy $(date)/$RANDOM'; \
@ -291,13 +334,10 @@ fullnode_start() {
sudo grep -Pzo \"\$logmarker(.|\\n)*\" /var/log/syslog \
"
echo "Succeeded in ${SECONDS} seconds"
) > "log-$vmName.txt" 2>&1 &
) >> "log-$vmName.txt" 2>&1 &
declare pid=$!
# Rename log file so it can be discovered later by $pid
while [[ ! -f "log-$vmName.txt" ]]; do
sleep 1
done
mv "log-$vmName.txt" "log-$pid.txt"
pids+=("$pid")
@ -317,39 +357,23 @@ fullnode_stop() {
declare vmPublicIp=$3
declare count=$4
touch "log-$vmName.txt"
(
SECONDS=0
gcp_vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Shutting down" "\
vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Shutting down" "\
if snap list solana; then \
sudo snap set solana mode=; \
fi"
echo "Succeeded in ${SECONDS} seconds"
) > "log-$vmName.txt" 2>&1 &
) >> "log-$vmName.txt" 2>&1 &
declare pid=$!
# Rename log file so it can be discovered later by $pid
while [[ ! -f "log-$vmName.txt" ]]; do
sleep 1
done
mv "log-$vmName.txt" "log-$pid.txt"
pids+=("$pid")
}
# Wait for every background job listed in the global `pids` array and print
# the log each one captured.
#
# Globals:   pids (read) - pids of background jobs; each job's output is
#            expected in "log-<pid>.txt" in the current directory
# Arguments: free-form description, used only in the banner line
# Outputs:   the banner, then each job's log in `pids` order
# Exits:     terminates the shell with status 1 at the first job that failed,
#            after printing its log; that log file is left on disk
wait_for_pids() {
  # Keep the loop variable local so the caller's own `pid` is not clobbered.
  local pid
  echo "--- Waiting for $*"
  for pid in "${pids[@]}"; do
    if wait "$pid"; then
      cat "log-$pid.txt"
      rm "log-$pid.txt"
    else
      cat "log-$pid.txt"
      echo "^^^ +++"
      exit 1
    fi
  done
}
if [[ -n $LOCAL_SNAP ]]; then
echo "--- Transferring $LOCAL_SNAP to node(s)"
@ -369,8 +393,8 @@ if [[ -n $LOCAL_SNAP ]]; then
vm_foreach transfer_local_snap
fi
pids=()
echo "--- Stopping client node(s)"
pids=()
vm_foreach_in_class client client_stop
client_stop_pids=("${pids[@]}")