Delete unreachable validators to cause a fresh one to be spawned
This commit is contained in:
parent
a7204d5353
commit
a9240a42bf
|
@ -73,6 +73,18 @@ leaderName=${publicUrl//./-}
|
||||||
|
|
||||||
vmlist=() # Each array element is formatted as "class:vmName:vmZone:vmPublicIp"
|
vmlist=() # Each array element is formatted as "class:vmName:vmZone:vmPublicIp"
|
||||||
|
|
||||||
|
vm_exec() {
|
||||||
|
declare vmName=$1
|
||||||
|
declare vmZone=$2
|
||||||
|
declare vmPublicIp=$3
|
||||||
|
declare message=$4
|
||||||
|
declare cmd=$5
|
||||||
|
|
||||||
|
echo "--- $message $vmName in zone $vmZone ($vmPublicIp)"
|
||||||
|
ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
|
||||||
|
testnet-deploy@"$vmPublicIp" "$cmd"
|
||||||
|
}
|
||||||
|
|
||||||
#
|
#
|
||||||
# vm_foreach [cmd] [extra args to cmd]
|
# vm_foreach [cmd] [extra args to cmd]
|
||||||
# where
|
# where
|
||||||
|
@ -152,6 +164,53 @@ findVms() {
|
||||||
--format 'value(name,zone,networkInterfaces[0].accessConfigs[0].natIP,status)')
|
--format 'value(name,zone,networkInterfaces[0].accessConfigs[0].natIP,status)')
|
||||||
}
|
}
|
||||||
|
|
||||||
|
wait_for_pids() {
|
||||||
|
echo "--- Waiting for $*"
|
||||||
|
for pid in "${pids[@]}"; do
|
||||||
|
declare ok=true
|
||||||
|
wait "$pid" || ok=false
|
||||||
|
cat "log-$pid.txt"
|
||||||
|
if ! $ok; then
|
||||||
|
echo ^^^ +++
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
rm "log-$pid.txt"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
delete_unreachable_validators() {
|
||||||
|
declare vmName=$1
|
||||||
|
declare vmZone=$2
|
||||||
|
declare vmPublicIp=$3
|
||||||
|
|
||||||
|
touch "log-$vmName.txt"
|
||||||
|
(
|
||||||
|
SECONDS=0
|
||||||
|
if ! vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Checking $vmName" uptime; then
|
||||||
|
echo "^^^ +++"
|
||||||
|
|
||||||
|
# Validators are managed by a Compute Engine Instance Group, so deleting
|
||||||
|
# one will just cause a new one to be spawned.
|
||||||
|
echo "Warning: $vmName is unreachable, deleting it"
|
||||||
|
gcloud compute instances delete "$vmName" --zone "$vmZone"
|
||||||
|
fi
|
||||||
|
echo "Checked in ${SECONDS} seconds"
|
||||||
|
) >> "log-$vmName.txt" 2>&1 &
|
||||||
|
declare pid=$!
|
||||||
|
|
||||||
|
# Rename log file so it can be discovered later by $pid
|
||||||
|
mv "log-$vmName.txt" "log-$pid.txt"
|
||||||
|
pids+=("$pid")
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
echo "Validator nodes (unverified):"
|
||||||
|
findVms validator "name~^$leaderName-validator-"
|
||||||
|
pids=()
|
||||||
|
vm_foreach_in_class validator delete_unreachable_validators
|
||||||
|
wait_for_pids validator sanity check
|
||||||
|
vmlist=()
|
||||||
|
|
||||||
echo "Leader node:"
|
echo "Leader node:"
|
||||||
findVms leader "name=$leaderName"
|
findVms leader "name=$leaderName"
|
||||||
[[ ${#vmlist[@]} = 1 ]] || {
|
[[ ${#vmlist[@]} = 1 ]] || {
|
||||||
|
@ -176,28 +235,13 @@ vm_foreach_in_class validator inc_fullnode_count
|
||||||
netName=${SOLANA_NET_URL/.*/}
|
netName=${SOLANA_NET_URL/.*/}
|
||||||
"$here"/metrics_write_datapoint.sh "testnet-deploy,name=$netName stop=1"
|
"$here"/metrics_write_datapoint.sh "testnet-deploy,name=$netName stop=1"
|
||||||
|
|
||||||
gcp_vm_exec() {
|
|
||||||
declare vmName=$1
|
|
||||||
declare vmZone=$2
|
|
||||||
declare vmPublicIp=$3
|
|
||||||
declare message=$4
|
|
||||||
declare cmd=$5
|
|
||||||
|
|
||||||
echo "--- $message $vmName in zone $vmZone ($vmPublicIp)"
|
|
||||||
(
|
|
||||||
set -x
|
|
||||||
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
|
|
||||||
testnet-deploy@"$vmPublicIp" "$cmd"
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
client_start() {
|
client_start() {
|
||||||
declare vmName=$1
|
declare vmName=$1
|
||||||
declare vmZone=$2
|
declare vmZone=$2
|
||||||
declare vmPublicIp=$3
|
declare vmPublicIp=$3
|
||||||
declare count=$4
|
declare count=$4
|
||||||
|
|
||||||
gcp_vm_exec "$vmName" "$vmZone" "$vmPublicIp" \
|
vm_exec "$vmName" "$vmZone" "$vmPublicIp" \
|
||||||
"Starting client $count:" \
|
"Starting client $count:" \
|
||||||
"\
|
"\
|
||||||
set -x;
|
set -x;
|
||||||
|
@ -227,9 +271,10 @@ client_stop() {
|
||||||
declare vmPublicIp=$3
|
declare vmPublicIp=$3
|
||||||
declare count=$4
|
declare count=$4
|
||||||
|
|
||||||
|
touch "log-$vmName.txt"
|
||||||
(
|
(
|
||||||
SECONDS=0
|
SECONDS=0
|
||||||
gcp_vm_exec "$vmName" "$vmZone" "$vmPublicIp" \
|
vm_exec "$vmName" "$vmZone" "$vmPublicIp" \
|
||||||
"Stopping client $vmName ($count):" \
|
"Stopping client $vmName ($count):" \
|
||||||
"\
|
"\
|
||||||
set -x;
|
set -x;
|
||||||
|
@ -243,13 +288,10 @@ client_stop() {
|
||||||
; \
|
; \
|
||||||
"
|
"
|
||||||
echo "Client stopped in ${SECONDS} seconds"
|
echo "Client stopped in ${SECONDS} seconds"
|
||||||
) > "log-$vmName.txt" 2>&1 &
|
) >> "log-$vmName.txt" 2>&1 &
|
||||||
declare pid=$!
|
declare pid=$!
|
||||||
|
|
||||||
# Rename log file so it can be discovered later by $pid
|
# Rename log file so it can be discovered later by $pid
|
||||||
while [[ ! -f "log-$vmName.txt" ]]; do
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
mv "log-$vmName.txt" "log-$pid.txt"
|
mv "log-$vmName.txt" "log-$pid.txt"
|
||||||
pids+=("$pid")
|
pids+=("$pid")
|
||||||
}
|
}
|
||||||
|
@ -261,6 +303,7 @@ fullnode_start() {
|
||||||
declare vmPublicIp=$4
|
declare vmPublicIp=$4
|
||||||
declare count=$5
|
declare count=$5
|
||||||
|
|
||||||
|
touch "log-$vmName.txt"
|
||||||
(
|
(
|
||||||
SECONDS=0
|
SECONDS=0
|
||||||
commonNodeConfig="\
|
commonNodeConfig="\
|
||||||
|
@ -277,7 +320,7 @@ fullnode_start() {
|
||||||
nodeConfig="mode=validator leader-address=$publicIp $commonNodeConfig"
|
nodeConfig="mode=validator leader-address=$publicIp $commonNodeConfig"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
gcp_vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Starting $class $count:" \
|
vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Starting $class $count:" \
|
||||||
"\
|
"\
|
||||||
set -ex; \
|
set -ex; \
|
||||||
logmarker='solana deploy $(date)/$RANDOM'; \
|
logmarker='solana deploy $(date)/$RANDOM'; \
|
||||||
|
@ -291,13 +334,10 @@ fullnode_start() {
|
||||||
sudo grep -Pzo \"\$logmarker(.|\\n)*\" /var/log/syslog \
|
sudo grep -Pzo \"\$logmarker(.|\\n)*\" /var/log/syslog \
|
||||||
"
|
"
|
||||||
echo "Succeeded in ${SECONDS} seconds"
|
echo "Succeeded in ${SECONDS} seconds"
|
||||||
) > "log-$vmName.txt" 2>&1 &
|
) >> "log-$vmName.txt" 2>&1 &
|
||||||
declare pid=$!
|
declare pid=$!
|
||||||
|
|
||||||
# Rename log file so it can be discovered later by $pid
|
# Rename log file so it can be discovered later by $pid
|
||||||
while [[ ! -f "log-$vmName.txt" ]]; do
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
mv "log-$vmName.txt" "log-$pid.txt"
|
mv "log-$vmName.txt" "log-$pid.txt"
|
||||||
|
|
||||||
pids+=("$pid")
|
pids+=("$pid")
|
||||||
|
@ -317,39 +357,23 @@ fullnode_stop() {
|
||||||
declare vmPublicIp=$3
|
declare vmPublicIp=$3
|
||||||
declare count=$4
|
declare count=$4
|
||||||
|
|
||||||
|
touch "log-$vmName.txt"
|
||||||
(
|
(
|
||||||
SECONDS=0
|
SECONDS=0
|
||||||
gcp_vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Shutting down" "\
|
vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Shutting down" "\
|
||||||
if snap list solana; then \
|
if snap list solana; then \
|
||||||
sudo snap set solana mode=; \
|
sudo snap set solana mode=; \
|
||||||
fi"
|
fi"
|
||||||
echo "Succeeded in ${SECONDS} seconds"
|
echo "Succeeded in ${SECONDS} seconds"
|
||||||
) > "log-$vmName.txt" 2>&1 &
|
) >> "log-$vmName.txt" 2>&1 &
|
||||||
declare pid=$!
|
declare pid=$!
|
||||||
|
|
||||||
# Rename log file so it can be discovered later by $pid
|
# Rename log file so it can be discovered later by $pid
|
||||||
while [[ ! -f "log-$vmName.txt" ]]; do
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
mv "log-$vmName.txt" "log-$pid.txt"
|
mv "log-$vmName.txt" "log-$pid.txt"
|
||||||
|
|
||||||
pids+=("$pid")
|
pids+=("$pid")
|
||||||
}
|
}
|
||||||
|
|
||||||
wait_for_pids() {
|
|
||||||
echo "--- Waiting for $*"
|
|
||||||
for pid in "${pids[@]}"; do
|
|
||||||
declare ok=true
|
|
||||||
wait "$pid" || ok=false
|
|
||||||
cat "log-$pid.txt"
|
|
||||||
if ! $ok; then
|
|
||||||
echo ^^^ +++
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
rm "log-$pid.txt"
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
if [[ -n $LOCAL_SNAP ]]; then
|
if [[ -n $LOCAL_SNAP ]]; then
|
||||||
echo "--- Transferring $LOCAL_SNAP to node(s)"
|
echo "--- Transferring $LOCAL_SNAP to node(s)"
|
||||||
|
|
||||||
|
@ -369,8 +393,8 @@ if [[ -n $LOCAL_SNAP ]]; then
|
||||||
vm_foreach transfer_local_snap
|
vm_foreach transfer_local_snap
|
||||||
fi
|
fi
|
||||||
|
|
||||||
pids=()
|
|
||||||
echo "--- Stopping client node(s)"
|
echo "--- Stopping client node(s)"
|
||||||
|
pids=()
|
||||||
vm_foreach_in_class client client_stop
|
vm_foreach_in_class client client_stop
|
||||||
client_stop_pids=("${pids[@]}")
|
client_stop_pids=("${pids[@]}")
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue