Add provision in testnet scripts to ignore validator nodes that failed to boot up (#3972)

* Skip writing to config file if the node didn't boot up
* Detect dead nodes more quickly
This commit is contained in:
Pankaj Garg 2019-04-24 16:23:26 -07:00 committed by GitHub
parent 9a40ad76bd
commit 29a25990d3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 34 additions and 21 deletions

View File

@ -55,6 +55,7 @@ blockstreamer=false
fullNodeBootDiskSizeInGb=1000
clientBootDiskSizeInGb=75
externalNodes=false
failOnValidatorBootupFailure=true
publicNetwork=false
enableGpu=false
@ -95,6 +96,7 @@ Manage testnet instances
zone
-x - append to the existing configuration instead of creating a
new configuration
-f - Discard validator nodes that didn't bootup successfully
create-specific options:
-n [number] - Number of additional fullnodes (default: $additionalFullNodeCount)
@ -133,7 +135,7 @@ shift
[[ $command = create || $command = config || $command = info || $command = delete ]] ||
usage "Invalid command: $command"
while getopts "h?p:Pn:c:z:gG:a:d:bux" opt; do
while getopts "h?p:Pn:c:z:gG:a:d:buxf" opt; do
case $opt in
h | \?)
usage
@ -179,6 +181,9 @@ while getopts "h?p:Pn:c:z:gG:a:d:bux" opt; do
x)
externalNodes=true
;;
f)
failOnValidatorBootupFailure=false
;;
*)
usage "unhandled option: $opt"
;;
@ -263,23 +268,27 @@ EOF
declare publicIp="$2"
declare privateIp="$3"
declare arrayName="$5"
declare failOnFailure="$5"
declare arrayName="$6"
echo "$arrayName+=($publicIp) # $name" >> "$configFile"
echo "${arrayName}Private+=($privateIp) # $name" >> "$configFile"
}
waitForStartupComplete() {
declare name="$1"
declare publicIp="$2"
# This check should eventually be moved to cloud provider specific script
if [ "$publicIp" = "TERMINATED" ] || [ "$privateIp" = "TERMINATED" ]; then
if $failOnFailure; then
exit 1
else
return 0
fi
fi
ok=true
echo "Waiting for $name to finish booting..."
(
set -x +e
for i in $(seq 1 60); do
timeout 20s ssh "${sshOptions[@]}" "$publicIp" "ls -l /.instance-startup-complete"
for i in $(seq 1 20); do
timeout --preserve-status --foreground 20s ssh "${sshOptions[@]}" "$publicIp" "ls -l /.instance-startup-complete"
ret=$?
if [[ $ret -eq 0 ]]; then
echo "$name has booted."
exit 0
fi
sleep 2
@ -287,8 +296,16 @@ EOF
done
echo "$name failed to boot."
exit 1
)
echo "$name has booted."
) || ok=false
if ! $ok; then
if $failOnFailure; then
exit 1
fi
else
echo "$arrayName+=($publicIp) # $name" >> "$configFile"
echo "${arrayName}Private+=($privateIp) # $name" >> "$configFile"
fi
}
if $externalNodes; then
@ -333,8 +350,7 @@ EOF
echo "fullnodeIpList=()" >> "$configFile"
echo "fullnodeIpListPrivate=()" >> "$configFile"
cloud_ForEachInstance recordInstanceIp fullnodeIpList
cloud_ForEachInstance waitForStartupComplete
cloud_ForEachInstance recordInstanceIp true fullnodeIpList
fi
if [[ $additionalFullNodeCount -gt 0 ]]; then
@ -345,8 +361,7 @@ EOF
echo "Unable to find additional fullnodes"
exit 1
}
cloud_ForEachInstance recordInstanceIp fullnodeIpList
cloud_ForEachInstance waitForStartupComplete
cloud_ForEachInstance recordInstanceIp "$failOnValidatorBootupFailure" fullnodeIpList
done
fi
@ -359,8 +374,7 @@ EOF
echo "Looking for client bencher instances..."
cloud_FindInstances "$prefix-client"
[[ ${#instances[@]} -eq 0 ]] || {
cloud_ForEachInstance recordInstanceIp clientIpList
cloud_ForEachInstance waitForStartupComplete
cloud_ForEachInstance recordInstanceIp true clientIpList
}
if $externalNodes; then
@ -372,8 +386,7 @@ EOF
echo "Looking for blockstreamer instances..."
cloud_FindInstances "$prefix-blockstreamer"
[[ ${#instances[@]} -eq 0 ]] || {
cloud_ForEachInstance recordInstanceIp blockstreamerIpList
cloud_ForEachInstance waitForStartupComplete
cloud_ForEachInstance recordInstanceIp true blockstreamerIpList
}
echo "Wrote $configFile"