Add support for preemptible GCP instances

This commit is contained in:
Michael Vines 2019-10-14 23:11:29 -07:00
parent 60141e0c2c
commit 9267931ef6
6 changed files with 86 additions and 5 deletions

View File

@ -78,6 +78,7 @@ replicatorBootDiskSizeInGb=500
fullNodeAdditionalDiskSizeInGb=
externalNodes=false
failOnValidatorBootupFailure=true
preemptible=true
publicNetwork=false
letsEncryptDomainName=
@ -146,6 +147,11 @@ Manage testnet instances
- Add an additional [number] GB SSD to all fullnodes to store the config directory.
If not set, config will be written to the boot disk by default.
Only supported on GCE.
--dedicated - Use dedicated instances for additional full nodes
(by default preemptible instances are used to reduce
cost). Note that the bootstrap leader, replicator,
blockstreamer and client nodes are always dedicated.
config-specific options:
-P - Use public network IP addresses (default: $publicNetwork)
@ -180,6 +186,9 @@ while [[ -n $1 ]]; do
elif [[ $1 == --allow-boot-failures ]]; then
failOnValidatorBootupFailure=false
shift
elif [[ $1 == --dedicated ]]; then
preemptible=false
shift
else
usage "Unknown long option: $1"
fi
@ -378,6 +387,8 @@ EOF
buildSshOptions
cloud_RestartPreemptedInstances "$prefix"
fetchPrivateKey() {
declare nodeName
declare nodeIp
@ -725,7 +736,7 @@ EOF
cloud_CreateInstances "$prefix" "$prefix-bootstrap-leader" 1 \
"$enableGpu" "$bootstrapLeaderMachineType" "${zones[0]}" "$fullNodeBootDiskSizeInGb" \
"$startupScript" "$bootstrapLeaderAddress" "$bootDiskType" "$fullNodeAdditionalDiskSizeInGb" \
"$sshPrivateKey"
"never preemptible" "$sshPrivateKey"
fi
if [[ $additionalFullNodeCount -gt 0 ]]; then
@ -746,7 +757,7 @@ EOF
cloud_CreateInstances "$prefix" "$prefix-$zone-fullnode" "$numNodesPerZone" \
"$enableGpu" "$fullNodeMachineType" "$zone" "$fullNodeBootDiskSizeInGb" \
"$startupScript" "" "$bootDiskType" "$fullNodeAdditionalDiskSizeInGb" \
"$sshPrivateKey" &
"$preemptible" "$sshPrivateKey" &
done
wait
@ -755,7 +766,7 @@ EOF
if [[ $clientNodeCount -gt 0 ]]; then
cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
"$enableGpu" "$clientMachineType" "${zones[0]}" "$clientBootDiskSizeInGb" \
"$startupScript" "" "$bootDiskType" "" "$sshPrivateKey"
"$startupScript" "" "$bootDiskType" "" "never preemptible" "$sshPrivateKey"
fi
if $blockstreamer; then
@ -767,7 +778,7 @@ EOF
if [[ $replicatorNodeCount -gt 0 ]]; then
cloud_CreateInstances "$prefix" "$prefix-replicator" "$replicatorNodeCount" \
false "$replicatorMachineType" "${zones[0]}" "$replicatorBootDiskSizeInGb" \
"$startupScript" "" "" "" "$sshPrivateKey"
"$startupScript" "" "" "" "never preemptible" "$sshPrivateKey"
fi
$metricsWriteDatapoint "testnet-deploy net-create-complete=1"

View File

@ -840,6 +840,32 @@ stop() {
echo "Stopping nodes took $SECONDS seconds"
}
checkPremptibleInstances() {
  # The fullnodeIpList nodes may be preemptible instances that can disappear at
  # any time.  Try to detect when a fullnode has been preempted to help the user
  # out.
  #
  # Each address in the global fullnodeIpList array is pinged once.  On the
  # first address that does not answer, a warning is printed and the script
  # exits with status 1.
  #
  # Of course this isn't airtight as an instance could always disappear
  # immediately after it's successfully pinged.
  declare ipAddress
  declare -a pingArgs

  # The "one reply, ~4 second limit" flags differ between ping implementations:
  # BSD/macOS ping uses `-o` (exit after one reply) and `-t` (timeout), while on
  # Linux `-o` is not accepted and `-t` sets the TTL instead, so `-c`/`-w` are
  # used there.
  case "$(uname -s)" in
  Darwin | FreeBSD)
    pingArgs=(-o -t 4)
    ;;
  *)
    pingArgs=(-c 1 -w 4)
    ;;
  esac

  for ipAddress in "${fullnodeIpList[@]}"; do
    (
      # Trace the exact ping command so the user can see what was probed
      set -x
      ping "${pingArgs[@]}" "$ipAddress"
    ) || {
      cat <<EOF
Warning: $ipAddress may have been preempted.
Run |./gce.sh config| to restart it
EOF
      exit 1
    }
  done
}
checkPremptibleInstances
case $command in
restart)
prepare_deploy

View File

@ -8,6 +8,10 @@ cloud_DefaultZone() {
echo "westus"
}
cloud_RestartPreemptedInstances() {
  # Restarting preempted instances is not implemented here; this is a
  # deliberate no-op that always succeeds and ignores its arguments.
  true
}
#
# __cloud_GetConfigValueFromInstanceName
# Return a piece of configuration information about an instance

View File

@ -16,6 +16,10 @@ cloud_DefaultZone() {
echo "Denver"
}
cloud_RestartPreemptedInstances() {
  # Restarting preempted instances is not implemented here; this is a
  # deliberate no-op that always succeeds and ignores its arguments.
  true
}
#
# __cloud_FindInstances
#
@ -134,6 +138,7 @@ cloud_Initialize() {
# has been provisioned in the GCE region that is hosting `$zone`
# bootDiskType - Optionally specify SSD or HDD boot disk
# additionalDiskSize - Optionally specify size of additional storage volume
# preemptible - Optionally request a preemptible instance ("true")
#
# Tip: use cloud_FindInstances to locate the instances once this function
# returns
@ -149,7 +154,8 @@ cloud_CreateInstances() {
#declare optionalAddress="$9" # unused
#declare optionalBootDiskType="${10}" # unused
#declare optionalAdditionalDiskSize="${11}" # unused
declare sshPrivateKey="${12}"
#declare optionalPreemptible="${12}" # unused
declare sshPrivateKey="${13}"
declare -a nodes
if [[ $numNodes = 1 ]]; then

View File

@ -7,6 +7,10 @@ cloud_DefaultZone() {
echo "us-east-1b"
}
cloud_RestartPreemptedInstances() {
  # Restarting preempted instances is not implemented here; this is a
  # deliberate no-op that always succeeds and ignores its arguments.
  true
}
# AWS region is zone with the last character removed
__cloud_GetRegion() {
declare zone="$1"

View File

@ -8,6 +8,29 @@ cloud_DefaultZone() {
echo "us-west1-b"
}
#
# cloud_RestartPreemptedInstances [namePrefix]
#
# Restart any preempted instances matching the specified prefix
#
# namePrefix - The instance name prefix of the preempted instances
#
cloud_RestartPreemptedInstances() {
  # $1 is handed to `gcloud compute instances list --filter` unchanged.
  declare instanceFilter="$1"
  declare -a listArgs=(
    compute instances list
    --filter "$instanceFilter"
    --format 'value(name,status,zone)'
  )
  declare instanceName instanceStatus instanceZone

  # Each listing row is "name status zone"; only rows containing TERMINATED
  # reach the loop, and every such instance is started in its own zone.
  while read -r instanceName instanceStatus instanceZone; do
    echo "Starting $instanceStatus instance: $instanceName"
    (
      # Trace the start command so the user sees what is being restarted
      set -x
      gcloud compute instances start --zone "$instanceZone" "$instanceName"
    )
  done < <(gcloud "${listArgs[@]}" | grep TERMINATED)
}
#
# __cloud_FindInstances
#
@ -125,6 +148,7 @@ cloud_Initialize() {
# has been provisioned in the GCE region that is hosting `$zone`
# bootDiskType - Optionally specify SSD or HDD boot disk
# additionalDiskSize - Optionally specify size of additional storage volume
# preemptible - Optionally request a preemptible instance ("true")
#
# Tip: use cloud_FindInstances to locate the instances once this function
# returns
@ -140,6 +164,8 @@ cloud_CreateInstances() {
declare optionalAddress="$9"
declare optionalBootDiskType="${10}"
declare optionalAdditionalDiskSize="${11}"
declare optionalPreemptible="${12}"
#declare sshPrivateKey="${13}" # unused
if $enableGpu; then
# Custom Ubuntu 18.04 LTS image with CUDA 9.2 and CUDA 10.0 installed
@ -176,6 +202,10 @@ cloud_CreateInstances() {
# shellcheck disable=SC2206 # Do not want to quote $imageName as it may contain extra args
args+=(--image $imageName)
if [[ $optionalPreemptible = true ]]; then
args+=(--preemptible)
fi
# shellcheck disable=SC2206 # Do not want to quote $machineType as it may contain extra args
for word in $machineType; do
# Special handling for the "--min-cpu-platform" argument which may contain a