Add support for preemptible GCP instances

This commit is contained in:
Michael Vines 2019-10-14 23:11:29 -07:00
parent 60141e0c2c
commit 9267931ef6
6 changed files with 86 additions and 5 deletions

View File

@ -78,6 +78,7 @@ replicatorBootDiskSizeInGb=500
fullNodeAdditionalDiskSizeInGb= fullNodeAdditionalDiskSizeInGb=
externalNodes=false externalNodes=false
failOnValidatorBootupFailure=true failOnValidatorBootupFailure=true
preemptible=true
publicNetwork=false publicNetwork=false
letsEncryptDomainName= letsEncryptDomainName=
@ -146,6 +147,11 @@ Manage testnet instances
- Add an additional [number] GB SSD to all fullnodes to store the config directory. - Add an additional [number] GB SSD to all fullnodes to store the config directory.
If not set, config will be written to the boot disk by default. If not set, config will be written to the boot disk by default.
Only supported on GCE. Only supported on GCE.
--dedicated - Use dedicated instances for additional full nodes
(by default preemptible instances are used to reduce
cost). Note that the bootstrap leader, replicator,
blockstreamer and client nodes are always dedicated.
config-specific options: config-specific options:
-P - Use public network IP addresses (default: $publicNetwork) -P - Use public network IP addresses (default: $publicNetwork)
@ -180,6 +186,9 @@ while [[ -n $1 ]]; do
elif [[ $1 == --allow-boot-failures ]]; then elif [[ $1 == --allow-boot-failures ]]; then
failOnValidatorBootupFailure=false failOnValidatorBootupFailure=false
shift shift
elif [[ $1 == --dedicated ]]; then
preemptible=false
shift
else else
usage "Unknown long option: $1" usage "Unknown long option: $1"
fi fi
@ -378,6 +387,8 @@ EOF
buildSshOptions buildSshOptions
cloud_RestartPreemptedInstances "$prefix"
fetchPrivateKey() { fetchPrivateKey() {
declare nodeName declare nodeName
declare nodeIp declare nodeIp
@ -725,7 +736,7 @@ EOF
cloud_CreateInstances "$prefix" "$prefix-bootstrap-leader" 1 \ cloud_CreateInstances "$prefix" "$prefix-bootstrap-leader" 1 \
"$enableGpu" "$bootstrapLeaderMachineType" "${zones[0]}" "$fullNodeBootDiskSizeInGb" \ "$enableGpu" "$bootstrapLeaderMachineType" "${zones[0]}" "$fullNodeBootDiskSizeInGb" \
"$startupScript" "$bootstrapLeaderAddress" "$bootDiskType" "$fullNodeAdditionalDiskSizeInGb" \ "$startupScript" "$bootstrapLeaderAddress" "$bootDiskType" "$fullNodeAdditionalDiskSizeInGb" \
"$sshPrivateKey" "never preemptible" "$sshPrivateKey"
fi fi
if [[ $additionalFullNodeCount -gt 0 ]]; then if [[ $additionalFullNodeCount -gt 0 ]]; then
@ -746,7 +757,7 @@ EOF
cloud_CreateInstances "$prefix" "$prefix-$zone-fullnode" "$numNodesPerZone" \ cloud_CreateInstances "$prefix" "$prefix-$zone-fullnode" "$numNodesPerZone" \
"$enableGpu" "$fullNodeMachineType" "$zone" "$fullNodeBootDiskSizeInGb" \ "$enableGpu" "$fullNodeMachineType" "$zone" "$fullNodeBootDiskSizeInGb" \
"$startupScript" "" "$bootDiskType" "$fullNodeAdditionalDiskSizeInGb" \ "$startupScript" "" "$bootDiskType" "$fullNodeAdditionalDiskSizeInGb" \
"$sshPrivateKey" & "$preemptible" "$sshPrivateKey" &
done done
wait wait
@ -755,7 +766,7 @@ EOF
if [[ $clientNodeCount -gt 0 ]]; then if [[ $clientNodeCount -gt 0 ]]; then
cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \ cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
"$enableGpu" "$clientMachineType" "${zones[0]}" "$clientBootDiskSizeInGb" \ "$enableGpu" "$clientMachineType" "${zones[0]}" "$clientBootDiskSizeInGb" \
"$startupScript" "" "$bootDiskType" "" "$sshPrivateKey" "$startupScript" "" "$bootDiskType" "" "never preemptible" "$sshPrivateKey"
fi fi
if $blockstreamer; then if $blockstreamer; then
@ -767,7 +778,7 @@ EOF
if [[ $replicatorNodeCount -gt 0 ]]; then if [[ $replicatorNodeCount -gt 0 ]]; then
cloud_CreateInstances "$prefix" "$prefix-replicator" "$replicatorNodeCount" \ cloud_CreateInstances "$prefix" "$prefix-replicator" "$replicatorNodeCount" \
false "$replicatorMachineType" "${zones[0]}" "$replicatorBootDiskSizeInGb" \ false "$replicatorMachineType" "${zones[0]}" "$replicatorBootDiskSizeInGb" \
"$startupScript" "" "" "" "$sshPrivateKey" "$startupScript" "" "" "" "never preemptible" "$sshPrivateKey"
fi fi
$metricsWriteDatapoint "testnet-deploy net-create-complete=1" $metricsWriteDatapoint "testnet-deploy net-create-complete=1"

View File

@ -840,6 +840,32 @@ stop() {
echo "Stopping nodes took $SECONDS seconds" echo "Stopping nodes took $SECONDS seconds"
} }
checkPremptibleInstances() {
  # The fullnodeIpList nodes may be preemptible instances that can disappear at
  # any time. Try to detect when a fullnode has been preempted to help the user
  # out: each address is pinged once and, on the first one that does not
  # answer, a warning is printed and the script exits with status 1.
  #
  # Of course this isn't airtight as an instance could always disappear
  # immediately after it's successfully pinged.
  #
  # Globals: fullnodeIpList (read)
  declare ipAddress
  declare -a pingArgs

  # "Send a single probe and give up after 4 seconds" is spelled differently
  # by the BSD-derived ping (macOS, FreeBSD) and by other implementations such
  # as Linux iputils, which has no -o option and treats -t as a TTL.
  case "$(uname)" in
  Darwin | FreeBSD)
    pingArgs=(-o -t 4)
    ;;
  *)
    pingArgs=(-c 1 -w 4)
    ;;
  esac

  for ipAddress in "${fullnodeIpList[@]}"; do
    (
      # Trace the exact ping command so the user can see what was probed
      set -x
      ping "${pingArgs[@]}" "$ipAddress"
    ) || {
      cat <<EOF
Warning: $ipAddress may have been preempted.
Run |./gce.sh config| to restart it
EOF
      exit 1
    }
  done
}
checkPremptibleInstances
case $command in case $command in
restart) restart)
prepare_deploy prepare_deploy

View File

@ -8,6 +8,10 @@ cloud_DefaultZone() {
echo "westus" echo "westus"
} }
cloud_RestartPreemptedInstances() {
  # Not implemented here: intentionally does nothing and reports success.
  return 0
}
# #
# __cloud_GetConfigValueFromInstanceName # __cloud_GetConfigValueFromInstanceName
# Return a piece of configuration information about an instance # Return a piece of configuration information about an instance

View File

@ -16,6 +16,10 @@ cloud_DefaultZone() {
echo "Denver" echo "Denver"
} }
cloud_RestartPreemptedInstances() {
  # Not implemented here: intentionally does nothing and reports success.
  return 0
}
# #
# __cloud_FindInstances # __cloud_FindInstances
# #
@ -134,6 +138,7 @@ cloud_Initialize() {
# has been provisioned in the GCE region that is hosting `$zone` # has been provisioned in the GCE region that is hosting `$zone`
# bootDiskType - Optional specify SSD or HDD boot disk # bootDiskType - Optional specify SSD or HDD boot disk
# additionalDiskSize - Optional specify size of additional storage volume # additionalDiskSize - Optional specify size of additional storage volume
# preemptible - Optionally request a preemptible instance ("true")
# #
# Tip: use cloud_FindInstances to locate the instances once this function # Tip: use cloud_FindInstances to locate the instances once this function
# returns # returns
@ -149,7 +154,8 @@ cloud_CreateInstances() {
#declare optionalAddress="$9" # unused #declare optionalAddress="$9" # unused
#declare optionalBootDiskType="${10}" # unused #declare optionalBootDiskType="${10}" # unused
#declare optionalAdditionalDiskSize="${11}" # unused #declare optionalAdditionalDiskSize="${11}" # unused
declare sshPrivateKey="${12}" #declare optionalPreemptible="${12}" # unused
declare sshPrivateKey="${13}"
declare -a nodes declare -a nodes
if [[ $numNodes = 1 ]]; then if [[ $numNodes = 1 ]]; then

View File

@ -7,6 +7,10 @@ cloud_DefaultZone() {
echo "us-east-1b" echo "us-east-1b"
} }
cloud_RestartPreemptedInstances() {
  # Not implemented here: intentionally does nothing and reports success.
  return 0
}
# AWS region is zone with the last character removed # AWS region is zone with the last character removed
__cloud_GetRegion() { __cloud_GetRegion() {
declare zone="$1" declare zone="$1"

View File

@ -8,6 +8,29 @@ cloud_DefaultZone() {
echo "us-west1-b" echo "us-west1-b"
} }
#
# cloud_RestartPreemptedInstances [namePrefix]
#
# Lists the instances selected by namePrefix and issues a
# `gcloud compute instances start` for every listing line that contains
# TERMINATED.
#
# namePrefix - Passed verbatim as the `--filter` of
#              `gcloud compute instances list`
#
cloud_RestartPreemptedInstances() {
  local namePrefix="$1"
  local instanceName instanceStatus instanceZone

  while read -r instanceName instanceStatus instanceZone; do
    printf 'Starting %s instance: %s\n' "$instanceStatus" "$instanceName"
    (
      # Trace the start command so the user can see what is being restarted
      set -x
      gcloud compute instances start --zone "$instanceZone" "$instanceName"
    )
  done < <(
    gcloud compute instances list \
      --filter "$namePrefix" \
      --format 'value(name,status,zone)' |
      grep TERMINATED
  )
}
# #
# __cloud_FindInstances # __cloud_FindInstances
# #
@ -125,6 +148,7 @@ cloud_Initialize() {
# has been provisioned in the GCE region that is hosting `$zone` # has been provisioned in the GCE region that is hosting `$zone`
# bootDiskType - Optional specify SSD or HDD boot disk # bootDiskType - Optional specify SSD or HDD boot disk
# additionalDiskSize - Optional specify size of additional storage volume # additionalDiskSize - Optional specify size of additional storage volume
# preemptible - Optionally request a preemptible instance ("true")
# #
# Tip: use cloud_FindInstances to locate the instances once this function # Tip: use cloud_FindInstances to locate the instances once this function
# returns # returns
@ -140,6 +164,8 @@ cloud_CreateInstances() {
declare optionalAddress="$9" declare optionalAddress="$9"
declare optionalBootDiskType="${10}" declare optionalBootDiskType="${10}"
declare optionalAdditionalDiskSize="${11}" declare optionalAdditionalDiskSize="${11}"
declare optionalPreemptible="${12}"
#declare sshPrivateKey="${13}" # unused
if $enableGpu; then if $enableGpu; then
# Custom Ubuntu 18.04 LTS image with CUDA 9.2 and CUDA 10.0 installed # Custom Ubuntu 18.04 LTS image with CUDA 9.2 and CUDA 10.0 installed
@ -176,6 +202,10 @@ cloud_CreateInstances() {
# shellcheck disable=SC2206 # Do not want to quote $imageName as it may contain extra args # shellcheck disable=SC2206 # Do not want to quote $imageName as it may contain extra args
args+=(--image $imageName) args+=(--image $imageName)
if [[ $optionalPreemptible = true ]]; then
args+=(--preemptible)
fi
# shellcheck disable=SC2206 # Do not want to quote $machineType as it may contain extra args # shellcheck disable=SC2206 # Do not want to quote $machineType as it may contain extra args
for word in $machineType; do for word in $machineType; do
# Special handling for the "--min-cpu-platform" argument which may contain a # Special handling for the "--min-cpu-platform" argument which may contain a