From 9267931ef6d0f31bcf3827e897cacc5274610ad4 Mon Sep 17 00:00:00 2001 From: Michael Vines Date: Mon, 14 Oct 2019 23:11:29 -0700 Subject: [PATCH] Add support for preemptible GCP instances --- net/gce.sh | 19 +++++++++++++++---- net/net.sh | 26 ++++++++++++++++++++++++++ net/scripts/azure-provider.sh | 4 ++++ net/scripts/colo-provider.sh | 8 +++++++- net/scripts/ec2-provider.sh | 4 ++++ net/scripts/gce-provider.sh | 30 ++++++++++++++++++++++++++++++ 6 files changed, 86 insertions(+), 5 deletions(-) diff --git a/net/gce.sh b/net/gce.sh index cc7721872..0879b91c1 100755 --- a/net/gce.sh +++ b/net/gce.sh @@ -78,6 +78,7 @@ replicatorBootDiskSizeInGb=500 fullNodeAdditionalDiskSizeInGb= externalNodes=false failOnValidatorBootupFailure=true +preemptible=true publicNetwork=false letsEncryptDomainName= @@ -146,6 +147,11 @@ Manage testnet instances - Add an additional [number] GB SSD to all fullnodes to store the config directory. If not set, config will be written to the boot disk by default. Only supported on GCE. + --dedicated - Use dedicated instances for additional full nodes + (by default preemptible instances are used to reduce + cost). Note that the bootstrap leader, replicator, + blockstreamer and client nodes are always dedicated. 
+ config-specific options: -P - Use public network IP addresses (default: $publicNetwork) @@ -180,6 +186,9 @@ while [[ -n $1 ]]; do elif [[ $1 == --allow-boot-failures ]]; then failOnValidatorBootupFailure=false shift + elif [[ $1 == --dedicated ]]; then + preemptible=false + shift else usage "Unknown long option: $1" fi @@ -378,6 +387,8 @@ EOF buildSshOptions + cloud_RestartPreemptedInstances "$prefix" + fetchPrivateKey() { declare nodeName declare nodeIp @@ -725,7 +736,7 @@ EOF cloud_CreateInstances "$prefix" "$prefix-bootstrap-leader" 1 \ "$enableGpu" "$bootstrapLeaderMachineType" "${zones[0]}" "$fullNodeBootDiskSizeInGb" \ "$startupScript" "$bootstrapLeaderAddress" "$bootDiskType" "$fullNodeAdditionalDiskSizeInGb" \ - "$sshPrivateKey" + "never preemptible" "$sshPrivateKey" fi if [[ $additionalFullNodeCount -gt 0 ]]; then @@ -746,7 +757,7 @@ EOF cloud_CreateInstances "$prefix" "$prefix-$zone-fullnode" "$numNodesPerZone" \ "$enableGpu" "$fullNodeMachineType" "$zone" "$fullNodeBootDiskSizeInGb" \ "$startupScript" "" "$bootDiskType" "$fullNodeAdditionalDiskSizeInGb" \ - "$sshPrivateKey" & + "$preemptible" "$sshPrivateKey" & done wait @@ -755,7 +766,7 @@ EOF if [[ $clientNodeCount -gt 0 ]]; then cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \ "$enableGpu" "$clientMachineType" "${zones[0]}" "$clientBootDiskSizeInGb" \ - "$startupScript" "" "$bootDiskType" "" "$sshPrivateKey" + "$startupScript" "" "$bootDiskType" "" "never preemptible" "$sshPrivateKey" fi if $blockstreamer; then @@ -767,7 +778,7 @@ EOF if [[ $replicatorNodeCount -gt 0 ]]; then cloud_CreateInstances "$prefix" "$prefix-replicator" "$replicatorNodeCount" \ false "$replicatorMachineType" "${zones[0]}" "$replicatorBootDiskSizeInGb" \ - "$startupScript" "" "" "" "$sshPrivateKey" + "$startupScript" "" "" "" "never preemptible" "$sshPrivateKey" fi $metricsWriteDatapoint "testnet-deploy net-create-complete=1" diff --git a/net/net.sh b/net/net.sh index 79484d3e5..90a783d6a 100755 --- 
a/net/net.sh +++ b/net/net.sh @@ -840,6 +840,32 @@ stop() { echo "Stopping nodes took $SECONDS seconds" } + +checkPremptibleInstances() { + # The fullnodeIpList nodes may be preemptible instances that can disappear at + # any time. Try to detect when a fullnode has been preempted to help the user + # out. + # + # Of course this isn't airtight as an instance could always disappear + # immediately after it's successfully pinged. + for ipAddress in "${fullnodeIpList[@]}"; do + ( + set -x + ping -o -t 4 "$ipAddress" + ) || { + cat <