From 4e7e5ace9d3665a6f52bd63a05c692021d98c6f0 Mon Sep 17 00:00:00 2001 From: Dan Albert Date: Tue, 23 Apr 2019 16:41:45 -0600 Subject: [PATCH] Add support for Azure instances in testnet creation (#3905) * Add support for Azure instances in testnet creation * Fixup * Fix shellcheck errors * More shellcheck and cleanup node creation and deletion * More shellcheck and cleanup node creation and deletion * Fixup instance wait API * Fix review comments and add GPU installation extension --- net/README.md | 2 +- net/azure.sh | 1 + net/gce.sh | 23 ++- net/net.sh | 2 +- net/scripts/azure-provider.sh | 306 ++++++++++++++++++++++++++++++++++ net/scripts/ec2-provider.sh | 13 ++ net/scripts/gce-provider.sh | 13 ++ 7 files changed, 354 insertions(+), 6 deletions(-) create mode 120000 net/azure.sh create mode 100755 net/scripts/azure-provider.sh mode change 100644 => 100755 net/scripts/ec2-provider.sh mode change 100644 => 100755 net/scripts/gce-provider.sh diff --git a/net/README.md b/net/README.md index 5eea314ee..4a1cd3a99 100644 --- a/net/README.md +++ b/net/README.md @@ -32,7 +32,7 @@ NOTE: This example uses GCE.
If you are using AWS EC2, replace `./gce.sh` with ```bash $ cd net/ $ ./gce.sh create -n 5 -c 1 #<-- Create a GCE testnet with 5 additional nodes (beyond the bootstrap node) and 1 client (billing starts here) -$ ./init-metrics.sh $(whoami) #<-- Configure a metrics database for the testnet +$ ./init-metrics.sh -c $(whoami) #<-- Configure a metrics database for the testnet $ ./net.sh start #<-- Deploy the network from the local workspace and start all clients with bench-tps $ ./ssh.sh #<-- Details on how to ssh into any testnet node to access logs/etc $ ./gce.sh delete #<-- Dispose of the network (billing stops here) diff --git a/net/azure.sh b/net/azure.sh new file mode 120000 index 000000000..91afe231f --- /dev/null +++ b/net/azure.sh @@ -0,0 +1 @@ +gce.sh \ No newline at end of file diff --git a/net/gce.sh b/net/gce.sh index e1dae7837..4d5a07884 100755 --- a/net/gce.sh +++ b/net/gce.sh @@ -30,6 +30,18 @@ ec2) clientMachineType=m4.2xlarge blockstreamerMachineType=m4.2xlarge ;; +azure) + # shellcheck source=net/scripts/azure-provider.sh + source "$here"/scripts/azure-provider.sh + + # TODO: Dial in machine types for Azure + cpuBootstrapLeaderMachineType=Standard_D16s_v3 + gpuBootstrapLeaderMachineType=Standard_NC12 + bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType + fullNodeMachineType=$cpuBootstrapLeaderMachineType + clientMachineType=Standard_D16s_v3 + blockstreamerMachineType=Standard_D16s_v3 + ;; *) echo "Error: Unknown cloud provider: $cloudProvider" ;; @@ -191,6 +203,8 @@ gce) ;; ec2) ;; +azure) + ;; *) echo "Error: Unknown cloud provider: $cloudProvider" ;; @@ -202,10 +216,10 @@ esac # # cmd - The command to execute on each instance # The command will receive arguments followed by any -# additionl arguments supplied to cloud_ForEachInstance: +# additional arguments supplied to cloud_ForEachInstance: # name - name of the instance # publicIp - The public IP address of this instance -# privateIp - The priate IP address of this instance +# privateIp - 
The private IP address of this instance # count - Monotonically increasing count for each # invocation of cmd, starting at 1 # ... - Extra args to cmd.. @@ -293,8 +307,9 @@ EOF declare nodeZone IFS=: read -r nodeName nodeIp _ nodeZone < <(echo "${instances[0]}") - # Try to ping the machine first. - timeout 90s bash -c "set -o pipefail; until ping -c 3 $nodeIp | tr - _; do echo .; done" + # Make sure the machine is alive or pingable + timeout_sec=90 + cloud_WaitForInstanceReady "$nodeName" "$nodeIp" "$nodeZone" "$timeout_sec" if [[ ! -r $sshPrivateKey ]]; then echo "Fetching $sshPrivateKey from $nodeName" diff --git a/net/net.sh b/net/net.sh index 96fe6e95b..e36ebc31b 100755 --- a/net/net.sh +++ b/net/net.sh @@ -302,7 +302,7 @@ startNode() { " ) >> "$logFile" 2>&1 & declare pid=$! - ln -sf "fullnode-$ipAddress.log" "$netLogDir/fullnode-$pid.log" + ln -sfT "fullnode-$ipAddress.log" "$netLogDir/fullnode-$pid.log" pids+=("$pid") } diff --git a/net/scripts/azure-provider.sh b/net/scripts/azure-provider.sh new file mode 100755 index 000000000..09b078cf5 --- /dev/null +++ b/net/scripts/azure-provider.sh @@ -0,0 +1,306 @@ +# |source| this file +# +# Utilities for working with Azure instances +# + +# Default zone +cloud_DefaultZone() { + echo "westus" +} + +# +# __cloud_GetConfigValueFromInstanceName +# Return a piece of configuration information about an instance +# Provide the exact name of an instance and the configuration key, and the corresponding value will be returned +# +# example: +# This will return the name of the resource group of the instance named +# __cloud_GetConfigValueFromInstanceName some-instance-name resourceGroup + +cloud_GetConfigValueFromInstanceName() { + query="[?name=='$1']" + key="[$2]" + config_value=$(az vm list -d -o tsv --query "$query.$key") +} + +cloud_GetResourceGroupFromInstanceName() { + resourceGroup=$(az vm list -o tsv --query "[?name=='$1'].[resourceGroup]") +} +cloud_GetIdFromInstanceName() { + id=$(az vm list -o tsv --query 
"[?name=='$1'].[id]") +} + +# +# __cloud_FindInstances +# +# Find instances matching the specified pattern. +# +# For each matching instance, an entry in the `instances` array will be added with the +# following information about the instance: +# "name:public IP:private IP:location" +# +# filter - The instances to filter on +# +# examples: +# $ __cloud_FindInstances prefix some-machine-prefix +# $ __cloud_FindInstances name exact-machine-name +# +# Examples of plain-text filter command +# +# This will return an exact match for a machine named pgnode +# az vm list -d --query "[?name=='pgnode'].[name,publicIps,privateIps,location]" +# +# This will return a match for any machine with prefix pgnode, ex: pgnode and pgnode2 +# az vm list -d --query "[?starts_with(name,'pgnode')].[name,publicIps,privateIps,location]" +__cloud_FindInstances() { + case $1 in + prefix) + query="[?starts_with(name,'$2')]" + ;; + name) + query="[?name=='$2']" + ;; + *) + echo "Unknown filter command: $1" + ;; + esac + + keys="[name,publicIps,privateIps,location]" + + instances=() + while read -r name publicIp privateIp location; do + instances+=("$name:$publicIp:$privateIp:$location") + done < <(az vm list -d -o tsv --query "$query.$keys") + echo "${instances[*]}" +} + +# +# cloud_FindInstances [namePrefix] +# +# Find instances with names matching the specified prefix +# +# For each matching instance, an entry in the `instances` array will be added with the +# following information about the instance: +# "name:public IP:private IP:location" +# +# namePrefix - The instance name prefix to look for +# +# examples: +# $ cloud_FindInstances all-machines-with-a-common-machine-prefix +# +cloud_FindInstances() { + __cloud_FindInstances prefix "$1" +} + +# +# cloud_FindInstance [name] +# +# Find an instance with a name matching the exact pattern. 
+# +# For each matching instance, an entry in the `instances` array will be added with the +# following information about the instance: +# "name:public IP:private IP:location" +# +# name - The instance name to look for +# +# examples: +# $ cloud_FindInstance exact-machine-name +# +cloud_FindInstance() { + __cloud_FindInstances name "$1" +} + +# +# cloud_Initialize [networkName] +# +# Perform one-time initialization that may be required for the given testnet. +# +# networkName - unique name of this testnet +# +# This function will be called before |cloud_CreateInstances| +cloud_Initialize() { + declare networkName="$1" + # ec2-provider.sh creates firewall rules programmatically, should do the same + # here. + echo "TODO: create $networkName firewall rules programmatically instead of assuming the 'testnet' tag exists" +} + +# +# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [enableGpu] +# [machineType] [zone] [bootDiskSize] +# [startupScript] [address] [bootDiskType] +# +# Creates one or more identical instances. +# +# networkName - unique name of this testnet +# namePrefix - unique string to prefix all the instance names with +# numNodes - number of instances to create +# zone - Azure location in which the instances are created +# machineType - Azure VM size, for example Standard_D16s_v3; it is handed to +# |az vm create| through the --size option +# bootDiskType - Optional, accepted but not configurable for Azure +# bootDiskSize - Optional size of the boot disk in GB +# enableGpu - Optionally enable GPU, use the value "true" to enable; +# enabling installs the NvidiaGpuDriverLinux VM extension on each instance +# startupScript - Optional startup script to execute when the instance boots +# address - Optional name of the GCE static IP address to attach to the +# instance.
Requires that |numNodes| = 1 and that addressName +# has been provisioned in the GCE region that is hosting `$zone` +# +# Tip: use cloud_FindInstances to locate the instances once this function +# returns +cloud_CreateInstances() { + declare networkName="$1" + declare namePrefix="$2" + declare numNodes="$3" + declare enableGpu="$4" + declare machineType="$5" + declare zone="$6" + declare optionalBootDiskSize="$7" + declare optionalStartupScript="$8" + declare optionalAddress="$9" + declare optionalBootDiskType="${10}" + + declare -a nodes + if [[ $numNodes = 1 ]]; then + nodes=("$namePrefix") + else + for node in $(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes"); do + nodes+=("$node") + done + fi + + declare -a args + args=( + --resource-group "$networkName" + --tags testnet + --image UbuntuLTS + --size "$machineType" + --location "$zone" + --generate-ssh-keys + ) + + if [[ -n $optionalBootDiskSize ]]; then + args+=( + --os-disk-size-gb "$optionalBootDiskSize" + ) + fi + if [[ -n $optionalStartupScript ]]; then + args+=( + --custom-data "$optionalStartupScript" + ) + fi + + if [[ -n $optionalBootDiskType ]]; then + echo Boot disk type not configurable + fi + + if [[ -n $optionalAddress ]]; then + [[ $numNodes = 1 ]] || { + echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress" + exit 1 + } + args+=( + --public-ip-address "$optionalAddress" + ) + fi + + ( + set -x + # 1: Check if resource group exists. If not, create it. + numGroup=$(az group list --query "length([?name=='$networkName'])") + if [[ $numGroup -eq 0 ]]; then + echo Resource Group "$networkName" does not exist. Creating it now. + az group create --name "$networkName" --location "$zone" + else + echo Resource group "$networkName" already exists. 
+ az group show --name "$networkName" + fi + + # 2: For node in numNodes, create VM and put the creation process in the background with --no-wait + for nodeName in "${nodes[@]}"; do + az vm create --name "$nodeName" "${args[@]}" --no-wait + done + + # 3: Wait until all nodes are created + for nodeName in "${nodes[@]}"; do + az vm wait --created --name "$nodeName" --resource-group "$networkName" + done + + # 4. If GPU is to be enabled, install the appropriate extension + if $enableGpu; then + for nodeName in "${nodes[@]}"; do + az vm extension set \ + --resource-group "$networkName" \ + --vm-name "$nodeName" \ + --name NvidiaGpuDriverLinux \ + --publisher Microsoft.HpcCompute \ + --version 1.2 \ + --no-wait + done + + # 5. Wait until all nodes have GPU extension installed + for nodeName in "${nodes[@]}"; do + az vm wait --updated --name "$nodeName" --resource-group "$networkName" + done + fi + ) +} + +# +# cloud_DeleteInstances +# +# Deletes all the instances listed in the `instances` array +# +cloud_DeleteInstances() { + if [[ ${#instances[0]} -eq 0 ]]; then + echo No instances to delete + return + fi + + declare names=("${instances[@]/:*/}") + ( + set -x + id_list=() + + # Build a space delimited list of all resource IDs to delete + for instance in "${names[@]}"; do + cloud_GetIdFromInstanceName "$instance" + id_list+=("$id") + done + + # Delete all instances in the id_list and return once they are all deleted + az vm delete --ids "${id_list[@]}" --yes --verbose + ) +} + +# +# cloud_WaitForInstanceReady [instanceName] [instanceIp] [instanceZone] [timeout] +# +# Return once the newly created VM instance is responding. This function is cloud-provider specific. 
+# +cloud_WaitForInstanceReady() { + declare instanceName="$1" +# declare instanceIp="$2" # unused +# declare instanceZone="$3" # unused + declare timeout="$4" + + cloud_GetResourceGroupFromInstanceName "$instanceName" + az vm wait -g "$resourceGroup" -n "$instanceName" --created --interval 10 --timeout "$timeout" +} + +# +# cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile] +# +# Fetch a file from the given instance. This function uses a cloud-specific +# mechanism to fetch the file +# +cloud_FetchFile() { + declare instanceName="$1" + declare publicIp="$2" + declare remoteFile="$3" + declare localFile="$4" + + cloud_GetConfigValueFromInstanceName "$instanceName" osProfile.adminUsername + scp "${config_value}@${publicIp}:${remoteFile}" "$localFile" +} diff --git a/net/scripts/ec2-provider.sh b/net/scripts/ec2-provider.sh old mode 100644 new mode 100755 index 05169003a..9f2af9585 --- a/net/scripts/ec2-provider.sh +++ b/net/scripts/ec2-provider.sh @@ -340,6 +340,19 @@ cloud_DeleteInstances() { done } +# +# cloud_WaitForInstanceReady [instanceName] [instanceIp] [instanceZone] [timeout] +# +# Return once the newly created VM instance is responding. This function is cloud-provider specific. +# +cloud_WaitForInstanceReady() { + declare instanceName="$1" + declare instanceIp="$2" +# declare instanceZone="$3" # unused + declare timeout="$4" + + timeout "${timeout}"s bash -c "set -o pipefail; until ping -c 3 $instanceIp | tr - _; do echo .; done" +} # # cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile] diff --git a/net/scripts/gce-provider.sh b/net/scripts/gce-provider.sh old mode 100644 new mode 100755 index c81031f37..e4546c7af --- a/net/scripts/gce-provider.sh +++ b/net/scripts/gce-provider.sh @@ -215,6 +215,19 @@ cloud_DeleteInstances() { done } +# +# cloud_WaitForInstanceReady [instanceName] [instanceIp] [instanceZone] [timeout] +# +# Return once the newly created VM instance is responding. This function is cloud-provider specific. 
+# +cloud_WaitForInstanceReady() { + declare instanceName="$1" + declare instanceIp="$2" +# declare instanceZone="$3" + declare timeout="$4" + + timeout "${timeout}"s bash -c "set -o pipefail; until ping -c 3 $instanceIp | tr - _; do echo .; done" +} # # cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]