From 155ee8792f16395f2b3f34608756b67f7f852218 Mon Sep 17 00:00:00 2001 From: Michael Vines Date: Mon, 17 Sep 2018 08:25:10 -0700 Subject: [PATCH] Add GPU support to ec2-provider --- net/gce.sh | 19 ++++++++++++------- net/scripts/ec2-provider.sh | 28 +++++++++++++--------------- net/scripts/gce-provider.sh | 36 +++++++++++++++++------------------- 3 files changed, 42 insertions(+), 41 deletions(-) diff --git a/net/gce.sh b/net/gce.sh index 37b1d72a40..51d096ae67 100755 --- a/net/gce.sh +++ b/net/gce.sh @@ -11,7 +11,9 @@ gce) source "$here"/scripts/gce-provider.sh imageName="ubuntu-16-04-cuda-9-2-new" - leaderMachineType=n1-standard-16 + cpuLeaderMachineType=n1-standard-16 + gpuLeaderMachineType="$cpuLeaderMachineType --accelerator count=4,type=nvidia-tesla-k80" + leaderMachineType=$cpuLeaderMachineType validatorMachineType=n1-standard-4 clientMachineType=n1-standard-16 ;; @@ -19,8 +21,10 @@ ec2) # shellcheck source=net/scripts/ec2-provider.sh source "$here"/scripts/ec2-provider.sh - imageName="ami-04169656fea786776" - leaderMachineType=m4.4xlarge + imageName="ami-0466e26ccc0e752c1" + cpuLeaderMachineType=m4.4xlarge + gpuLeaderMachineType=p2.xlarge + leaderMachineType=$cpuLeaderMachineType validatorMachineType=m4.xlarge clientMachineType=m4.4xlarge ;; @@ -35,7 +39,7 @@ validatorNodeCount=5 clientNodeCount=1 leaderBootDiskSizeInGb=1000 validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb -clientBootDiskSizeInGb=40 +clientBootDiskSizeInGb=75 publicNetwork=false enableGpu=false @@ -111,6 +115,7 @@ while getopts "h?p:Pn:c:z:ga:" opt; do ;; g) enableGpu=true + leaderMachineType="$gpuLeaderMachineType" ;; a) leaderAddress=$OPTARG @@ -372,16 +377,16 @@ touch /.instance-startup-complete EOF cloud_CreateInstances "$prefix" "$prefix-leader" 1 \ - "$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" "$enableGpu" \ + "$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" \ "$startupScript" "$leaderAddress" cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \ - "$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" false \ + "$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" \ "$startupScript" "" if [[ $clientNodeCount -gt 0 ]]; then cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \ - "$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" false \ + "$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" \ "$startupScript" "" fi diff --git a/net/scripts/ec2-provider.sh b/net/scripts/ec2-provider.sh index 57700f3581..316501f777 100644 --- a/net/scripts/ec2-provider.sh +++ b/net/scripts/ec2-provider.sh @@ -104,8 +104,7 @@ cloud_FindInstance() { # # cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName] -# [machineType] [bootDiskSize] [enableGpu] -# [startupScript] [address] +# [machineType] [bootDiskSize] [startupScript] [address] # # Creates one more identical instances. # @@ -115,8 +114,6 @@ cloud_FindInstance() { # imageName - Disk image for the instances # machineType - GCE machine type # bootDiskSize - Optional size of the boot disk in GB -# enableGpu - Optionally enable GPU, use the value "true" to enable -# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80" # startupScript - Optional startup script to execute when the instance boots # address - Optional name of the GCE static IP address to attach to the # instance. Requires that |numNodes| = 1 and that addressName @@ -131,9 +128,8 @@ cloud_CreateInstances() { declare imageName="$4" declare machineType="$5" declare optionalBootDiskSize="$6" - declare optionalGpu="$7" - declare optionalStartupScript="$8" - declare optionalAddress="$9" + declare optionalStartupScript="$7" + declare optionalAddress="$8" __cloud_SshPrivateKeyCheck ( @@ -159,10 +155,6 @@ cloud_CreateInstances() { --block-device-mapping "[{\"DeviceName\": \"/dev/sda1\", \"Ebs\": { \"VolumeSize\": $optionalBootDiskSize }}]" ) fi - if [[ $optionalGpu = true ]]; then - echo TODO: GPU support not implemented yet - exit 1 - fi if [[ -n $optionalStartupScript ]]; then args+=( --user-data "file://$optionalStartupScript" @@ -189,10 +181,16 @@ cloud_CreateInstances() { declare instanceId IFS=: read -r instanceId _ < <(echo "${instances[0]}") - aws ec2 associate-address \ - --instance-id "$instanceId" \ - --region "region" \ - --allocation-id "$optionalAddress" + ( + set -x + # TODO: Poll that the instance has moved to the 'running' state instead of + # blindly sleeping for 30 seconds... + sleep 30 + aws ec2 associate-address \ + --instance-id "$instanceId" \ + --region "$region" \ + --allocation-id "$optionalAddress" + ) fi } diff --git a/net/scripts/gce-provider.sh b/net/scripts/gce-provider.sh index b52ea81e90..ea22d902b4 100644 --- a/net/scripts/gce-provider.sh +++ b/net/scripts/gce-provider.sh @@ -39,7 +39,7 @@ __cloud_FindInstances() { instances+=("$name:$publicIp:$privateIp") done < <(gcloud compute instances list \ - --filter="$filter" \ + --filter "$filter" \ --format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)') } # @@ -91,7 +91,9 @@ cloud_FindInstance() { # namePrefix - unique string to prefix all the instance names with # numNodes - number of instances to create # imageName - Disk image for the instances -# machineType - GCE machine type +# machineType - GCE machine type. Note that this may also include an +# `--accelerator=` or other |gcloud compute instances create| +# options # bootDiskSize - Optional size of the boot disk in GB # enableGpu - Optionally enable GPU, use the value "true" to enable # eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80" @@ -109,9 +111,8 @@ cloud_CreateInstances() { declare imageName="$4" declare machineType="$5" declare optionalBootDiskSize="$6" - declare optionalGpu="$7" - declare optionalStartupScript="$8" - declare optionalAddress="$9" + declare optionalStartupScript="$7" + declare optionalAddress="$8" declare nodes if [[ $numNodes = 1 ]]; then @@ -122,22 +123,19 @@ cloud_CreateInstances() { declare -a args args=( - "--zone=$zone" - "--tags=testnet" - "--metadata=testnet=$networkName" - "--image=$imageName" - "--machine-type=$machineType" + --zone "$zone" + --tags testnet + --metadata "testnet=$networkName" + --image "$imageName" + --maintenance-policy TERMINATE + --no-restart-on-failure ) + + # shellcheck disable=SC2206 # Do not want to quote $machineType as it may contain extra args + args+=(--machine-type $machineType) if [[ -n $optionalBootDiskSize ]]; then args+=( - "--boot-disk-size=${optionalBootDiskSize}GB" - ) - fi - if [[ $optionalGpu = true ]]; then - args+=( - "--accelerator=count=4,type=nvidia-tesla-k80" - --maintenance-policy TERMINATE - --restart-on-failure + --boot-disk-size "${optionalBootDiskSize}GB" ) fi if [[ -n $optionalStartupScript ]]; then @@ -152,7 +150,7 @@ cloud_CreateInstances() { exit 1 } args+=( - "--address=$optionalAddress" + --address "$optionalAddress" ) fi