Add GPU support to ec2-provider

2018-09-17 08:25:10 -07:00 · 2018-09-17 08:25:10 -07:00 · 155ee8792f
parent f89f121d2b
commit 155ee8792f
3 changed files with 42 additions and 41 deletions
--- a/net/gce.sh
+++ b/net/gce.sh
@ -11,7 +11,9 @@ gce)
  source "$here"/scripts/gce-provider.sh

  imageName="ubuntu-16-04-cuda-9-2-new"
-  leaderMachineType=n1-standard-16
+  cpuLeaderMachineType=n1-standard-16
+  gpuLeaderMachineType="$cpuLeaderMachineType --accelerator count=4,type=nvidia-tesla-k80"
+  leaderMachineType=$cpuLeaderMachineType
  validatorMachineType=n1-standard-4
  clientMachineType=n1-standard-16
  ;;
@ -19,8 +21,10 @@ ec2)
  # shellcheck source=net/scripts/ec2-provider.sh
  source "$here"/scripts/ec2-provider.sh

-  imageName="ami-04169656fea786776"
-  leaderMachineType=m4.4xlarge
+  imageName="ami-0466e26ccc0e752c1"
+  cpuLeaderMachineType=m4.4xlarge
+  gpuLeaderMachineType=p2.xlarge
+  leaderMachineType=$cpuLeaderMachineType
  validatorMachineType=m4.xlarge
  clientMachineType=m4.4xlarge
  ;;
@ -35,7 +39,7 @@ validatorNodeCount=5
 clientNodeCount=1
 leaderBootDiskSizeInGb=1000
 validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb
-clientBootDiskSizeInGb=40
+clientBootDiskSizeInGb=75

 publicNetwork=false
 enableGpu=false
@ -111,6 +115,7 @@ while getopts "h?p:Pn:c:z:ga:" opt; do
    ;;
  g)
    enableGpu=true
+    leaderMachineType="$gpuLeaderMachineType"
    ;;
  a)
    leaderAddress=$OPTARG
@ -372,16 +377,16 @@ touch /.instance-startup-complete
 EOF

  cloud_CreateInstances "$prefix" "$prefix-leader" 1 \
-    "$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" "$enableGpu" \
+    "$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" \
    "$startupScript" "$leaderAddress"

  cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \
-    "$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" false \
+    "$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" \
    "$startupScript" ""

  if [[ $clientNodeCount -gt 0 ]]; then
    cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
-      "$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" false \
+      "$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" \
      "$startupScript" ""
  fi

--- a/net/scripts/ec2-provider.sh
+++ b/net/scripts/ec2-provider.sh
@ -104,8 +104,7 @@ cloud_FindInstance() {

 #
 # cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
-#                       [machineType] [bootDiskSize] [enableGpu]
-#                       [startupScript] [address]
+#                       [machineType] [bootDiskSize] [startupScript] [address]
 #
 # Creates one or more identical instances.
 #
@ -115,8 +114,6 @@ cloud_FindInstance() {
 # imageName     - Disk image for the instances
 # machineType   - EC2 instance type
 # bootDiskSize  - Optional size of the boot disk in GB
-# enableGpu     - Optionally enable GPU, use the value "true" to enable
-#                 eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
 # startupScript - Optional startup script to execute when the instance boots
 # address       - Optional name of the GCE static IP address to attach to the
 #                 instance.  Requires that |numNodes| = 1 and that addressName
@ -131,9 +128,8 @@ cloud_CreateInstances() {
  declare imageName="$4"
  declare machineType="$5"
  declare optionalBootDiskSize="$6"
-  declare optionalGpu="$7"
-  declare optionalStartupScript="$8"
-  declare optionalAddress="$9"
+  declare optionalStartupScript="$7"
+  declare optionalAddress="$8"

  __cloud_SshPrivateKeyCheck
  (
@ -159,10 +155,6 @@ cloud_CreateInstances() {
      --block-device-mapping "[{\"DeviceName\": \"/dev/sda1\", \"Ebs\": { \"VolumeSize\": $optionalBootDiskSize }}]"
    )
  fi
-  if [[ $optionalGpu = true ]]; then
-    echo TODO: GPU support not implemented yet
-    exit 1
-  fi
  if [[ -n $optionalStartupScript ]]; then
    args+=(
      --user-data "file://$optionalStartupScript"
@ -189,10 +181,16 @@ cloud_CreateInstances() {

    declare instanceId
    IFS=: read -r instanceId _ < <(echo "${instances[0]}")
+    (
+      set -x
+      # TODO: Poll that the instance has moved to the 'running' state instead of
+      #       blindly sleeping for 30 seconds...
+      sleep 30
      aws ec2 associate-address \
        --instance-id "$instanceId" \
-      --region "region" \
+        --region "$region" \
        --allocation-id "$optionalAddress"
+    )
  fi
 }

--- a/net/scripts/gce-provider.sh
+++ b/net/scripts/gce-provider.sh
@ -39,7 +39,7 @@ __cloud_FindInstances() {

    instances+=("$name:$publicIp:$privateIp")
  done < <(gcloud compute instances list \
-             --filter="$filter" \
+             --filter "$filter" \
             --format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
 }
 #
@ -91,7 +91,9 @@ cloud_FindInstance() {
 # namePrefix    - unique string to prefix all the instance names with
 # numNodes      - number of instances to create
 # imageName     - Disk image for the instances
-# machineType   - GCE machine type
+# machineType   - GCE machine type.  Note that this may also include an
+#                 `--accelerator=` or other |gcloud compute instances create|
+#                 options
 # bootDiskSize  - Optional size of the boot disk in GB
 # enableGpu     - Optionally enable GPU, use the value "true" to enable
 #                 eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
@ -109,9 +111,8 @@ cloud_CreateInstances() {
  declare imageName="$4"
  declare machineType="$5"
  declare optionalBootDiskSize="$6"
-  declare optionalGpu="$7"
-  declare optionalStartupScript="$8"
-  declare optionalAddress="$9"
+  declare optionalStartupScript="$7"
+  declare optionalAddress="$8"

  declare nodes
  if [[ $numNodes = 1 ]]; then
@ -122,22 +123,19 @@ cloud_CreateInstances() {

  declare -a args
  args=(
-    "--zone=$zone"
-    "--tags=testnet"
-    "--metadata=testnet=$networkName"
-    "--image=$imageName"
-    "--machine-type=$machineType"
+    --zone "$zone"
+    --tags testnet
+    --metadata "testnet=$networkName"
+    --image "$imageName"
+    --maintenance-policy TERMINATE
+    --no-restart-on-failure
  )
+
+  # shellcheck disable=SC2206 # Do not want to quote $machineType as it may contain extra args
+  args+=(--machine-type $machineType)
  if [[ -n $optionalBootDiskSize ]]; then
    args+=(
-      "--boot-disk-size=${optionalBootDiskSize}GB"
-    )
-  fi
-  if [[ $optionalGpu = true ]]; then
-    args+=(
-      "--accelerator=count=4,type=nvidia-tesla-k80"
-      --maintenance-policy TERMINATE
-      --restart-on-failure
+      --boot-disk-size "${optionalBootDiskSize}GB"
    )
  fi
  if [[ -n $optionalStartupScript ]]; then
@ -152,7 +150,7 @@ cloud_CreateInstances() {
      exit 1
    }
    args+=(
-      "--address=$optionalAddress"
+      --address "$optionalAddress"
    )
  fi