Add GPU support to ec2-provider

This commit is contained in:
Michael Vines 2018-09-17 08:25:10 -07:00
parent f89f121d2b
commit 155ee8792f
3 changed files with 42 additions and 41 deletions

View File

@ -11,7 +11,9 @@ gce)
source "$here"/scripts/gce-provider.sh
imageName="ubuntu-16-04-cuda-9-2-new"
leaderMachineType=n1-standard-16
cpuLeaderMachineType=n1-standard-16
gpuLeaderMachineType="$cpuLeaderMachineType --accelerator count=4,type=nvidia-tesla-k80"
leaderMachineType=$cpuLeaderMachineType
validatorMachineType=n1-standard-4
clientMachineType=n1-standard-16
;;
@ -19,8 +21,10 @@ ec2)
# shellcheck source=net/scripts/ec2-provider.sh
source "$here"/scripts/ec2-provider.sh
imageName="ami-04169656fea786776"
leaderMachineType=m4.4xlarge
imageName="ami-0466e26ccc0e752c1"
cpuLeaderMachineType=m4.4xlarge
gpuLeaderMachineType=p2.xlarge
leaderMachineType=$cpuLeaderMachineType
validatorMachineType=m4.xlarge
clientMachineType=m4.4xlarge
;;
@ -35,7 +39,7 @@ validatorNodeCount=5
clientNodeCount=1
leaderBootDiskSizeInGb=1000
validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb
clientBootDiskSizeInGb=40
clientBootDiskSizeInGb=75
publicNetwork=false
enableGpu=false
@ -111,6 +115,7 @@ while getopts "h?p:Pn:c:z:ga:" opt; do
;;
g)
enableGpu=true
leaderMachineType="$gpuLeaderMachineType"
;;
a)
leaderAddress=$OPTARG
@ -372,16 +377,16 @@ touch /.instance-startup-complete
EOF
cloud_CreateInstances "$prefix" "$prefix-leader" 1 \
"$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" "$enableGpu" \
"$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" \
"$startupScript" "$leaderAddress"
cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \
"$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" false \
"$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" \
"$startupScript" ""
if [[ $clientNodeCount -gt 0 ]]; then
cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
"$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" false \
"$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" \
"$startupScript" ""
fi

View File

@ -104,8 +104,7 @@ cloud_FindInstance() {
#
# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
# [machineType] [bootDiskSize] [enableGpu]
# [startupScript] [address]
# [machineType] [bootDiskSize] [startupScript] [address]
#
# Creates one or more identical instances.
#
@ -115,8 +114,6 @@ cloud_FindInstance() {
# imageName - Disk image for the instances
# machineType - EC2 instance type
# bootDiskSize - Optional size of the boot disk in GB
# enableGpu - Optionally enable GPU, use the value "true" to enable
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
# startupScript - Optional startup script to execute when the instance boots
# address - Optional name of the GCE static IP address to attach to the
# instance. Requires that |numNodes| = 1 and that addressName
@ -131,9 +128,8 @@ cloud_CreateInstances() {
declare imageName="$4"
declare machineType="$5"
declare optionalBootDiskSize="$6"
declare optionalGpu="$7"
declare optionalStartupScript="$8"
declare optionalAddress="$9"
declare optionalStartupScript="$7"
declare optionalAddress="$8"
__cloud_SshPrivateKeyCheck
(
@ -159,10 +155,6 @@ cloud_CreateInstances() {
--block-device-mapping "[{\"DeviceName\": \"/dev/sda1\", \"Ebs\": { \"VolumeSize\": $optionalBootDiskSize }}]"
)
fi
if [[ $optionalGpu = true ]]; then
echo TODO: GPU support not implemented yet
exit 1
fi
if [[ -n $optionalStartupScript ]]; then
args+=(
--user-data "file://$optionalStartupScript"
@ -189,10 +181,16 @@ cloud_CreateInstances() {
declare instanceId
IFS=: read -r instanceId _ < <(echo "${instances[0]}")
(
set -x
# TODO: Poll that the instance has moved to the 'running' state instead of
# blindly sleeping for 30 seconds...
sleep 30
aws ec2 associate-address \
--instance-id "$instanceId" \
--region "region" \
--region "$region" \
--allocation-id "$optionalAddress"
)
fi
}

View File

@ -39,7 +39,7 @@ __cloud_FindInstances() {
instances+=("$name:$publicIp:$privateIp")
done < <(gcloud compute instances list \
--filter="$filter" \
--filter "$filter" \
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
}
#
@ -91,7 +91,9 @@ cloud_FindInstance() {
# namePrefix - unique string to prefix all the instance names with
# numNodes - number of instances to create
# imageName - Disk image for the instances
# machineType - GCE machine type
# machineType - GCE machine type. Note that this may also include an
# `--accelerator=` or other |gcloud compute instances create|
# options
# bootDiskSize - Optional size of the boot disk in GB
# (GPUs are no longer a separate argument; request them through machineType,
# eg, 4 K80 GPUs with "n1-standard-16 --accelerator count=4,type=nvidia-tesla-k80")
@ -109,9 +111,8 @@ cloud_CreateInstances() {
declare imageName="$4"
declare machineType="$5"
declare optionalBootDiskSize="$6"
declare optionalGpu="$7"
declare optionalStartupScript="$8"
declare optionalAddress="$9"
declare optionalStartupScript="$7"
declare optionalAddress="$8"
declare nodes
if [[ $numNodes = 1 ]]; then
@ -122,22 +123,19 @@ cloud_CreateInstances() {
declare -a args
args=(
"--zone=$zone"
"--tags=testnet"
"--metadata=testnet=$networkName"
"--image=$imageName"
"--machine-type=$machineType"
--zone "$zone"
--tags testnet
--metadata "testnet=$networkName"
--image "$imageName"
--maintenance-policy TERMINATE
--no-restart-on-failure
)
# shellcheck disable=SC2206 # Do not want to quote $machineType as it may contain extra args
args+=(--machine-type $machineType)
if [[ -n $optionalBootDiskSize ]]; then
args+=(
"--boot-disk-size=${optionalBootDiskSize}GB"
)
fi
if [[ $optionalGpu = true ]]; then
args+=(
"--accelerator=count=4,type=nvidia-tesla-k80"
--maintenance-policy TERMINATE
--restart-on-failure
--boot-disk-size "${optionalBootDiskSize}GB"
)
fi
if [[ -n $optionalStartupScript ]]; then
@ -152,7 +150,7 @@ cloud_CreateInstances() {
exit 1
}
args+=(
"--address=$optionalAddress"
--address "$optionalAddress"
)
fi