Add GPU support to ec2-provider
This commit is contained in:
parent
f89f121d2b
commit
155ee8792f
19
net/gce.sh
19
net/gce.sh
|
@ -11,7 +11,9 @@ gce)
|
|||
source "$here"/scripts/gce-provider.sh
|
||||
|
||||
imageName="ubuntu-16-04-cuda-9-2-new"
|
||||
leaderMachineType=n1-standard-16
|
||||
cpuLeaderMachineType=n1-standard-16
|
||||
gpuLeaderMachineType="$cpuLeaderMachineType --accelerator count=4,type=nvidia-tesla-k80"
|
||||
leaderMachineType=$cpuLeaderMachineType
|
||||
validatorMachineType=n1-standard-4
|
||||
clientMachineType=n1-standard-16
|
||||
;;
|
||||
|
@ -19,8 +21,10 @@ ec2)
|
|||
# shellcheck source=net/scripts/ec2-provider.sh
|
||||
source "$here"/scripts/ec2-provider.sh
|
||||
|
||||
imageName="ami-04169656fea786776"
|
||||
leaderMachineType=m4.4xlarge
|
||||
imageName="ami-0466e26ccc0e752c1"
|
||||
cpuLeaderMachineType=m4.4xlarge
|
||||
gpuLeaderMachineType=p2.xlarge
|
||||
leaderMachineType=$cpuLeaderMachineType
|
||||
validatorMachineType=m4.xlarge
|
||||
clientMachineType=m4.4xlarge
|
||||
;;
|
||||
|
@ -35,7 +39,7 @@ validatorNodeCount=5
|
|||
clientNodeCount=1
|
||||
leaderBootDiskSizeInGb=1000
|
||||
validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb
|
||||
clientBootDiskSizeInGb=40
|
||||
clientBootDiskSizeInGb=75
|
||||
|
||||
publicNetwork=false
|
||||
enableGpu=false
|
||||
|
@ -111,6 +115,7 @@ while getopts "h?p:Pn:c:z:ga:" opt; do
|
|||
;;
|
||||
g)
|
||||
enableGpu=true
|
||||
leaderMachineType="$gpuLeaderMachineType"
|
||||
;;
|
||||
a)
|
||||
leaderAddress=$OPTARG
|
||||
|
@ -372,16 +377,16 @@ touch /.instance-startup-complete
|
|||
EOF
|
||||
|
||||
cloud_CreateInstances "$prefix" "$prefix-leader" 1 \
|
||||
"$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" "$enableGpu" \
|
||||
"$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" \
|
||||
"$startupScript" "$leaderAddress"
|
||||
|
||||
cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \
|
||||
"$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" false \
|
||||
"$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" \
|
||||
"$startupScript" ""
|
||||
|
||||
if [[ $clientNodeCount -gt 0 ]]; then
|
||||
cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
|
||||
"$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" false \
|
||||
"$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" \
|
||||
"$startupScript" ""
|
||||
fi
|
||||
|
||||
|
|
|
@ -104,8 +104,7 @@ cloud_FindInstance() {
|
|||
|
||||
#
|
||||
# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
|
||||
# [machineType] [bootDiskSize] [enableGpu]
|
||||
# [startupScript] [address]
|
||||
# [machineType] [bootDiskSize] [startupScript] [address]
|
||||
#
|
||||
# Creates one or more identical instances.
|
||||
#
|
||||
|
@ -115,8 +114,6 @@ cloud_FindInstance() {
|
|||
# imageName - Disk image for the instances
|
||||
# machineType - EC2 instance type
|
||||
# bootDiskSize - Optional size of the boot disk in GB
|
||||
# enableGpu - Optionally enable GPU, use the value "true" to enable
|
||||
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
|
||||
# startupScript - Optional startup script to execute when the instance boots
|
||||
# address - Optional allocation ID of the EC2 Elastic IP address to attach to the
|
||||
# instance. Requires that |numNodes| = 1 and that addressName
|
||||
|
@ -131,9 +128,8 @@ cloud_CreateInstances() {
|
|||
declare imageName="$4"
|
||||
declare machineType="$5"
|
||||
declare optionalBootDiskSize="$6"
|
||||
declare optionalGpu="$7"
|
||||
declare optionalStartupScript="$8"
|
||||
declare optionalAddress="$9"
|
||||
declare optionalStartupScript="$7"
|
||||
declare optionalAddress="$8"
|
||||
|
||||
__cloud_SshPrivateKeyCheck
|
||||
(
|
||||
|
@ -159,10 +155,6 @@ cloud_CreateInstances() {
|
|||
--block-device-mapping "[{\"DeviceName\": \"/dev/sda1\", \"Ebs\": { \"VolumeSize\": $optionalBootDiskSize }}]"
|
||||
)
|
||||
fi
|
||||
if [[ $optionalGpu = true ]]; then
|
||||
echo TODO: GPU support not implemented yet
|
||||
exit 1
|
||||
fi
|
||||
if [[ -n $optionalStartupScript ]]; then
|
||||
args+=(
|
||||
--user-data "file://$optionalStartupScript"
|
||||
|
@ -189,10 +181,16 @@ cloud_CreateInstances() {
|
|||
|
||||
declare instanceId
|
||||
IFS=: read -r instanceId _ < <(echo "${instances[0]}")
|
||||
aws ec2 associate-address \
|
||||
--instance-id "$instanceId" \
|
||||
--region "region" \
|
||||
--allocation-id "$optionalAddress"
|
||||
(
|
||||
set -x
|
||||
# TODO: Poll that the instance has moved to the 'running' state instead of
|
||||
# blindly sleeping for 30 seconds...
|
||||
sleep 30
|
||||
aws ec2 associate-address \
|
||||
--instance-id "$instanceId" \
|
||||
--region "$region" \
|
||||
--allocation-id "$optionalAddress"
|
||||
)
|
||||
fi
|
||||
}
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@ __cloud_FindInstances() {
|
|||
|
||||
instances+=("$name:$publicIp:$privateIp")
|
||||
done < <(gcloud compute instances list \
|
||||
--filter="$filter" \
|
||||
--filter "$filter" \
|
||||
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
|
||||
}
|
||||
#
|
||||
|
@ -91,7 +91,9 @@ cloud_FindInstance() {
|
|||
# namePrefix - unique string to prefix all the instance names with
|
||||
# numNodes - number of instances to create
|
||||
# imageName - Disk image for the instances
|
||||
# machineType - GCE machine type
|
||||
# machineType - GCE machine type. Note that this may also include an
|
||||
# `--accelerator=` or other |gcloud compute instances create|
|
||||
# options
|
||||
# bootDiskSize - Optional size of the boot disk in GB
|
||||
# enableGpu - Optionally enable GPU, use the value "true" to enable
|
||||
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
|
||||
|
@ -109,9 +111,8 @@ cloud_CreateInstances() {
|
|||
declare imageName="$4"
|
||||
declare machineType="$5"
|
||||
declare optionalBootDiskSize="$6"
|
||||
declare optionalGpu="$7"
|
||||
declare optionalStartupScript="$8"
|
||||
declare optionalAddress="$9"
|
||||
declare optionalStartupScript="$7"
|
||||
declare optionalAddress="$8"
|
||||
|
||||
declare nodes
|
||||
if [[ $numNodes = 1 ]]; then
|
||||
|
@ -122,22 +123,19 @@ cloud_CreateInstances() {
|
|||
|
||||
declare -a args
|
||||
args=(
|
||||
"--zone=$zone"
|
||||
"--tags=testnet"
|
||||
"--metadata=testnet=$networkName"
|
||||
"--image=$imageName"
|
||||
"--machine-type=$machineType"
|
||||
--zone "$zone"
|
||||
--tags testnet
|
||||
--metadata "testnet=$networkName"
|
||||
--image "$imageName"
|
||||
--maintenance-policy TERMINATE
|
||||
--no-restart-on-failure
|
||||
)
|
||||
|
||||
# shellcheck disable=SC2206 # Do not want to quote $machineType as it may contain extra args
|
||||
args+=(--machine-type $machineType)
|
||||
if [[ -n $optionalBootDiskSize ]]; then
|
||||
args+=(
|
||||
"--boot-disk-size=${optionalBootDiskSize}GB"
|
||||
)
|
||||
fi
|
||||
if [[ $optionalGpu = true ]]; then
|
||||
args+=(
|
||||
"--accelerator=count=4,type=nvidia-tesla-k80"
|
||||
--maintenance-policy TERMINATE
|
||||
--restart-on-failure
|
||||
--boot-disk-size "${optionalBootDiskSize}GB"
|
||||
)
|
||||
fi
|
||||
if [[ -n $optionalStartupScript ]]; then
|
||||
|
@ -152,7 +150,7 @@ cloud_CreateInstances() {
|
|||
exit 1
|
||||
}
|
||||
args+=(
|
||||
"--address=$optionalAddress"
|
||||
--address "$optionalAddress"
|
||||
)
|
||||
fi
|
||||
|
||||
|
|
Loading…
Reference in New Issue