Add AWS EC2 support

Michael Vines 2018-09-16 14:46:08 -07:00
parent 27986d7abb
commit f89f121d2b
7 changed files with 644 additions and 271 deletions

net/README.md

@ -5,15 +5,30 @@ intended to be both dev and CD friendly.
### User Account Prerequisites
GCP and AWS are supported.
#### GCP
First authenticate with:
```bash
$ gcloud auth login
```
#### AWS
Obtain your credentials from the AWS IAM Console and configure the AWS CLI with
```bash
$ aws configure
```
More information on AWS CLI configuration can be found [here](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html#cli-quick-configuration).
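As an optional sanity check (not required by the scripts here), the configured credentials can be verified with:
```bash
$ aws sts get-caller-identity        # prints the account and IAM identity in use
$ aws ec2 describe-regions --output table   # should succeed without prompting for credentials
```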
### Metrics configuration
Ensure that `$(whoami)` is the name of an InfluxDB user account with enough
access to create a new InfluxDB database. Ask mvines@ for help if needed.
## Quick Start
NOTE: This example uses GCP. If you are using AWS, replace `./gce.sh` with
`./ec2.sh` in the commands.
```bash
$ cd net/
$ ./gce.sh create -n 5 -c 1 #<-- Create a GCE testnet with 5 validators, 1 client (billing starts here)
@ -32,6 +47,10 @@ network over public IP addresses:
```bash
$ ./gce.sh create -P ...
```
or
```bash
$ ./ec2.sh create -P ...
```
### Deploying a Snap-based network
To deploy the latest pre-built `edge` channel Snap (ie, latest from the `master`
@ -46,6 +65,10 @@ First ensure the network instances are created with GPU enabled:
```bash
$ ./gce.sh create -g ...
```
or
```bash
$ ./ec2.sh create -g ...
```
If deploying a Snap-based network, nothing further is required, as GPU presence
is detected at runtime and the CUDA build is automatically selected.
@ -58,9 +81,20 @@ $ ./net.sh start -f "cuda,erasure"
### How to interact with a CD testnet deployed by ci/testnet-deploy.sh
**AWS-Specific Extra Setup**: Follow the steps in `scripts/add-solana-user-authorized_keys.sh`,
then redeploy the testnet before continuing in this section.
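For reference, the steps documented in that script amount to roughly the following (the key path is just an example):
```bash
$ ssh-keygen -t ecdsa -N '' -f ~/.ssh/id-solana-testnet
$ cat ~/.ssh/id-solana-testnet.pub  # inline this public key into scripts/add-solana-user-authorized_keys.sh
```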
Taking **master-testnet-solana-com** as an example, configure your workspace for
the testnet using:
```bash
$ ./gce.sh config -p master-testnet-solana-com
```
or
```bash
$ ./ec2.sh config -p master-testnet-solana-com
```
Then run the following for details on how to ssh into any testnet node:
```bash
$ ./ssh.sh
```

net/ec2.sh Symbolic link

@ -0,0 +1 @@
gce.sh

net/gce.sh

@ -1,27 +1,44 @@
#!/bin/bash -e
here=$(dirname "$0")
# shellcheck source=net/common.sh
source "$here"/common.sh
cloudProvider=$(basename "$0" .sh)
case $cloudProvider in
gce)
# shellcheck source=net/scripts/gce-provider.sh
source "$here"/scripts/gce-provider.sh
imageName="ubuntu-16-04-cuda-9-2-new"
leaderMachineType=n1-standard-16
validatorMachineType=n1-standard-4
clientMachineType=n1-standard-16
;;
ec2)
# shellcheck source=net/scripts/ec2-provider.sh
source "$here"/scripts/ec2-provider.sh
imageName="ami-04169656fea786776"
leaderMachineType=m4.4xlarge
validatorMachineType=m4.xlarge
clientMachineType=m4.4xlarge
;;
*)
echo "Error: Unknown cloud provider: $cloudProvider"
;;
esac
prefix=testnet-dev-${USER//[^A-Za-z0-9]/}
validatorNodeCount=5
clientNodeCount=1
leaderBootDiskSizeInGb=1000
validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb
clientBootDiskSizeInGb=40
imageName="ubuntu-16-04-cuda-9-2-new"
publicNetwork=false
zone="us-west1-b"
enableGpu=false
leaderAddress=
usage() {
@ -33,7 +50,7 @@ usage() {
cat <<EOF
usage: $0 [create|config|delete] [common options] [command-specific options]
Manage testnet instances
create - create a new testnet (implies 'config')
config - configure the testnet and write a config file describing it
@ -47,10 +64,13 @@ Configure a GCE-based testnet
-n [number] - Number of validator nodes (default: $validatorNodeCount)
-c [number] - Number of client nodes (default: $clientNodeCount)
-P - Use public network IP addresses (default: $publicNetwork)
-z [zone] - Zone for the nodes (default: $zone)
-g - Enable GPU (default: $enableGpu)
-a [address] - Set the leader node's external IP address to this value.
For GCE, [address] is the "name" of the desired External
IP Address.
For EC2, [address] is the "allocation ID" of the desired
Elastic IP.
config-specific options:
none
@ -68,7 +88,7 @@ command=$1
shift
[[ $command = create || $command = config || $command = delete ]] || usage "Invalid command: $command"
while getopts "h?p:Pi:n:c:z:ga:" opt; do
while getopts "h?p:Pn:c:z:ga:" opt; do
case $opt in
h | \?)
usage
@ -80,9 +100,6 @@ while getopts "h?p:Pi:n:c:z:ga:" opt; do
P)
publicNetwork=true
;;
n)
validatorNodeCount=$OPTARG
;;
@ -90,10 +107,10 @@ while getopts "h?p:Pi:n:c:z:ga:" opt; do
clientNodeCount=$OPTARG
;;
z)
cloud_SetZone "$OPTARG"
;;
g)
enableGpu=true
;;
a)
leaderAddress=$OPTARG
@ -108,6 +125,37 @@ shift $((OPTIND - 1))
[[ -z $1 ]] || usage "Unexpected argument: $1"
sshPrivateKey="$netConfigDir/id_$prefix"
# cloud_ForEachInstance [cmd] [extra args to cmd]
#
# Execute a command for each element in the `instances` array
#
# cmd - The command to execute on each instance
# The command will receive arguments followed by any
# additional arguments supplied to cloud_ForEachInstance:
# name - name of the instance
# publicIp - The public IP address of this instance
# privateIp - The private IP address of this instance
# count - Monotonically increasing count for each
# invocation of cmd, starting at 1
# ... - Extra args to cmd
#
#
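# example (illustrative; printInstance is a hypothetical callback):
# printInstance() { echo "[$4] $1 publicIp=$2 privateIp=$3 extra=$5"; }
# cloud_ForEachInstance printInstance some-extra-arg
#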
cloud_ForEachInstance() {
declare cmd="$1"
shift
[[ -n $cmd ]] || { echo cloud_ForEachInstance: cmd not specified; exit 1; }
declare count=1
for info in "${instances[@]}"; do
declare name publicIp privateIp
IFS=: read -r name publicIp privateIp < <(echo "$info")
eval "$cmd" "$name" "$publicIp" "$privateIp" "$count" "$@"
count=$((count + 1))
done
}
prepareInstancesAndWriteConfigFile() {
$metricsWriteDatapoint "testnet-deploy net-config-begin=1"
@ -122,10 +170,10 @@ EOF
recordInstanceIp() {
declare name="$1"
declare publicIp="$3"
declare privateIp="$4"
declare publicIp="$2"
declare privateIp="$3"
declare arrayName="$6"
declare arrayName="$5"
echo "$arrayName+=($publicIp) # $name" >> "$configFile"
if [[ $arrayName = "leaderIp" ]]; then
@ -139,121 +187,133 @@ EOF
waitForStartupComplete() {
declare name="$1"
declare publicIp="$3"
declare publicIp="$2"
echo "Waiting for $name to finish booting..."
(
for i in $(seq 1 30); do
if (set -x; ssh "${sshOptions[@]}" "$publicIp" "test -f /.instance-startup-complete"); then
break
fi
sleep 2
echo "Retry $i..."
done
)
echo "$name has booted."
}
echo "Looking for leader instance..."
gcloud_FindInstances "name=$prefix-leader" show
cloud_FindInstance "$prefix-leader"
[[ ${#instances[@]} -eq 1 ]] || {
echo "Unable to find leader"
exit 1
}
echo "Fetching $sshPrivateKey from $leaderName"
(
rm -rf "$sshPrivateKey"{,pub}
declare leaderName
declare leaderIp
IFS=: read -r leaderName leaderIp _ < <(echo "${instances[0]}")
set -x
# Try to ping the machine first. There can be a delay between when the
# instance is reported as RUNNING and when it's reachable over the network
timeout 30s bash -c "set -o pipefail; until ping -c 3 $leaderIp | tr - _; do echo .; done"
if [[ ! -r $sshPrivateKey ]]; then
echo "Fetching $sshPrivateKey from $leaderName"
# Try to scp in a couple times, sshd may not yet be up even though the
# machine can be pinged...
set -x -o pipefail
for i in $(seq 1 30); do
if cloud_FetchFile "$leaderName" "$leaderIp" /solana-id_ecdsa "$sshPrivateKey"; then
break
fi
chmod 400 "$sshPrivateKey"
sleep 1
echo "Retry $i..."
done
chmod 400 "$sshPrivateKey"
ls -l "$sshPrivateKey"
fi
)
echo "leaderIp=()" >> "$configFile"
cloud_ForEachInstance recordInstanceIp leaderIp
cloud_ForEachInstance waitForStartupComplete
echo "Looking for validator instances..."
gcloud_FindInstances "name~^$prefix-validator" show
cloud_FindInstances "$prefix-validator"
[[ ${#instances[@]} -gt 0 ]] || {
echo "Unable to find validators"
exit 1
}
echo "validatorIpList=()" >> "$configFile"
cloud_ForEachInstance recordInstanceIp validatorIpList
cloud_ForEachInstance waitForStartupComplete
echo "clientIpList=()" >> "$configFile"
echo "Looking for client instances..."
gcloud_FindInstances "name~^$prefix-client" show
cloud_FindInstances "$prefix-client"
[[ ${#instances[@]} -eq 0 ]] || {
cloud_ForEachInstance recordInstanceIp clientIpList
cloud_ForEachInstance waitForStartupComplete
}
echo "Wrote $configFile"
$metricsWriteDatapoint "testnet-deploy net-config-complete=1"
}
delete() {
$metricsWriteDatapoint "testnet-deploy net-delete-begin=1"
# Delete the leader node first to prevent unusual metrics on the dashboard
# during shutdown.
# TODO: It would be better to fully cut-off metrics reporting before any
# instances are deleted.
for filter in "^$prefix-leader" "^$prefix-"; do
gcloud_FindInstances "name~$filter"
for filter in "$prefix-leader" "$prefix-"; do
echo "Searching for instances: $filter"
cloud_FindInstances "$filter"
if [[ ${#instances[@]} -eq 0 ]]; then
echo "No instances found matching '$filter'"
else
cloud_DeleteInstances true
fi
done
rm -f "$configFile"
$metricsWriteDatapoint "testnet-deploy net-delete-complete=1"
}
case $command in
delete)
delete
;;
create)
[[ -n $validatorNodeCount ]] || usage "Need number of nodes"
if [[ $validatorNodeCount -le 0 ]]; then
usage "One or more validator nodes is required"
fi
delete
$metricsWriteDatapoint "testnet-deploy net-create-begin=1"
rm -rf "$sshPrivateKey"{,.pub}
# Note: using rsa because |aws ec2 import-key-pair| seems to fail for ecdsa
ssh-keygen -t rsa -N '' -f "$sshPrivateKey"
printNetworkInfo() {
cat <<EOF
========================================================================================
Network composition:
Leader = $leaderMachineType (GPU=$enableGpu)
Validators = $validatorNodeCount x $validatorMachineType
Client(s) = $clientNodeCount x $clientMachineType
========================================================================================
@ -261,7 +321,7 @@ EOF
}
printNetworkInfo
declare startupScript="$netConfigDir"/gce-startup-script.sh
declare startupScript="$netConfigDir"/instance-startup-script.sh
cat > "$startupScript" <<EOF
#!/bin/bash -ex
# autogenerated at $(date)
@ -270,11 +330,12 @@ cat > /etc/motd <<EOM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
This instance has not been fully configured.
See "startup-script" log messages in /var/log/syslog for status:
$ sudo cat /var/log/syslog | grep startup-script
See startup script log messages in /var/log/syslog for status:
$ sudo cat /var/log/syslog | egrep \\(startup-script\\|cloud-init\\)
To block until setup is complete, run:
$ until [[ -f /.instance-startup-complete ]]; do sleep 1; done
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
EOM
@ -296,6 +357,7 @@ $(
cat \
disable-background-upgrades.sh \
create-solana-user.sh \
add-solana-user-authorized_keys.sh \
install-earlyoom.sh \
install-libssl-compatability.sh \
install-rsync.sh \
@ -305,21 +367,21 @@ cat > /etc/motd <<EOM
$(printNetworkInfo)
EOM
touch /.instance-startup-complete
EOF
gcloud_CreateInstances "$prefix-leader" 1 "$zone" \
"$imageName" "$leaderMachineType" "$leaderBootDiskSize" "$leaderAccelerator" \
cloud_CreateInstances "$prefix" "$prefix-leader" 1 \
"$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" "$enableGpu" \
"$startupScript" "$leaderAddress"
gcloud_CreateInstances "$prefix-validator" "$validatorNodeCount" "$zone" \
"$imageName" "$validatorMachineType" "$validatorBootDiskSize" "$validatorAccelerator" \
cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \
"$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" false \
"$startupScript" ""
if [[ $clientNodeCount -gt 0 ]]; then
gcloud_CreateInstances "$prefix-client" "$clientNodeCount" "$zone" \
"$imageName" "$clientMachineType" "$clientBootDiskSize" "$clientAccelerator" \
cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
"$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" false \
"$startupScript" ""
fi

net/scripts/add-solana-user-authorized_keys.sh Normal file

@ -0,0 +1,20 @@
#!/bin/bash -ex
[[ $(uname) = Linux ]] || exit 1
[[ $USER = root ]] || exit 1
[[ -d /home/solana/.ssh ]] || exit 1
# /solana-authorized_keys contains the public keys for users that should
# automatically be granted access to ALL testnets.
#
# To add an entry into this list:
# 1. Run: ssh-keygen -t ecdsa -N '' -f ~/.ssh/id-solana-testnet
# 2. Inline ~/.ssh/id-solana-testnet.pub below
cat > /solana-authorized_keys <<EOF
ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBFBNwLw0i+rI312gWshojFlNw9NV7WfaKeeUsYADqOvM2o4yrO2pPw+sgW8W+/rPpVyH7zU9WVRgTME8NgFV1Vc=
EOF
sudo -u solana bash -c "
cat /solana-authorized_keys >> /home/solana/.ssh/authorized_keys
"

net/scripts/ec2-provider.sh Normal file

@ -0,0 +1,242 @@
# |source| this file
#
# Utilities for working with EC2 instances
#
zone=
region=
cloud_SetZone() {
zone="$1"
# AWS region is zone with the last character removed
region="${zone:0:$((${#zone} - 1))}"
}
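#
# example (illustrative): cloud_SetZone "us-west-2a" sets zone=us-west-2a and
# region=us-west-2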
# Set the default zone
cloud_SetZone "us-east-1b"
# sshPrivateKey should be globally defined whenever this function is called.
#
# TODO: Remove usage of the sshPrivateKey global
__cloud_SshPrivateKeyCheck() {
# shellcheck disable=SC2154
if [[ -z $sshPrivateKey ]]; then
echo Error: sshPrivateKey not defined
exit 1
fi
if [[ ! -r $sshPrivateKey ]]; then
echo "Error: file is not readable: $sshPrivateKey"
exit 1
fi
}
#
# __cloud_FindInstances
#
# Find instances with name matching the specified pattern.
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:public IP:private IP"
#
# filter - The instances to filter on
#
# examples:
# $ __cloud_FindInstances "exact-machine-name"
# $ __cloud_FindInstances "all-machines-with-a-common-machine-prefix*"
#
__cloud_FindInstances() {
declare filter="$1"
instances=()
declare name publicIp privateIp
while read -r name publicIp privateIp; do
printf "%-30s | publicIp=%-16s privateIp=%s\n" "$name" "$publicIp" "$privateIp"
instances+=("$name:$publicIp:$privateIp")
done < <(aws ec2 describe-instances \
--region "$region" \
--filters \
"Name=tag:name,Values=$filter" \
"Name=instance-state-name,Values=pending,running" \
--query "Reservations[].Instances[].[InstanceId,PublicIpAddress,PrivateIpAddress]" \
--output text
)
}
#
# cloud_FindInstances [namePrefix]
#
# Find instances with names matching the specified prefix
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:public IP:private IP"
#
# namePrefix - The instance name prefix to look for
#
# examples:
# $ cloud_FindInstances all-machines-with-a-common-machine-prefix
#
cloud_FindInstances() {
declare namePrefix="$1"
__cloud_FindInstances "$namePrefix*"
}
#
# cloud_FindInstance [name]
#
# Find an instance with a name matching the exact pattern.
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:public IP:private IP"
#
# name - The instance name to look for
#
# examples:
# $ cloud_FindInstance exact-machine-name
#
cloud_FindInstance() {
declare name="$1"
__cloud_FindInstances "$name"
}
#
# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
# [machineType] [bootDiskSize] [enableGpu]
# [startupScript] [address]
#
# Creates one or more identical instances.
#
# networkName - unique name of this testnet
# namePrefix - unique string to prefix all the instance names with
# numNodes - number of instances to create
# imageName - Disk image for the instances
# machineType - EC2 instance type
# bootDiskSize - Optional size of the boot disk in GB
# enableGpu - Optionally enable GPU, use the value "true" to enable
# (GPU support is not yet implemented for EC2)
# startupScript - Optional startup script to execute when the instance boots
# address - Optional allocation ID of an Elastic IP to associate with the
# instance. Requires that |numNodes| = 1 and that the Elastic IP
# has been allocated in the region hosting |zone|
#
# Tip: use cloud_FindInstances to locate the instances once this function
# returns
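#
# example (illustrative values only):
# cloud_CreateInstances testnet-dev-foo testnet-dev-foo-validator 5 \
# ami-04169656fea786776 m4.xlarge 1000 false "$startupScript" ""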
cloud_CreateInstances() {
declare networkName="$1"
declare namePrefix="$2"
declare numNodes="$3"
declare imageName="$4"
declare machineType="$5"
declare optionalBootDiskSize="$6"
declare optionalGpu="$7"
declare optionalStartupScript="$8"
declare optionalAddress="$9"
__cloud_SshPrivateKeyCheck
(
set -x
aws ec2 delete-key-pair --region "$region" --key-name "$networkName"
aws ec2 import-key-pair --region "$region" --key-name "$networkName" \
--public-key-material file://"${sshPrivateKey}".pub
)
declare -a args
args=(
--key-name "$networkName"
--count "$numNodes"
--region "$region"
--placement "AvailabilityZone=$zone"
--security-groups testnet
--image-id "$imageName"
--instance-type "$machineType"
--tag-specifications "ResourceType=instance,Tags=[{Key=name,Value=$namePrefix}]"
)
if [[ -n $optionalBootDiskSize ]]; then
args+=(
--block-device-mappings "[{\"DeviceName\": \"/dev/sda1\", \"Ebs\": { \"VolumeSize\": $optionalBootDiskSize }}]"
)
fi
if [[ $optionalGpu = true ]]; then
echo TODO: GPU support not implemented yet
exit 1
fi
if [[ -n $optionalStartupScript ]]; then
args+=(
--user-data "file://$optionalStartupScript"
)
fi
if [[ -n $optionalAddress ]]; then
[[ $numNodes = 1 ]] || {
echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
exit 1
}
fi
(
set -x
aws ec2 run-instances "${args[@]}"
)
if [[ -n $optionalAddress ]]; then
cloud_FindInstance "$namePrefix"
if [[ ${#instances[@]} -ne 1 ]]; then
echo "Failed to find newly created instance: $namePrefix"
fi
declare instanceId
IFS=: read -r instanceId _ < <(echo "${instances[0]}")
aws ec2 associate-address \
--instance-id "$instanceId" \
--region "region" \
--allocation-id "$optionalAddress"
fi
}
#
# cloud_DeleteInstances
#
# Deletes all the instances listed in the `instances` array
#
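# example (illustrative): populate `instances` first, then delete
# cloud_FindInstances testnet-dev-foo-
# cloud_DeleteInstances
#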
cloud_DeleteInstances() {
if [[ ${#instances[0]} -eq 0 ]]; then
echo No instances to delete
return
fi
declare names=("${instances[@]/:*/}")
(
set -x
aws ec2 terminate-instances --region "$region" --instance-ids "${names[@]}"
)
}
#
# cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]
#
# Fetch a file from the given instance. This function uses a cloud-specific
# mechanism to fetch the file
#
cloud_FetchFile() {
# shellcheck disable=SC2034 # instanceName is unused
declare instanceName="$1"
declare publicIp="$2"
declare remoteFile="$3"
declare localFile="$4"
__cloud_SshPrivateKeyCheck
(
set -x
scp \
-o "StrictHostKeyChecking=no" \
-o "UserKnownHostsFile=/dev/null" \
-o "User=solana" \
-o "IdentityFile=$sshPrivateKey" \
-o "LogLevel=ERROR" \
-F /dev/null \
"solana@$publicIp:$remoteFile" "$localFile"
)
}

net/scripts/gce-provider.sh Normal file

@ -0,0 +1,201 @@
# |source| this file
#
# Utilities for working with GCE instances
#
# Default zone
zone="us-west1-b"
cloud_SetZone() {
zone="$1"
}
#
# __cloud_FindInstances
#
# Find instances matching the specified pattern.
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:zone:public IP:private IP"
#
# filter - The instances to filter on
#
# examples:
# $ __cloud_FindInstances "name=exact-machine-name"
# $ __cloud_FindInstances "name~^all-machines-with-a-common-machine-prefix"
#
__cloud_FindInstances() {
declare filter="$1"
instances=()
declare name zone publicIp privateIp status
while read -r name publicIp privateIp status; do
if [[ $status != RUNNING ]]; then
echo "Warning: $name is not RUNNING, ignoring it."
continue
fi
printf "%-30s | publicIp=%-16s privateIp=%s\n" "$name" "$publicIp" "$privateIp"
instances+=("$name:$publicIp:$privateIp")
done < <(gcloud compute instances list \
--filter="$filter" \
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
}
#
# cloud_FindInstances [namePrefix]
#
# Find instances with names matching the specified prefix
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:public IP:private IP"
#
# namePrefix - The instance name prefix to look for
#
# examples:
# $ cloud_FindInstances all-machines-with-a-common-machine-prefix
#
cloud_FindInstances() {
declare namePrefix="$1"
__cloud_FindInstances "name~^$namePrefix"
}
#
# cloud_FindInstance [name]
#
# Find an instance with a name matching the exact pattern.
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:public IP:private IP"
#
# name - The instance name to look for
#
# examples:
# $ cloud_FindInstance exact-machine-name
#
cloud_FindInstance() {
declare name="$1"
__cloud_FindInstances "name=$name"
}
#
# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
# [machineType] [bootDiskSize] [enableGpu]
# [startupScript] [address]
#
# Creates one or more identical instances.
#
# networkName - unique name of this testnet
# namePrefix - unique string to prefix all the instance names with
# numNodes - number of instances to create
# imageName - Disk image for the instances
# machineType - GCE machine type
# bootDiskSize - Optional size of the boot disk in GB
# enableGpu - Optionally enable GPU, use the value "true" to enable
# (when enabled, 4 nvidia-tesla-k80 GPUs are attached)
# startupScript - Optional startup script to execute when the instance boots
# address - Optional name of the GCE static IP address to attach to the
# instance. Requires that |numNodes| = 1 and that addressName
# has been provisioned in the GCE region that is hosting `$zone`
#
# Tip: use cloud_FindInstances to locate the instances once this function
# returns
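#
# example (illustrative values only):
# cloud_CreateInstances testnet-dev-foo testnet-dev-foo-validator 5 \
# ubuntu-16-04-cuda-9-2-new n1-standard-4 1000 false "$startupScript" ""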
cloud_CreateInstances() {
declare networkName="$1"
declare namePrefix="$2"
declare numNodes="$3"
declare imageName="$4"
declare machineType="$5"
declare optionalBootDiskSize="$6"
declare optionalGpu="$7"
declare optionalStartupScript="$8"
declare optionalAddress="$9"
declare nodes
if [[ $numNodes = 1 ]]; then
nodes=("$namePrefix")
else
read -ra nodes <<<$(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes")
fi
declare -a args
args=(
"--zone=$zone"
"--tags=testnet"
"--metadata=testnet=$networkName"
"--image=$imageName"
"--machine-type=$machineType"
)
if [[ -n $optionalBootDiskSize ]]; then
args+=(
"--boot-disk-size=${optionalBootDiskSize}GB"
)
fi
if [[ $optionalGpu = true ]]; then
args+=(
"--accelerator=count=4,type=nvidia-tesla-k80"
--maintenance-policy TERMINATE
--restart-on-failure
)
fi
if [[ -n $optionalStartupScript ]]; then
args+=(
--metadata-from-file "startup-script=$optionalStartupScript"
)
fi
if [[ -n $optionalAddress ]]; then
[[ $numNodes = 1 ]] || {
echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
exit 1
}
args+=(
"--address=$optionalAddress"
)
fi
(
set -x
gcloud beta compute instances create "${nodes[@]}" "${args[@]}"
)
}
#
# cloud_DeleteInstances
#
# Deletes all the instances listed in the `instances` array
#
cloud_DeleteInstances() {
if [[ ${#instances[0]} -eq 0 ]]; then
echo No instances to delete
return
fi
declare names=("${instances[@]/:*/}")
(
set -x
gcloud beta compute instances delete --zone "$zone" --quiet "${names[@]}"
)
}
#
# cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]
#
# Fetch a file from the given instance. This function uses a cloud-specific
# mechanism to fetch the file
#
cloud_FetchFile() {
declare instanceName="$1"
# shellcheck disable=SC2034 # publicIp is unused
declare publicIp="$2"
declare remoteFile="$3"
declare localFile="$4"
(
set -x
gcloud compute scp --zone "$zone" "$instanceName:$remoteFile" "$localFile"
)
}

net/scripts/gcloud.sh Deleted file

@ -1,187 +0,0 @@
# |source| this file
#
# Utilities for working with gcloud
#
#
# gcloud_FindInstances [filter] [options]
#
# Find instances matching the specified pattern.
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:zone:public IP:private IP"
#
# filter - The instances to filter on
# options - If set to the string "show", the list of instances will be echoed
# to stdout
#
# examples:
# $ gcloud_FindInstances "name=exact-machine-name"
# $ gcloud_FindInstances "name~^all-machines-with-a-common-machine-prefix"
#
gcloud_FindInstances() {
declare filter="$1"
declare options="$2"
instances=()
declare name zone publicIp privateIp status
while read -r name zone publicIp privateIp status; do
if [[ $status != RUNNING ]]; then
echo "Warning: $name is not RUNNING, ignoring it."
continue
fi
if [[ $options = show ]]; then
printf "%-30s | %-16s publicIp=%-16s privateIp=%s\n" "$name" "$zone" "$publicIp" "$privateIp"
fi
instances+=("$name:$zone:$publicIp:$privateIp")
done < <(gcloud compute instances list \
--filter="$filter" \
--format 'value(name,zone,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
}
#
# gcloud_ForEachInstance [cmd] [extra args to cmd]
#
# Execute a command for each element in the `instances` array
#
# cmd - The command to execute on each instance
# The command will receive arguments followed by any
# additional arguments supplied to gcloud_ForEachInstance:
# name - name of the instance
# zone - zone the instance is located in
# publicIp - The public IP address of this instance
# privateIp - The private IP address of this instance
# count - Monotonically increasing count for each
# invocation of cmd, starting at 1
# ... - Extra args to cmd..
#
#
gcloud_ForEachInstance() {
declare cmd="$1"
shift
[[ -n $cmd ]] || { echo gcloud_ForEachInstance: cmd not specified; exit 1; }
declare count=1
for info in "${instances[@]}"; do
declare name zone publicIp privateIp
IFS=: read -r name zone publicIp privateIp < <(echo "$info")
eval "$cmd" "$name" "$zone" "$publicIp" "$privateIp" "$count" "$@"
count=$((count + 1))
done
}
#
# gcloud_CreateInstances [namePrefix] [numNodes] [zone] [imageName]
# [machineType] [bootDiskSize] [accelerator]
# [startupScript] [address]
#
# Creates one or more identical instances.
#
# namePrefix - unique string to prefix all the instance names with
# numNodes - number of instances to create
# zone - zone to create the instances in
# imageName - Disk image for the instances
# machineType - GCE machine type
# bootDiskSize - Optional size of the boot disk
# accelerator - Optional accelerator to attach to the instance(s), see
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
# startupScript - Optional startup script to execute when the instance boots
# address - Optional name of the GCE static IP address to attach to the
# instance. Requires that |numNodes| = 1 and that addressName
# has been provisioned in the GCE region that is hosting |zone|
#
# Tip: use gcloud_FindInstances to locate the instances once this function
# returns
gcloud_CreateInstances() {
declare namePrefix="$1"
declare numNodes="$2"
declare zone="$3"
declare imageName="$4"
declare machineType="$5"
declare optionalBootDiskSize="$6"
declare optionalAccelerator="$7"
declare optionalStartupScript="$8"
declare optionalAddress="$9"
declare nodes
if [[ $numNodes = 1 ]]; then
nodes=("$namePrefix")
else
read -ra nodes <<<$(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes")
fi
declare -a args
args=(
"--zone=$zone"
"--tags=testnet"
"--image=$imageName"
"--machine-type=$machineType"
)
if [[ -n $optionalBootDiskSize ]]; then
args+=(
"--boot-disk-size=$optionalBootDiskSize"
)
fi
if [[ -n $optionalAccelerator ]]; then
args+=(
"--accelerator=$optionalAccelerator"
--maintenance-policy TERMINATE
--restart-on-failure
)
fi
if [[ -n $optionalStartupScript ]]; then
args+=(
--metadata-from-file "startup-script=$optionalStartupScript"
)
fi
if [[ -n $optionalAddress ]]; then
[[ $numNodes = 1 ]] || {
echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
exit 1
}
args+=(
"--address=$optionalAddress"
)
fi
(
set -x
gcloud beta compute instances create "${nodes[@]}" "${args[@]}"
)
}
#
# gcloud_DeleteInstances [yes]
#
# Deletes all the instances listed in the `instances` array
#
# If yes = "true", skip the delete confirmation
#
gcloud_DeleteInstances() {
declare maybeQuiet=
if [[ $1 = true ]]; then
maybeQuiet=--quiet
fi
if [[ ${#instances[0]} -eq 0 ]]; then
echo No instances to delete
return
fi
declare names=("${instances[@]/:*/}")
# Assume all instances are in the same zone
# TODO: One day this assumption will be invalid
declare zone
IFS=: read -r _ zone _ < <(echo "${instances[0]}")
(
set -x
gcloud beta compute instances delete --zone "$zone" $maybeQuiet "${names[@]}"
)
}