Add AWS EC2 support
This commit is contained in:
parent
27986d7abb
commit
f89f121d2b
|
@ -5,15 +5,30 @@ intended to be both dev and CD friendly.
|
|||
|
||||
### User Account Prerequisites
|
||||
|
||||
Log in to GCP with:
|
||||
GCP and AWS are supported.
|
||||
|
||||
#### GCP
|
||||
First authenticate with
|
||||
```bash
|
||||
$ gcloud auth login
|
||||
```
|
||||
|
||||
Also ensure that `$(whoami)` is the name of an InfluxDB user account with enough
|
||||
access to create a new database.
|
||||
#### AWS
|
||||
Obtain your credentials from the AWS IAM Console and configure the AWS CLI with
|
||||
```bash
|
||||
$ aws configure
|
||||
```
|
||||
More information on AWS CLI configuration can be found [here](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html#cli-quick-configuration)
|
||||
|
||||
### Metrics configuration
|
||||
Ensure that `$(whoami)` is the name of an InfluxDB user account with enough
|
||||
access to create a new InfluxDB database. Ask mvines@ for help if needed.
|
||||
|
||||
## Quick Start
|
||||
|
||||
NOTE: This example uses GCP. If you are using AWS, replace `./gce.sh` with
|
||||
`./ec2.sh` in the commands.
|
||||
|
||||
```bash
|
||||
$ cd net/
|
||||
$ ./gce.sh create -n 5 -c 1 #<-- Create a GCE testnet with 5 validators, 1 client (billing starts here)
|
||||
|
@ -32,6 +47,10 @@ network over public IP addresses:
|
|||
```bash
|
||||
$ ./gce.sh create -P ...
|
||||
```
|
||||
or
|
||||
```bash
|
||||
$ ./ec2.sh create -P ...
|
||||
```
|
||||
|
||||
### Deploying a Snap-based network
|
||||
To deploy the latest pre-built `edge` channel Snap (ie, latest from the `master`
|
||||
|
@ -46,6 +65,10 @@ First ensure the network instances are created with GPU enabled:
|
|||
```bash
|
||||
$ ./gce.sh create -g ...
|
||||
```
|
||||
or
|
||||
```bash
|
||||
$ ./ec2.sh create -g ...
|
||||
```
|
||||
|
||||
If deploying a Snap-based network nothing further is required, as GPU presence
|
||||
is detected at runtime and the CUDA build is auto selected.
|
||||
|
@ -58,9 +81,20 @@ $ ./net.sh start -f "cuda,erasure"
|
|||
|
||||
### How to interact with a CD testnet deployed by ci/testnet-deploy.sh
|
||||
|
||||
**AWS-Specific Extra Setup**: Follow the steps in `scripts/add-solana-user-authorized_keys.sh`,
|
||||
then redeploy the testnet before continuing in this section.
|
||||
|
||||
Taking **master-testnet-solana-com** as an example, configure your workspace for
|
||||
the testnet using:
|
||||
```
|
||||
```bash
|
||||
$ ./gce.sh config -p master-testnet-solana-com
|
||||
$ ./ssh.sh # <-- Details on how to ssh into any testnet node
|
||||
```
|
||||
or
|
||||
```bash
|
||||
$ ./ec2.sh config -p master-testnet-solana-com
|
||||
```
|
||||
|
||||
Then run the following for details on how to ssh into any testnet node
|
||||
```bash
|
||||
$ ./ssh.sh
|
||||
```
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
gce.sh
|
204
net/gce.sh
204
net/gce.sh
|
@ -1,27 +1,44 @@
|
|||
#!/bin/bash -e
|
||||
|
||||
here=$(dirname "$0")
|
||||
# shellcheck source=net/scripts/gcloud.sh
|
||||
source "$here"/scripts/gcloud.sh
|
||||
# shellcheck source=net/common.sh
|
||||
source "$here"/common.sh
|
||||
|
||||
# The cloud provider is taken from the name this script was invoked as, with
# the .sh suffix removed.  Each provider supplies the cloud_* functions plus
# its own default image and machine types.
cloudProvider=$(basename "$0" .sh)
case $cloudProvider in
gce)
  # shellcheck source=net/scripts/gce-provider.sh
  source "$here"/scripts/gce-provider.sh

  imageName="ubuntu-16-04-cuda-9-2-new"
  leaderMachineType=n1-standard-16
  validatorMachineType=n1-standard-4
  clientMachineType=n1-standard-16
  ;;
ec2)
  # shellcheck source=net/scripts/ec2-provider.sh
  source "$here"/scripts/ec2-provider.sh

  imageName="ami-04169656fea786776"
  leaderMachineType=m4.4xlarge
  validatorMachineType=m4.xlarge
  clientMachineType=m4.4xlarge
  ;;
*)
  # Without a provider none of the cloud_* functions exist, so stop here
  # rather than failing later with a less helpful error.
  echo "Error: Unknown cloud provider: $cloudProvider" >&2
  exit 1
  ;;
esac
|
||||
|
||||
|
||||
prefix=testnet-dev-${USER//[^A-Za-z0-9]/}
|
||||
validatorNodeCount=5
|
||||
clientNodeCount=1
|
||||
leaderBootDiskSize=1TB
|
||||
leaderMachineType=n1-standard-16
|
||||
leaderAccelerator=
|
||||
validatorMachineType=n1-standard-4
|
||||
validatorBootDiskSize=$leaderBootDiskSize
|
||||
validatorAccelerator=
|
||||
clientMachineType=n1-standard-16
|
||||
clientBootDiskSize=40GB
|
||||
clientAccelerator=
|
||||
leaderBootDiskSizeInGb=1000
|
||||
validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb
|
||||
clientBootDiskSizeInGb=40
|
||||
|
||||
imageName="ubuntu-16-04-cuda-9-2-new"
|
||||
publicNetwork=false
|
||||
zone="us-west1-b"
|
||||
enableGpu=false
|
||||
leaderAddress=
|
||||
|
||||
usage() {
|
||||
|
@ -33,7 +50,7 @@ usage() {
|
|||
cat <<EOF
|
||||
usage: $0 [create|config|delete] [common options] [command-specific options]
|
||||
|
||||
Configure a GCE-based testnet
|
||||
Manage testnet instances
|
||||
|
||||
create - create a new testnet (implies 'config')
|
||||
config - configure the testnet and write a config file describing it
|
||||
|
@ -47,10 +64,13 @@ Configure a GCE-based testnet
|
|||
-n [number] - Number of validator nodes (default: $validatorNodeCount)
|
||||
-c [number] - Number of client nodes (default: $clientNodeCount)
|
||||
-P - Use public network IP addresses (default: $publicNetwork)
|
||||
-z [zone] - GCP Zone for the nodes (default: $zone)
|
||||
-i [imageName] - Existing image on GCE (default: $imageName)
|
||||
-g - Enable GPU
|
||||
-a [address] - Set the leader node's external IP address to this GCE address
|
||||
-z [zone] - Zone for the nodes (default: $zone)
|
||||
-g - Enable GPU (default: $enableGpu)
|
||||
-a [address] - Set the leader node's external IP address to this value.
|
||||
For GCE, [address] is the "name" of the desired External
|
||||
IP Address.
|
||||
For EC2, [address] is the "allocation ID" of the desired
|
||||
Elastic IP.
|
||||
|
||||
config-specific options:
|
||||
none
|
||||
|
@ -68,7 +88,7 @@ command=$1
|
|||
shift
|
||||
[[ $command = create || $command = config || $command = delete ]] || usage "Invalid command: $command"
|
||||
|
||||
while getopts "h?p:Pi:n:c:z:ga:" opt; do
|
||||
while getopts "h?p:Pn:c:z:ga:" opt; do
|
||||
case $opt in
|
||||
h | \?)
|
||||
usage
|
||||
|
@ -80,9 +100,6 @@ while getopts "h?p:Pi:n:c:z:ga:" opt; do
|
|||
P)
|
||||
publicNetwork=true
|
||||
;;
|
||||
i)
|
||||
imageName=$OPTARG
|
||||
;;
|
||||
n)
|
||||
validatorNodeCount=$OPTARG
|
||||
;;
|
||||
|
@ -90,10 +107,10 @@ while getopts "h?p:Pi:n:c:z:ga:" opt; do
|
|||
clientNodeCount=$OPTARG
|
||||
;;
|
||||
z)
|
||||
zone=$OPTARG
|
||||
cloud_SetZone "$OPTARG"
|
||||
;;
|
||||
g)
|
||||
leaderAccelerator="count=4,type=nvidia-tesla-k80"
|
||||
enableGpu=true
|
||||
;;
|
||||
a)
|
||||
leaderAddress=$OPTARG
|
||||
|
@ -108,6 +125,37 @@ shift $((OPTIND - 1))
|
|||
[[ -z $1 ]] || usage "Unexpected argument: $1"
|
||||
sshPrivateKey="$netConfigDir/id_$prefix"
|
||||
|
||||
|
||||
# cloud_ForEachInstance [cmd] [extra args to cmd]
|
||||
#
|
||||
# Execute a command for each element in the `instances` array
|
||||
#
|
||||
# cmd - The command to execute on each instance
|
||||
# The command will receive arguments followed by any
|
||||
# additional arguments supplied to cloud_ForEachInstance:
|
||||
# name - name of the instance
|
||||
# publicIp - The public IP address of this instance
|
||||
# privateIp - The private IP address of this instance
|
||||
# count - Monotonically increasing count for each
|
||||
# invocation of cmd, starting at 1
|
||||
# ... - Extra args to cmd..
|
||||
#
|
||||
#
|
||||
cloud_ForEachInstance() {
  declare cmd="$1"
  shift
  [[ -n $cmd ]] || { echo cloud_ForEachInstance: cmd not specified; exit 1; }

  declare info name publicIp privateIp quotedArgs
  declare count=1
  for info in "${instances[@]}"; do
    # Each entry of `instances` has the form "name:publicIp:privateIp"
    IFS=: read -r name publicIp privateIp <<<"$info"

    # eval re-parses its arguments, so every argument is shell-quoted first.
    # Otherwise an empty field (for example an instance without a public IP)
    # would vanish and shift the remaining positional arguments, and values
    # containing whitespace would be split into several words.
    printf -v quotedArgs '%q ' "$name" "$publicIp" "$privateIp" "$count" "$@"
    eval "$cmd $quotedArgs"
    count=$((count + 1))
  done
}
|
||||
|
||||
prepareInstancesAndWriteConfigFile() {
|
||||
$metricsWriteDatapoint "testnet-deploy net-config-begin=1"
|
||||
|
||||
|
@ -122,10 +170,10 @@ EOF
|
|||
|
||||
recordInstanceIp() {
|
||||
declare name="$1"
|
||||
declare publicIp="$3"
|
||||
declare privateIp="$4"
|
||||
declare publicIp="$2"
|
||||
declare privateIp="$3"
|
||||
|
||||
declare arrayName="$6"
|
||||
declare arrayName="$5"
|
||||
|
||||
echo "$arrayName+=($publicIp) # $name" >> "$configFile"
|
||||
if [[ $arrayName = "leaderIp" ]]; then
|
||||
|
@ -139,121 +187,133 @@ EOF
|
|||
|
||||
waitForStartupComplete() {
|
||||
declare name="$1"
|
||||
declare publicIp="$3"
|
||||
declare publicIp="$2"
|
||||
|
||||
echo "Waiting for $name to finish booting..."
|
||||
(
|
||||
for i in $(seq 1 30); do
|
||||
if (set -x; ssh "${sshOptions[@]}" "$publicIp" "test -f /.gce-startup-complete"); then
|
||||
if (set -x; ssh "${sshOptions[@]}" "$publicIp" "test -f /.instance-startup-complete"); then
|
||||
break
|
||||
fi
|
||||
sleep 2
|
||||
echo "Retry $i..."
|
||||
done
|
||||
)
|
||||
echo "$name has booted."
|
||||
}
|
||||
|
||||
echo "Looking for leader instance..."
|
||||
gcloud_FindInstances "name=$prefix-leader" show
|
||||
cloud_FindInstance "$prefix-leader"
|
||||
[[ ${#instances[@]} -eq 1 ]] || {
|
||||
echo "Unable to find leader"
|
||||
exit 1
|
||||
}
|
||||
|
||||
echo "Fetching $sshPrivateKey from $leaderName"
|
||||
(
|
||||
rm -rf "$sshPrivateKey"{,pub}
|
||||
|
||||
declare leaderName
|
||||
declare leaderZone
|
||||
declare leaderIp
|
||||
IFS=: read -r leaderName leaderZone leaderIp _ < <(echo "${instances[0]}")
|
||||
IFS=: read -r leaderName leaderIp _ < <(echo "${instances[0]}")
|
||||
|
||||
set -x
|
||||
# Try to ping the machine first.
|
||||
timeout 60s bash -c "set -o pipefail; until ping -c 3 $leaderIp | tr - _; do echo .; done"
|
||||
|
||||
# Try to ping the machine first. There can be a delay between when the
|
||||
# instance is reported as RUNNING and when it's reachable over the network
|
||||
timeout 30s bash -c "set -o pipefail; until ping -c 3 $leaderIp | tr - _; do echo .; done"
|
||||
if [[ ! -r $sshPrivateKey ]]; then
|
||||
echo "Fetching $sshPrivateKey from $leaderName"
|
||||
|
||||
# Try to scp in a couple times, sshd may not yet be up even though the
|
||||
# machine can be pinged...
|
||||
set -o pipefail
|
||||
for i in $(seq 1 10); do
|
||||
if gcloud compute scp --zone "$leaderZone" \
|
||||
"$leaderName:/solana-id_ecdsa" "$sshPrivateKey"; then
|
||||
set -x -o pipefail
|
||||
for i in $(seq 1 30); do
|
||||
if cloud_FetchFile "$leaderName" "$leaderIp" /solana-id_ecdsa "$sshPrivateKey"; then
|
||||
break
|
||||
fi
|
||||
|
||||
sleep 1
|
||||
echo "Retry $i..."
|
||||
done
|
||||
|
||||
chmod 400 "$sshPrivateKey"
|
||||
ls -l "$sshPrivateKey"
|
||||
fi
|
||||
)
|
||||
|
||||
echo "leaderIp=()" >> "$configFile"
|
||||
gcloud_ForEachInstance recordInstanceIp leaderIp
|
||||
gcloud_ForEachInstance waitForStartupComplete
|
||||
cloud_ForEachInstance recordInstanceIp leaderIp
|
||||
cloud_ForEachInstance waitForStartupComplete
|
||||
|
||||
echo "Looking for validator instances..."
|
||||
gcloud_FindInstances "name~^$prefix-validator" show
|
||||
cloud_FindInstances "$prefix-validator"
|
||||
[[ ${#instances[@]} -gt 0 ]] || {
|
||||
echo "Unable to find validators"
|
||||
exit 1
|
||||
}
|
||||
echo "validatorIpList=()" >> "$configFile"
|
||||
gcloud_ForEachInstance recordInstanceIp validatorIpList
|
||||
gcloud_ForEachInstance waitForStartupComplete
|
||||
cloud_ForEachInstance recordInstanceIp validatorIpList
|
||||
cloud_ForEachInstance waitForStartupComplete
|
||||
|
||||
echo "clientIpList=()" >> "$configFile"
|
||||
echo "Looking for client instances..."
|
||||
gcloud_FindInstances "name~^$prefix-client" show
|
||||
cloud_FindInstances "$prefix-client"
|
||||
[[ ${#instances[@]} -eq 0 ]] || {
|
||||
gcloud_ForEachInstance recordInstanceIp clientIpList
|
||||
gcloud_ForEachInstance waitForStartupComplete
|
||||
cloud_ForEachInstance recordInstanceIp clientIpList
|
||||
cloud_ForEachInstance waitForStartupComplete
|
||||
}
|
||||
|
||||
echo "Wrote $configFile"
|
||||
$metricsWriteDatapoint "testnet-deploy net-config-complete=1"
|
||||
}
|
||||
|
||||
case $command in
|
||||
delete)
|
||||
delete() {
|
||||
$metricsWriteDatapoint "testnet-deploy net-delete-begin=1"
|
||||
|
||||
# Delete the leader node first to prevent unusual metrics on the dashboard
|
||||
# during shutdown.
|
||||
# TODO: It would be better to fully cut-off metrics reporting before any
|
||||
# instances are deleted.
|
||||
for filter in "^$prefix-leader" "^$prefix-"; do
|
||||
gcloud_FindInstances "name~$filter"
|
||||
for filter in "$prefix-leader" "$prefix-"; do
|
||||
echo "Searching for instances: $filter"
|
||||
cloud_FindInstances "$filter"
|
||||
|
||||
if [[ ${#instances[@]} -eq 0 ]]; then
|
||||
echo "No instances found matching '$filter'"
|
||||
else
|
||||
gcloud_DeleteInstances true
|
||||
cloud_DeleteInstances true
|
||||
fi
|
||||
done
|
||||
rm -f "$configFile"
|
||||
|
||||
$metricsWriteDatapoint "testnet-deploy net-delete-complete=1"
|
||||
|
||||
}
|
||||
|
||||
case $command in
|
||||
delete)
|
||||
delete
|
||||
;;
|
||||
|
||||
create)
|
||||
[[ -n $validatorNodeCount ]] || usage "Need number of nodes"
|
||||
if [[ $validatorNodeCount -le 0 ]]; then
|
||||
usage "One or more validator nodes is required"
|
||||
fi
|
||||
|
||||
delete
|
||||
|
||||
$metricsWriteDatapoint "testnet-deploy net-create-begin=1"
|
||||
|
||||
rm -rf "$sshPrivateKey"{,.pub}
|
||||
ssh-keygen -t ecdsa -N '' -f "$sshPrivateKey"
|
||||
|
||||
# Note: using rsa because |aws ec2 import-key-pair| seems to fail for ecdsa
|
||||
ssh-keygen -t rsa -N '' -f "$sshPrivateKey"
|
||||
|
||||
printNetworkInfo() {
|
||||
cat <<EOF
|
||||
========================================================================================
|
||||
|
||||
Network composition:
|
||||
Leader = $leaderMachineType (GPU=${leaderAccelerator:-none})
|
||||
Validators = $validatorNodeCount x $validatorMachineType (GPU=${validatorAccelerator:-none})
|
||||
Client(s) = $clientNodeCount x $clientMachineType (GPU=${clientAccelerator:-none})
|
||||
Leader = $leaderMachineType (GPU=$enableGpu)
|
||||
Validators = $validatorNodeCount x $validatorMachineType
|
||||
Client(s) = $clientNodeCount x $clientMachineType
|
||||
|
||||
========================================================================================
|
||||
|
||||
|
@ -261,7 +321,7 @@ EOF
|
|||
}
|
||||
printNetworkInfo
|
||||
|
||||
declare startupScript="$netConfigDir"/gce-startup-script.sh
|
||||
declare startupScript="$netConfigDir"/instance-startup-script.sh
|
||||
cat > "$startupScript" <<EOF
|
||||
#!/bin/bash -ex
|
||||
# autogenerated at $(date)
|
||||
|
@ -270,11 +330,12 @@ cat > /etc/motd <<EOM
|
|||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
This instance has not been fully configured.
|
||||
See "startup-script" log messages in /var/log/syslog for status:
|
||||
$ sudo cat /var/log/syslog | grep startup-script
|
||||
|
||||
See startup script log messages in /var/log/syslog for status:
|
||||
$ sudo cat /var/log/syslog | egrep \\(startup-script\\|cloud-init\)
|
||||
|
||||
To block until setup is complete, run:
|
||||
$ until [[ -f /.gce-startup-complete ]]; do sleep 1; done
|
||||
$ until [[ -f /.instance-startup-complete ]]; do sleep 1; done
|
||||
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
EOM
|
||||
|
@ -296,6 +357,7 @@ $(
|
|||
cat \
|
||||
disable-background-upgrades.sh \
|
||||
create-solana-user.sh \
|
||||
add-solana-user-authorized_keys.sh \
|
||||
install-earlyoom.sh \
|
||||
install-libssl-compatability.sh \
|
||||
install-rsync.sh \
|
||||
|
@ -305,21 +367,21 @@ cat > /etc/motd <<EOM
|
|||
$(printNetworkInfo)
|
||||
EOM
|
||||
|
||||
touch /.gce-startup-complete
|
||||
touch /.instance-startup-complete
|
||||
|
||||
EOF
|
||||
|
||||
gcloud_CreateInstances "$prefix-leader" 1 "$zone" \
|
||||
"$imageName" "$leaderMachineType" "$leaderBootDiskSize" "$leaderAccelerator" \
|
||||
cloud_CreateInstances "$prefix" "$prefix-leader" 1 \
|
||||
"$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" "$enableGpu" \
|
||||
"$startupScript" "$leaderAddress"
|
||||
|
||||
gcloud_CreateInstances "$prefix-validator" "$validatorNodeCount" "$zone" \
|
||||
"$imageName" "$validatorMachineType" "$validatorBootDiskSize" "$validatorAccelerator" \
|
||||
cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \
|
||||
"$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" false \
|
||||
"$startupScript" ""
|
||||
|
||||
if [[ $clientNodeCount -gt 0 ]]; then
|
||||
gcloud_CreateInstances "$prefix-client" "$clientNodeCount" "$zone" \
|
||||
"$imageName" "$clientMachineType" "$clientBootDiskSize" "$clientAccelerator" \
|
||||
cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
|
||||
"$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" false \
|
||||
"$startupScript" ""
|
||||
fi
|
||||
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
#!/bin/bash -ex

# Preconditions: Linux host, running as root, and the solana user's ~/.ssh
# directory already exists.
if [[ $(uname) != Linux ]]; then
  exit 1
fi
if [[ $USER != root ]]; then
  exit 1
fi
if [[ ! -d /home/solana/.ssh ]]; then
  exit 1
fi

# /solana-authorized_keys contains the public keys for users that should
# automatically be granted access to ALL testnets.
#
# To add an entry into this list:
# 1. Run: ssh-keygen -t ecdsa -N '' -f ~/.ssh/id-solana-testnet
# 2. Inline ~/.ssh/id-solana-testnet.pub below
cat > /solana-authorized_keys <<EOF
ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBFBNwLw0i+rI312gWshojFlNw9NV7WfaKeeUsYADqOvM2o4yrO2pPw+sgW8W+/rPpVyH7zU9WVRgTME8NgFV1Vc=
EOF

# Append the keys as the solana user so authorized_keys keeps its ownership
sudo -u solana bash -c "
  cat /solana-authorized_keys >> /home/solana/.ssh/authorized_keys
"
|
|
@ -0,0 +1,242 @@
|
|||
# |source| this file
|
||||
#
|
||||
# Utilities for working with EC2 instances
|
||||
#
|
||||
|
||||
# Availability zone and region used by every function in this file.  Both are
# maintained by cloud_SetZone.
zone=
region=

# cloud_SetZone [zone]
#
# Selects the availability zone, and the region derived from it, for all
# subsequent operations.
cloud_SetZone() {
  zone="$1"
  # AWS region is zone with the last character removed
  # (e.g. "us-east-1b" -> "us-east-1").  ${zone%?} is used rather than
  # substring arithmetic so that an empty zone yields an empty region instead
  # of a "substring expression < 0" error.
  region="${zone%?}"
}

# Set the default zone
cloud_SetZone "us-east-1b"
|
||||
|
||||
# sshPrivateKey should be globally defined whenever this function is called.
|
||||
#
|
||||
# TODO: Remove usage of the sshPrivateKey global
|
||||
__cloud_SshPrivateKeyCheck() {
  # Abort the script unless the sshPrivateKey global is set...
  # shellcheck disable=SC2154
  [[ -n $sshPrivateKey ]] || {
    echo Error: sshPrivateKey not defined
    exit 1
  }
  # ...and names a file that can be read.
  [[ -r $sshPrivateKey ]] || {
    echo "Error: file is not readable: $sshPrivateKey"
    exit 1
  }
}
|
||||
|
||||
#
|
||||
# __cloud_FindInstances
|
||||
#
|
||||
# Find instances with name matching the specified pattern.
|
||||
#
|
||||
# For each matching instance, an entry in the `instances` array will be added with the
|
||||
# following information about the instance:
|
||||
# "name:public IP:private IP"
|
||||
#
|
||||
# filter - The instances to filter on
|
||||
#
|
||||
# examples:
|
||||
# $ __cloud_FindInstances "exact-machine-name"
|
||||
# $ __cloud_FindInstances "all-machines-with-a-common-machine-prefix*"
|
||||
#
|
||||
__cloud_FindInstances() {
  declare filter="$1"

  # Only pending/running instances whose "name" tag matches the filter are
  # listed; one line of "InstanceId PublicIpAddress PrivateIpAddress" each.
  declare -a describeArgs=(
    --region "$region"
    --filters
    "Name=tag:name,Values=$filter"
    "Name=instance-state-name,Values=pending,running"
    --query "Reservations[].Instances[].[InstanceId,PublicIpAddress,PrivateIpAddress]"
    --output text
  )

  instances=()
  declare name publicIp privateIp
  while read -r name publicIp privateIp; do
    printf "%-30s | publicIp=%-16s privateIp=%s\n" "$name" "$publicIp" "$privateIp"
    instances+=("$name:$publicIp:$privateIp")
  done < <(aws ec2 describe-instances "${describeArgs[@]}")
}
|
||||
|
||||
#
|
||||
# cloud_FindInstances [namePrefix]
|
||||
#
|
||||
# Find instances with names matching the specified prefix
|
||||
#
|
||||
# For each matching instance, an entry in the `instances` array will be added with the
|
||||
# following information about the instance:
|
||||
# "name:public IP:private IP"
|
||||
#
|
||||
# namePrefix - The instance name prefix to look for
|
||||
#
|
||||
# examples:
|
||||
# $ cloud_FindInstances all-machines-with-a-common-machine-prefix
|
||||
#
|
||||
cloud_FindInstances() {
  # A trailing "*" turns the prefix into a wildcard match on the name tag
  __cloud_FindInstances "${1}*"
}
|
||||
|
||||
#
|
||||
# cloud_FindInstance [name]
|
||||
#
|
||||
# Find an instance with a name matching the exact pattern.
|
||||
#
|
||||
# For each matching instance, an entry in the `instances` array will be added with the
|
||||
# following information about the instance:
|
||||
# "name:public IP:private IP"
|
||||
#
|
||||
# name - The instance name to look for
|
||||
#
|
||||
# examples:
|
||||
# $ cloud_FindInstance exact-machine-name
|
||||
#
|
||||
cloud_FindInstance() {
  # The name is passed through unchanged as the filter value
  __cloud_FindInstances "${1}"
}
|
||||
|
||||
|
||||
#
|
||||
# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
|
||||
# [machineType] [bootDiskSize] [enableGpu]
|
||||
# [startupScript] [address]
|
||||
#
|
||||
# Creates one or more identical instances.
|
||||
#
|
||||
# networkName - unique name of this testnet
|
||||
# namePrefix - unique string to prefix all the instance names with
|
||||
# numNodes - number of instances to create
|
||||
# imageName - Disk image for the instances
|
||||
# machineType - EC2 instance type
|
||||
# bootDiskSize - Optional size of the boot disk in GB
|
||||
# enableGpu - Optionally enable GPU, use the value "true" to enable
|
||||
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
|
||||
# startupScript - Optional startup script to execute when the instance boots
|
||||
# address - Optional allocation ID of the Elastic IP address to attach to the
|
||||
# instance. Requires that |numNodes| = 1 and that the Elastic IP
|
||||
# has been allocated in the AWS region that is hosting |zone|
|
||||
#
|
||||
# Tip: use cloud_FindInstances to locate the instances once this function
|
||||
# returns
|
||||
cloud_CreateInstances() {
  declare networkName="$1"
  declare namePrefix="$2"
  declare numNodes="$3"
  declare imageName="$4"
  declare machineType="$5"
  declare optionalBootDiskSize="$6"
  declare optionalGpu="$7"
  declare optionalStartupScript="$8"
  declare optionalAddress="$9"

  __cloud_SshPrivateKeyCheck
  (
    set -x
    # Replace the key pair named after the network with the current public key
    aws ec2 delete-key-pair --region "$region" --key-name "$networkName"
    aws ec2 import-key-pair --region "$region" --key-name "$networkName" \
      --public-key-material file://"${sshPrivateKey}".pub
  )

  declare -a args
  args=(
    --key-name "$networkName"
    --count "$numNodes"
    --region "$region"
    --placement "AvailabilityZone=$zone"
    --security-groups testnet
    --image-id "$imageName"
    --instance-type "$machineType"
    --tag-specifications "ResourceType=instance,Tags=[{Key=name,Value=$namePrefix}]"
  )
  if [[ -n $optionalBootDiskSize ]]; then
    args+=(
      --block-device-mapping "[{\"DeviceName\": \"/dev/sda1\", \"Ebs\": { \"VolumeSize\": $optionalBootDiskSize }}]"
    )
  fi
  if [[ $optionalGpu = true ]]; then
    echo TODO: GPU support not implemented yet
    exit 1
  fi
  if [[ -n $optionalStartupScript ]]; then
    args+=(
      --user-data "file://$optionalStartupScript"
    )
  fi

  # Validate the address request before anything is launched
  if [[ -n $optionalAddress ]]; then
    [[ $numNodes = 1 ]] || {
      echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
      exit 1
    }
  fi

  (
    set -x
    aws ec2 run-instances "${args[@]}"
  )

  if [[ -n $optionalAddress ]]; then
    # The address is attached after launch, so look the new instance up
    cloud_FindInstance "$namePrefix"
    if [[ ${#instances[@]} -ne 1 ]]; then
      echo "Failed to find newly created instance: $namePrefix"
      # Do not go on to associate the address with an empty instance id
      exit 1
    fi

    # The first field of an `instances` entry is the EC2 instance id
    declare instanceId
    IFS=: read -r instanceId _ <<<"${instances[0]}"
    aws ec2 associate-address \
      --instance-id "$instanceId" \
      --region "$region" \
      --allocation-id "$optionalAddress"
  fi
}
|
||||
|
||||
#
|
||||
# cloud_DeleteInstances
|
||||
#
|
||||
# Deletes all the instances listed in the `instances` array
|
||||
#
|
||||
cloud_DeleteInstances() {
  # Test the number of entries, not the string length of element 0
  if [[ ${#instances[@]} -eq 0 ]]; then
    echo No instances to delete
    return
  fi

  # Each entry starts with the instance id; drop everything from the first ":"
  declare -a names=("${instances[@]%%:*}")
  (
    set -x
    aws ec2 terminate-instances --region "$region" --instance-ids "${names[@]}"
  )
}
|
||||
|
||||
|
||||
#
|
||||
# cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]
|
||||
#
|
||||
# Fetch a file from the given instance. This function uses a cloud-specific
|
||||
# mechanism to fetch the file
|
||||
#
|
||||
cloud_FetchFile() {
  # shellcheck disable=SC2034 # instanceName is unused
  declare instanceName="$1"
  declare publicIp="$2"
  declare remoteFile="$3"
  declare localFile="$4"

  __cloud_SshPrivateKeyCheck

  # Copy as the solana user with the testnet key; host key checking is off
  # and the user's own ssh config is ignored (-F /dev/null).
  declare -a scpOptions=(
    -o "StrictHostKeyChecking=no"
    -o "UserKnownHostsFile=/dev/null"
    -o "User=solana"
    -o "IdentityFile=$sshPrivateKey"
    -o "LogLevel=ERROR"
    -F /dev/null
  )
  (
    set -x
    scp "${scpOptions[@]}" "solana@$publicIp:$remoteFile" "$localFile"
  )
}
|
|
@ -0,0 +1,201 @@
|
|||
# |source| this file
|
||||
#
|
||||
# Utilities for working with GCE instances
|
||||
#
|
||||
|
||||
# Default zone
|
||||
zone="us-west1-b"

# cloud_SetZone [zone]
#
# Records the zone that later operations in this file will use.
cloud_SetZone() {
  zone=$1
}
|
||||
|
||||
|
||||
#
|
||||
# __cloud_FindInstances
|
||||
#
|
||||
# Find instances matching the specified pattern.
|
||||
#
|
||||
# For each matching instance, an entry in the `instances` array will be added with the
|
||||
# following information about the instance:
|
||||
# "name:public IP:private IP"
|
||||
#
|
||||
# filter - The instances to filter on
|
||||
#
|
||||
# examples:
|
||||
# $ __cloud_FindInstances "name=exact-machine-name"
|
||||
# $ __cloud_FindInstances "name~^all-machines-with-a-common-machine-prefix"
|
||||
#
|
||||
__cloud_FindInstances() {
  declare filter="$1"
  instances=()

  # gcloud prints one line per instance: name, public IP, private IP, status.
  # No local `zone` is declared here: it is not part of the listing and would
  # only shadow the global zone inside this function.
  declare name publicIp privateIp status
  while read -r name publicIp privateIp status; do
    if [[ $status != RUNNING ]]; then
      echo "Warning: $name is not RUNNING, ignoring it."
      continue
    fi
    printf "%-30s | publicIp=%-16s privateIp=%s\n" "$name" "$publicIp" "$privateIp"

    instances+=("$name:$publicIp:$privateIp")
  done < <(gcloud compute instances list \
             --filter="$filter" \
             --format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
}
|
||||
#
|
||||
# cloud_FindInstances [namePrefix]
|
||||
#
|
||||
# Find instances with names matching the specified prefix
|
||||
#
|
||||
# For each matching instance, an entry in the `instances` array will be added with the
|
||||
# following information about the instance:
|
||||
# "name:public IP:private IP"
|
||||
#
|
||||
# namePrefix - The instance name prefix to look for
|
||||
#
|
||||
# examples:
|
||||
# $ cloud_FindInstances all-machines-with-a-common-machine-prefix
|
||||
#
|
||||
cloud_FindInstances() {
  # "name~^prefix" is a gcloud filter: regex match anchored at the name start
  __cloud_FindInstances "name~^${1}"
}
|
||||
|
||||
#
|
||||
# cloud_FindInstance [name]
|
||||
#
|
||||
# Find an instance with a name matching the exact pattern.
|
||||
#
|
||||
# For each matching instance, an entry in the `instances` array will be added with the
|
||||
# following information about the instance:
|
||||
# "name:public IP:private IP"
|
||||
#
|
||||
# name - The instance name to look for
|
||||
#
|
||||
# examples:
|
||||
# $ cloud_FindInstance exact-machine-name
|
||||
#
|
||||
cloud_FindInstance() {
  # "name=value" is a gcloud filter on the instance name
  __cloud_FindInstances "name=${1}"
}
|
||||
|
||||
#
|
||||
# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
|
||||
# [machineType] [bootDiskSize] [enableGpu]
|
||||
# [startupScript] [address]
|
||||
#
|
||||
# Creates one or more identical instances.
|
||||
#
|
||||
# networkName - unique name of this testnet
|
||||
# namePrefix - unique string to prefix all the instance names with
|
||||
# numNodes - number of instances to create
|
||||
# imageName - Disk image for the instances
|
||||
# machineType - GCE machine type
|
||||
# bootDiskSize - Optional size of the boot disk in GB
|
||||
# enableGpu - Optionally enable GPU, use the value "true" to enable
|
||||
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
|
||||
# startupScript - Optional startup script to execute when the instance boots
|
||||
# address - Optional name of the GCE static IP address to attach to the
|
||||
# instance. Requires that |numNodes| = 1 and that addressName
|
||||
# has been provisioned in the GCE region that is hosting `$zone`
|
||||
#
|
||||
# Tip: use cloud_FindInstances to locate the instances once this function
|
||||
# returns
|
||||
cloud_CreateInstances() {
  declare networkName="$1"
  declare namePrefix="$2"
  declare numNodes="$3"
  declare imageName="$4"
  declare machineType="$5"
  declare optionalBootDiskSize="$6"
  declare optionalGpu="$7"
  declare optionalStartupScript="$8"
  declare optionalAddress="$9"

  # A single node is named exactly |namePrefix|; multiple nodes get a
  # zero-padded numeric suffix as wide as |numNodes| (eg, 01..10).
  declare -a nodes=()
  if [[ $numNodes = 1 ]]; then
    nodes=("$namePrefix")
  else
    # Read seq's output line by line.  A here-string of the unquoted command
    # substitution only works where bash word-splits it; where the newlines
    # are kept, `read` stops after the first name and one node is created.
    declare node
    while IFS= read -r node; do
      nodes+=("$node")
    done < <(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes")
  fi

  declare -a args
  args=(
    "--zone=$zone"
    "--tags=testnet"
    "--metadata=testnet=$networkName"
    "--image=$imageName"
    "--machine-type=$machineType"
  )
  if [[ -n $optionalBootDiskSize ]]; then
    args+=(
      "--boot-disk-size=${optionalBootDiskSize}GB"
    )
  fi
  if [[ $optionalGpu = true ]]; then
    args+=(
      "--accelerator=count=4,type=nvidia-tesla-k80"
      --maintenance-policy TERMINATE
      --restart-on-failure
    )
  fi
  if [[ -n $optionalStartupScript ]]; then
    args+=(
      --metadata-from-file "startup-script=$optionalStartupScript"
    )
  fi

  if [[ -n $optionalAddress ]]; then
    # An address can only be given to one instance
    [[ $numNodes = 1 ]] || {
      echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
      exit 1
    }
    args+=(
      "--address=$optionalAddress"
    )
  fi

  (
    set -x
    gcloud beta compute instances create "${nodes[@]}" "${args[@]}"
  )
}
|
||||
|
||||
#
|
||||
# cloud_DeleteInstances
|
||||
#
|
||||
# Deletes all the instances listed in the `instances` array
|
||||
#
|
||||
cloud_DeleteInstances() {
  # Test the number of entries, not the string length of element 0
  if [[ ${#instances[@]} -eq 0 ]]; then
    echo No instances to delete
    return
  fi

  # Each entry starts with the instance name; drop everything from the
  # first ":"
  declare -a names=("${instances[@]%%:*}")

  (
    set -x
    gcloud beta compute instances delete --zone "$zone" --quiet "${names[@]}"
  )
}
|
||||
|
||||
|
||||
#
|
||||
# cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]
|
||||
#
|
||||
# Fetch a file from the given instance. This function uses a cloud-specific
|
||||
# mechanism to fetch the file
|
||||
#
|
||||
cloud_FetchFile() {
  declare instanceName="$1"
  # shellcheck disable=SC2034 # publicIp is unused
  declare publicIp="$2"
  declare remoteFile="$3"
  declare localFile="$4"

  # gcloud addresses the instance by name within the current zone, so the
  # public IP is not needed here.
  declare remoteSpec="$instanceName:$remoteFile"
  (
    set -x
    gcloud compute scp --zone "$zone" "$remoteSpec" "$localFile"
  )
}
|
|
@ -1,187 +0,0 @@
|
|||
# |source| this file
|
||||
#
|
||||
# Utilities for working with gcloud
|
||||
#
|
||||
|
||||
|
||||
#
|
||||
# gcloud_FindInstances [filter] [options]
|
||||
#
|
||||
# Find instances matching the specified pattern.
|
||||
#
|
||||
# For each matching instance, an entry in the `instances` array will be added with the
|
||||
# following information about the instance:
|
||||
# "name:zone:public IP:private IP"
|
||||
#
|
||||
# filter - The instances to filter on
|
||||
# options - If set to the string "show", the list of instances will be echoed
|
||||
# to stdout
|
||||
#
|
||||
# examples:
|
||||
# $ gcloud_FindInstances "name=exact-machine-name"
|
||||
# $ gcloud_FindInstances "name~^all-machines-with-a-common-machine-prefix"
|
||||
#
|
||||
gcloud_FindInstances() {
  local filter="$1"
  local options="$2"
  # Fields requested from gcloud, in the order the read loop consumes them.
  local listFormat='value(name,zone,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)'
  local name zone publicIp privateIp status

  # Results are published through the global `instances` array, which is
  # reset on every call.
  instances=()

  while read -r name zone publicIp privateIp status; do
    # Only instances reported as RUNNING are recorded.
    [[ $status = RUNNING ]] || {
      echo "Warning: $name is not RUNNING, ignoring it."
      continue
    }

    case $options in
      show)
        printf "%-30s | %-16s publicIp=%-16s privateIp=%s\n" "$name" "$zone" "$publicIp" "$privateIp"
        ;;
    esac

    instances+=("$name:$zone:$publicIp:$privateIp")
  done < <(gcloud compute instances list --filter="$filter" --format "$listFormat")
}
|
||||
|
||||
#
|
||||
# gcloud_ForEachInstance [cmd] [extra args to cmd]
|
||||
#
|
||||
# Execute a command for each element in the `instances` array
|
||||
#
|
||||
# cmd - The command to execute on each instance
|
||||
# The command will receive arguments followed by any
|
||||
# additional arguments supplied to gcloud_ForEachInstance:
|
||||
# name - name of the instance
|
||||
# zone - zone the instance is located in
|
||||
# publicIp - The public IP address of this instance
|
||||
# privateIp - The private IP address of this instance
|
||||
# count - Monotonically increasing count for each
|
||||
# invocation of cmd, starting at 1
|
||||
# ... - Extra args to cmd..
|
||||
#
|
||||
#
|
||||
gcloud_ForEachInstance() {
  # Runs `cmd` once per entry of the global `instances` array, passing it
  # name, zone, publicIp, privateIp, a 1-based invocation count, and then any
  # extra arguments given to this function.

  # Validate before shifting: `shift` fails when there are no positional
  # parameters, which under `set -e` would abort before the message prints.
  [[ -n ${1:-} ]] || { echo gcloud_ForEachInstance: cmd not specified; exit 1; }
  declare cmd="$1"
  shift

  # `info` is declared here so the loop variable does not leak into (or
  # clobber a same-named variable in) the caller's scope.
  declare info name zone publicIp privateIp
  declare count=1
  for info in "${instances[@]}"; do
    # Entries have the form "name:zone:publicIp:privateIp".
    IFS=: read -r name zone publicIp privateIp <<<"$info"

    # eval is kept so that `cmd` may itself be a multi-word command string;
    # note that it re-parses every argument, so values must not contain
    # shell metacharacters.
    eval "$cmd" "$name" "$zone" "$publicIp" "$privateIp" "$count" "$@"
    count=$((count + 1))
  done
}
|
||||
|
||||
#
|
||||
# gcloud_CreateInstances [namePrefix] [numNodes] [zone] [imageName]
|
||||
# [machineType] [bootDiskSize] [accelerator]
|
||||
# [startupScript] [address]
|
||||
#
|
||||
# Creates one or more identical instances.
|
||||
#
|
||||
# namePrefix - unique string to prefix all the instance names with
|
||||
# numNodes - number of instances to create
|
||||
# zone - zone to create the instances in
|
||||
# imageName - Disk image for the instances
|
||||
# machineType - GCE machine type
|
||||
# bootDiskSize - Optional size of the boot disk
|
||||
# accelerator - Optional accelerator to attach to the instance(s), see
|
||||
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
|
||||
# startupScript - Optional startup script to execute when the instance boots
|
||||
# address - Optional name of the GCE static IP address to attach to the
|
||||
# instance. Requires that |numNodes| = 1 and that addressName
|
||||
# has been provisioned in the GCE region that is hosting |zone|
|
||||
#
|
||||
# Tip: use gcloud_FindInstances to locate the instances once this function
|
||||
# returns
|
||||
gcloud_CreateInstances() {
  # Creates `numNodes` identical instances with a single gcloud invocation.
  # A single node is named exactly `namePrefix`; multiple nodes are named
  # `namePrefix` followed by a zero-padded index starting at 1.
  declare namePrefix="$1"
  declare numNodes="$2"
  declare zone="$3"
  declare imageName="$4"
  declare machineType="$5"
  declare optionalBootDiskSize="$6"
  declare optionalAccelerator="$7"
  declare optionalStartupScript="$8"
  declare optionalAddress="$9"

  declare -a nodes=()
  if [[ $numNodes = 1 ]]; then
    nodes=("$namePrefix")
  else
    # seq emits one name per line. Collect them line by line rather than via
    # an unquoted here-string: bash 4.4+ does not word-split here-strings, so
    # `read -ra nodes <<<$(seq ...)` would only capture the first name.
    # The index is padded to the number of digits in numNodes.
    declare node
    while IFS= read -r node; do
      nodes+=("$node")
    done < <(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes")
  fi

  declare -a args
  args=(
    "--zone=$zone"
    "--tags=testnet"
    "--image=$imageName"
    "--machine-type=$machineType"
  )
  if [[ -n $optionalBootDiskSize ]]; then
    args+=(
      "--boot-disk-size=$optionalBootDiskSize"
    )
  fi
  if [[ -n $optionalAccelerator ]]; then
    # The maintenance and restart flags are only added alongside an
    # accelerator request.
    args+=(
      "--accelerator=$optionalAccelerator"
      --maintenance-policy TERMINATE
      --restart-on-failure
    )
  fi
  if [[ -n $optionalStartupScript ]]; then
    args+=(
      --metadata-from-file "startup-script=$optionalStartupScript"
    )
  fi

  if [[ -n $optionalAddress ]]; then
    # A single static address cannot be shared by several instances.
    [[ $numNodes = 1 ]] || {
      echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
      exit 1
    }
    args+=(
      "--address=$optionalAddress"
    )
  fi

  # Run in a subshell so that command tracing does not stay enabled for the
  # caller.
  (
    set -x
    gcloud beta compute instances create "${nodes[@]}" "${args[@]}"
  )
}
|
||||
|
||||
#
|
||||
# gcloud_DeleteInstances [yes]
|
||||
#
|
||||
# Deletes all the instances listed in the `instances` array
|
||||
#
|
||||
# If yes = "true", skip the delete confirmation
|
||||
#
|
||||
gcloud_DeleteInstances() {
  # Deletes every instance listed in the global `instances` array.
  # $1 - when set to "true", --quiet is passed to gcloud so the delete
  #      confirmation is skipped.

  # Count the array entries; `${#instances[0]}` would instead measure the
  # string length of the first entry.
  if [[ ${#instances[@]} -eq 0 ]]; then
    echo No instances to delete
    return
  fi

  # Entries have the form "name:zone:publicIp:privateIp"; drop everything
  # from the first ':' onward to keep only the name.
  declare -a names=("${instances[@]/:*/}")

  # Assume all instances are in the same zone
  # TODO: One day this assumption will be invalid
  declare zone
  IFS=: read -r _ zone _ <<<"${instances[0]}"

  # Build the option list as an array so the optional flag needs no unquoted
  # expansion; `${1:-}` keeps a missing argument safe under `set -u`.
  declare -a args=(--zone "$zone")
  if [[ ${1:-} = true ]]; then
    args+=(--quiet)
  fi

  # Run in a subshell so that command tracing does not stay enabled for the
  # caller.
  (
    set -x
    gcloud beta compute instances delete "${args[@]}" "${names[@]}"
  )
}
|
||||
|
Loading…
Reference in New Issue