#!/bin/bash -e here=$(dirname "$0") # shellcheck source=net/common.sh source "$here"/common.sh cloudProvider=$(basename "$0" .sh) bootDiskType="" case $cloudProvider in gce) # shellcheck source=net/scripts/gce-provider.sh source "$here"/scripts/gce-provider.sh cpuImageName="ubuntu-1804-bionic-v20181029 --image-project ubuntu-os-cloud" # TODO: GPU image is still 16.04-based pending resolution of # https://github.com/solana-labs/solana/issues/1702 gpuImageName="ubuntu-16-04-cuda-9-2-new" imageName=$cpuImageName cpuLeaderMachineType=n1-standard-16 gpuLeaderMachineType="$cpuLeaderMachineType --accelerator count=4,type=nvidia-tesla-k80" leaderMachineType=$cpuLeaderMachineType validatorMachineType=n1-standard-16 clientMachineType=n1-standard-16 ;; ec2) # shellcheck source=net/scripts/ec2-provider.sh source "$here"/scripts/ec2-provider.sh # Deep Learning AMI (Ubuntu 16.04-based) cpuImageName="ami-0466e26ccc0e752c1" gpuImageName="$cpuImageName" imageName=$cpuImageName cpuLeaderMachineType=m4.4xlarge gpuLeaderMachineType=p2.xlarge leaderMachineType=$cpuLeaderMachineType validatorMachineType=m4.4xlarge clientMachineType=m4.4xlarge ;; *) echo "Error: Unknown cloud provider: $cloudProvider" ;; esac prefix=testnet-dev-${USER//[^A-Za-z0-9]/} validatorNodeCount=5 clientNodeCount=1 leaderBootDiskSizeInGb=1000 validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb clientBootDiskSizeInGb=75 publicNetwork=false enableGpu=false leaderAddress= usage() { exitcode=0 if [[ -n "$1" ]]; then exitcode=1 echo "Error: $*" fi cat <> "$configFile" <> "$configFile" if [[ $arrayName = "leaderIp" ]]; then if $publicNetwork; then echo "entrypointIp=$publicIp" >> "$configFile" else echo "entrypointIp=$privateIp" >> "$configFile" fi fi } waitForStartupComplete() { declare name="$1" declare publicIp="$2" echo "Waiting for $name to finish booting..." ( for i in $(seq 1 30); do if (set -x; ssh "${sshOptions[@]}" "$publicIp" "test -f /.instance-startup-complete"); then break fi sleep 2 echo "Retry $i..." done ) echo "$name has booted." } echo "Looking for leader instance..." cloud_FindInstance "$prefix-leader" [[ ${#instances[@]} -eq 1 ]] || { echo "Unable to find leader" exit 1 } ( declare leaderName declare leaderIp IFS=: read -r leaderName leaderIp _ < <(echo "${instances[0]}") # Try to ping the machine first. timeout 60s bash -c "set -o pipefail; until ping -c 3 $leaderIp | tr - _; do echo .; done" if [[ ! -r $sshPrivateKey ]]; then echo "Fetching $sshPrivateKey from $leaderName" # Try to scp in a couple times, sshd may not yet be up even though the # machine can be pinged... set -x -o pipefail for i in $(seq 1 30); do if cloud_FetchFile "$leaderName" "$leaderIp" /solana-id_ecdsa "$sshPrivateKey"; then break fi sleep 1 echo "Retry $i..." done chmod 400 "$sshPrivateKey" ls -l "$sshPrivateKey" fi ) echo "leaderIp=()" >> "$configFile" cloud_ForEachInstance recordInstanceIp leaderIp cloud_ForEachInstance waitForStartupComplete echo "Looking for validator instances..." cloud_FindInstances "$prefix-validator" [[ ${#instances[@]} -gt 0 ]] || { echo "Unable to find validators" exit 1 } echo "validatorIpList=()" >> "$configFile" cloud_ForEachInstance recordInstanceIp validatorIpList cloud_ForEachInstance waitForStartupComplete echo "clientIpList=()" >> "$configFile" echo "Looking for client instances..." cloud_FindInstances "$prefix-client" [[ ${#instances[@]} -eq 0 ]] || { cloud_ForEachInstance recordInstanceIp clientIpList cloud_ForEachInstance waitForStartupComplete } echo "Wrote $configFile" $metricsWriteDatapoint "testnet-deploy net-config-complete=1" } delete() { $metricsWriteDatapoint "testnet-deploy net-delete-begin=1" # Delete the leader node first to prevent unusual metrics on the dashboard # during shutdown. # TODO: It would be better to fully cut-off metrics reporting before any # instances are deleted. for filter in "$prefix-leader" "$prefix-"; do echo "Searching for instances: $filter" cloud_FindInstances "$filter" if [[ ${#instances[@]} -eq 0 ]]; then echo "No instances found matching '$filter'" else cloud_DeleteInstances true fi done rm -f "$configFile" $metricsWriteDatapoint "testnet-deploy net-delete-complete=1" } case $command in delete) delete ;; create) [[ -n $validatorNodeCount ]] || usage "Need number of nodes" if [[ $validatorNodeCount -le 0 ]]; then usage "One or more validator nodes is required" fi delete $metricsWriteDatapoint "testnet-deploy net-create-begin=1" rm -rf "$sshPrivateKey"{,.pub} # Note: using rsa because |aws ec2 import-key-pair| seems to fail for ecdsa ssh-keygen -t rsa -N '' -f "$sshPrivateKey" printNetworkInfo() { cat < "$startupScript" < /etc/motd < /solana-id_ecdsa < /solana-id_ecdsa.pub < /etc/motd <