diff --git a/multinode-demo/gce_multinode.sh b/multinode-demo/gce_multinode.sh deleted file mode 100755 index 42fd00872d..0000000000 --- a/multinode-demo/gce_multinode.sh +++ /dev/null @@ -1,109 +0,0 @@ -#!/bin/bash - -here=$(dirname "$0") -# shellcheck source=scripts/gcloud.sh -source "$here"/../scripts/gcloud.sh - -command=$1 -prefix= -num_nodes= -out_file= -image_name="ubuntu-16-04-cuda-9-2-new" -internalNetwork=false -zone="us-west1-b" - -shift - -usage() { - exitcode=0 - if [[ -n "$1" ]]; then - exitcode=1 - echo "Error: $*" - fi - cat < <-p prefix> <-n num_nodes> <-o file> [-i image-name] - -Manage a GCE multinode network - - create|delete - Create or delete the network - -p prefix - A common prefix for node names, to avoid collision - -n num_nodes - Number of nodes - -P - Use IP addresses on GCE internal/private network - -z - GCP Zone for the nodes (default $zone) - -o out_file - Used for create option. Outputs an array of IP addresses - of new nodes to the file - -i image_name - Existing image on GCE (default $image_name) - -EOF - exit $exitcode -} - -while getopts "h?p:Pi:n:z:o:" opt; do - case $opt in - h | \?) 
- usage - ;; - p) - prefix=$OPTARG - ;; - P) - internalNetwork=true - ;; - i) - image_name=$OPTARG - ;; - o) - out_file=$OPTARG - ;; - n) - num_nodes=$OPTARG - ;; - z) - zone=$OPTARG - ;; - *) - usage "Error: unhandled option: $opt" - ;; - esac -done - -set -e - -[[ -n $command ]] || usage "Need a command (create|delete)" - -[[ -n $prefix ]] || usage "Need a prefix for GCE instance names" - - -if [[ $command == "create" ]]; then - [[ -n $num_nodes ]] || usage "Need number of nodes" - [[ -n $out_file ]] || usage "Need an outfile to store IP Addresses" - - gcloud_CreateInstances "$prefix" "$num_nodes" "$zone" "$image_name" - gcloud_FindInstances "name~^$prefix" - - echo "ip_addr_array=()" > "$out_file" - recordPublicIp() { - declare name="$1" - declare publicIp="$3" - declare privateIp="$4" - - if $internalNetwork; then - echo "ip_addr_array+=($privateIp) # $name" >> "$out_file" - else - echo "ip_addr_array+=($publicIp) # $name" >> "$out_file" - fi - } - gcloud_ForEachInstance recordPublicIp - - echo "Instance ip addresses recorded in $out_file" -elif [[ $command == "delete" ]]; then - gcloud_FindInstances "name~^$prefix" - - if [[ ${#instances[@]} -eq 0 ]]; then - echo "No instances found matching '^$prefix'" - exit 0 - fi - gcloud_DeleteInstances -else - usage "Unknown command: $command" -fi diff --git a/multinode-demo/remote_leader.sh b/multinode-demo/remote_leader.sh deleted file mode 100755 index 1d4dda052b..0000000000 --- a/multinode-demo/remote_leader.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -e - -[[ -n $FORCE ]] || exit - -chmod 600 ~/.ssh/authorized_keys ~/.ssh/id_rsa - -PATH="$HOME"/.cargo/bin:"$PATH" - -./fetch-perf-libs.sh - -# Run setup -USE_INSTALL=1 ./multinode-demo/setup.sh -USE_INSTALL=1 ./multinode-demo/drone.sh >drone.log 2>&1 & -USE_INSTALL=1 SOLANA_CUDA=1 ./multinode-demo/leader.sh >leader.log 2>&1 & diff --git a/multinode-demo/remote_nodes.sh b/multinode-demo/remote_nodes.sh deleted file mode 100755 index 7c022e3a3f..0000000000 --- 
a/multinode-demo/remote_nodes.sh +++ /dev/null @@ -1,185 +0,0 @@ -#!/bin/bash - -command=$1 -ip_addr_file= -remote_user= -ssh_keys= - -shift - -usage() { - exitcode=0 - if [[ -n "$1" ]]; then - exitcode=1 - echo "Error: $*" - fi - cat < <-f IP Addr Array file> <-u username> [-k ssh-keys] - -Manage a GCE multinode network - - start|stop - Create or delete the network - -f file - A bash script that exports an array of IP addresses, ip_addr_array. - Elements of the array are public IP address of remote nodes. - -u username - The username for logging into remote nodes. - -k ssh-keys - Path to public/private key pair that remote nodes can use to perform - rsync and ssh among themselves. Must contain pub, and priv keys. - -EOF - exit $exitcode -} - -while getopts "h?f:u:k:" opt; do - case $opt in - h | \?) - usage - ;; - f) - ip_addr_file=$OPTARG - ;; - u) - remote_user=$OPTARG - ;; - k) - ssh_keys=$OPTARG - ;; - *) - usage "Error: unhandled option: $opt" - ;; - esac -done - -set -e - -# Sample IP Address array file contents -# ip_addr_array=(192.168.1.1 192.168.1.5 192.168.2.2) - -[[ -n $command ]] || usage "Need a command (start|stop)" -[[ -n $ip_addr_file ]] || usage "Need a file with IP address array" -[[ -n $remote_user ]] || usage "Need the username for remote nodes" - -ip_addr_array=() -# Get IP address array -# shellcheck source=/dev/null -source "$ip_addr_file" - -build_project() { - echo "Build started at $(date)" - SECONDS=0 - - # Build and install locally - PATH="$HOME"/.cargo/bin:"$PATH" - cargo install --force - - echo "Build took $SECONDS seconds" -} - -common_start_setup() { - ip_addr=$1 - - # Killing sshguard for now. 
TODO: Find a better solution - # sshguard is blacklisting IP address after ssh-keyscan and ssh login attempts - ssh "$remote_user@$ip_addr" " \ - set -ex; \ - sudo service sshguard stop; \ - sudo apt-get --assume-yes install rsync libssl-dev; \ - mkdir -p ~/.ssh ~/solana ~/.cargo/bin; \ - " >log/"$ip_addr".log - - # If provided, deploy SSH keys - if [[ -n $ssh_keys ]]; then - { - rsync -vPrz "$ssh_keys"/id_rsa "$remote_user@$ip_addr":~/.ssh/ - rsync -vPrz "$ssh_keys"/id_rsa.pub "$remote_user@$ip_addr":~/.ssh/ - rsync -vPrz "$ssh_keys"/id_rsa.pub "$remote_user@$ip_addr":~/.ssh/authorized_keys - rsync -vPrz ./multinode-demo "$remote_user@$ip_addr":~/solana/ - } >>log/"$ip_addr".log - fi -} - -start_leader() { - common_start_setup "$1" - - { - rsync -vPrz ~/.cargo/bin/solana* "$remote_user@$ip_addr":~/.cargo/bin/ - rsync -vPrz ./fetch-perf-libs.sh "$remote_user@$ip_addr":~/solana/ - ssh -n -f "$remote_user@$ip_addr" 'cd solana; FORCE=1 ./multinode-demo/remote_leader.sh' - } >>log/"$1".log - - leader_ip=$1 - leader_time=$SECONDS - SECONDS=0 -} - -start_validator() { - common_start_setup "$1" - - ssh -n -f "$remote_user@$ip_addr" "cd solana; FORCE=1 ./multinode-demo/remote_validator.sh $leader_ip" >>log/"$1".log -} - -start_all_nodes() { - echo "Deployment started at $(date)" - SECONDS=0 - count=0 - leader_ip= - leader_time= - - mkdir -p log - - for ip_addr in "${ip_addr_array[@]}"; do - if ((!count)); then - # Start the leader on the first node - echo "Leader node $ip_addr, killing previous instance and restarting" - start_leader "$ip_addr" - else - # Start validator on all other nodes - echo "Validator[$count] node $ip_addr, killing previous instance and restarting" - start_validator "$ip_addr" & - # TBD: Remove the sleep or reduce time once GCP login quota is increased - sleep 2 - fi - - ((count = count + 1)) - done - - wait - - ((validator_count = count - 1)) - - echo "Deployment finished at $(date)" - echo "Leader deployment too $leader_time seconds" - echo 
"$validator_count Validator deployment took $SECONDS seconds" -} - -stop_all_nodes() { - SECONDS=0 - local count=0 - for ip_addr in "${ip_addr_array[@]}"; do - ssh-keygen -R "$ip_addr" >log/local.log - ssh-keyscan "$ip_addr" >>~/.ssh/known_hosts 2>/dev/null - - echo "Stopping node[$count] $ip_addr. Remote user $remote_user" - - ssh -n -f "$remote_user@$ip_addr" " \ - set -ex; \ - sudo service sshguard stop; \ - pkill -9 solana-; \ - pkill -9 validator; \ - pkill -9 leader; \ - " - sleep 2 - ((count = count + 1)) - echo "Stopped node[$count] $ip_addr" - done - echo "Stopping $count nodes took $SECONDS seconds" -} - -if [[ $command == "start" ]]; then - build_project - stop_all_nodes - start_all_nodes -elif [[ $command == "stop" ]]; then - stop_all_nodes -else - usage "Unknown command: $command" -fi diff --git a/multinode-demo/remote_validator.sh b/multinode-demo/remote_validator.sh deleted file mode 100755 index cc764b3f58..0000000000 --- a/multinode-demo/remote_validator.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -e - -[[ -n $FORCE ]] || exit - -chmod 600 ~/.ssh/authorized_keys ~/.ssh/id_rsa - -PATH="$HOME"/.cargo/bin:"$PATH" - -touch ~/.ssh/known_hosts -ssh-keygen -R "$1" 2>/dev/null -ssh-keyscan "$1" >>~/.ssh/known_hosts 2>/dev/null - -rsync -vPrz "$1":~/.cargo/bin/solana* ~/.cargo/bin/ - -# Run setup -USE_INSTALL=1 ./multinode-demo/setup.sh -USE_INSTALL=1 ./multinode-demo/validator.sh "$1":~/solana "$1" >validator.log 2>&1 diff --git a/net/.gitignore b/net/.gitignore new file mode 100644 index 0000000000..f877c3594f --- /dev/null +++ b/net/.gitignore @@ -0,0 +1,2 @@ +/config/ +/log/ diff --git a/net/README.md b/net/README.md new file mode 100644 index 0000000000..8949c6f6bc --- /dev/null +++ b/net/README.md @@ -0,0 +1,29 @@ + +# Network Management +This directory contains scripts useful for working with a test network. It's +intended to be both dev and CD friendly. 
+ +### User Account Prerequisites + +Log in to GCP with: +```bash +$ gcloud auth login +``` + +Also ensure that `$(whoami)` is the name of an InfluxDB user account with enough +access to create a new database. + +You currently must be running on a Linux system (for now, TODO fix this) + +## Quick Start + +```bash +$ cd net/ + +$ ./gce.sh create -n 5 -c 1 #<-- Create a GCE testnet with 5 validators, 1 client (billing starts here) +$ ./init-metrics.sh $(whoami) #<-- Configure a metrics database for the testnet +$ ./net.sh start #<-- Deploy the network from the local workspace +$ ./ssh.sh #<-- Details on how to ssh into any testnet node +$ ./gce.sh delete #<-- Dispose of the network (billing stops here) +``` + diff --git a/net/common.sh b/net/common.sh new file mode 100644 index 0000000000..b0c6d2df0b --- /dev/null +++ b/net/common.sh @@ -0,0 +1,41 @@ +# |source| this file +# +# Common utilities shared by other scripts in this directory +# +# The following directive disable complaints about unused variables in this +# file: +# shellcheck disable=2034 +# + +netConfigDir="$(dirname "${BASH_SOURCE[0]}")"/config +netLogDir="$(dirname "${BASH_SOURCE[0]}")"/log +mkdir -p "$netConfigDir" "$netLogDir" + +configFile="$netConfigDir/config.sh" + +clientIpList=() +leaderIp= +sshPrivateKey= +sshUsername= +sshOptions=() +validatorIpList=() + +loadConfigFile() { + [[ -r $configFile ]] || usage "Config file unreadable: $configFile" + + # shellcheck source=/dev/null + source "$configFile" + [[ -n "$leaderIp" ]] || usage "Config file invalid, leaderIp unspecified: $configFile" + [[ ${#validatorIpList[@]} -gt 0 ]] || usage "Config file invalid, validatorIpList unspecified: $configFile" + [[ -n $sshUsername ]] || usage "Config file invalid, sshUsername unspecified: $configFile" + [[ -n $sshPrivateKey ]] || usage "Config file invalid, sshPrivateKey unspecified: $configFile" + + sshOptions=( + -o "BatchMode=yes" + -o "StrictHostKeyChecking=no" + -o "UserKnownHostsFile=/dev/null" + -o 
"User=$sshUsername" + -o "IdentityFile=$sshPrivateKey" + -o "LogLevel=ERROR" + ) +} diff --git a/net/gce.sh b/net/gce.sh new file mode 100755 index 0000000000..601607f437 --- /dev/null +++ b/net/gce.sh @@ -0,0 +1,172 @@ +#!/bin/bash -e + +here=$(dirname "$0") +# shellcheck source=scripts/gcloud.sh +source "$here"/../scripts/gcloud.sh +# shellcheck source=net/common.sh +source "$here"/common.sh + +prefix=testnet-dev-$(whoami | sed -e s/[^a-z0-9].*//) +validatorNodeCount= +clientNodeCount= + +imageName="ubuntu-16-04-cuda-9-2-new" +internalNetwork=false +zone="us-west1-b" + +usage() { + exitcode=0 + if [[ -n "$1" ]]; then + exitcode=1 + echo "Error: $*" + fi + cat <> "$configFile" + + declare sshPrivateKey="$netConfigDir/id_$prefix" + rm -rf "$sshPrivateKey"{,.pub} + ( + set -x + ssh-keygen -t ecdsa -N '' -f "$sshPrivateKey" + ) + echo "sshPrivateKey=$sshPrivateKey" >> "$configFile" + + recordInstanceIp() { + declare name="$1" + declare publicIp="$3" + declare privateIp="$4" + + declare arrayName="$6" + + if $internalNetwork; then + echo "$arrayName+=($privateIp) # $name" >> "$configFile" + else + echo "$arrayName+=($publicIp) # $name" >> "$configFile" + fi + } + + gcloud_FindInstances "name=$prefix-leader" show + [[ ${#instances[@]} -eq 1 ]] || { + echo "Unable to start leader" + exit 1 + } + gcloud_FigureRemoteUsername "${instances[0]}" + echo "sshUsername=$gcloud_username" >> "$configFile" + gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey" + + echo "leaderIp=()" >> "$configFile" + gcloud_ForEachInstance recordInstanceIp leaderIp + + gcloud_FindInstances "name~^$prefix-validator" show + [[ ${#instances[@]} -gt 0 ]] || { + echo "Unable to start validators" + exit 1 + } + echo "validatorIpList=()" >> "$configFile" + gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey" + gcloud_ForEachInstance recordInstanceIp validatorIpList + + echo "clientIpList=()" >> "$configFile" + gcloud_FindInstances "name~^$prefix-client" show + if [[ ${#instances[@]} 
-gt 0 ]]; then + gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey" + gcloud_ForEachInstance recordInstanceIp clientIpList + fi + + echo "Wrote $configFile" +} + +case $command in +delete) + gcloud_FindInstances "name~^$prefix-" + + if [[ ${#instances[@]} -eq 0 ]]; then + echo "No instances found matching '^$prefix-'" + exit 0 + fi + gcloud_DeleteInstances + ;; + +create) + [[ -n $validatorNodeCount ]] || usage "Need number of nodes" + + gcloud_CreateInstances "$prefix-leader" 1 "$zone" "$imageName" + gcloud_CreateInstances "$prefix-validator" "$validatorNodeCount" "$zone" "$imageName" + if [[ -n $clientNodeCount ]]; then + gcloud_CreateInstances "$prefix-client" "$clientNodeCount" "$zone" "$imageName" + fi + writeConfigFile + ;; + +config) + writeConfigFile + ;; +*) + usage "Unknown command: $command" +esac diff --git a/net/init-metrics.sh b/net/init-metrics.sh new file mode 100755 index 0000000000..f281c36d5a --- /dev/null +++ b/net/init-metrics.sh @@ -0,0 +1,76 @@ +#!/bin/bash -e + +here=$(dirname "$0") +# shellcheck source=net/common.sh +source "$here"/common.sh + +usage() { + exitcode=0 + if [[ -n "$1" ]]; then + exitcode=1 + echo "Error: $*" + fi + cat <> "$configFile" + +exit 0 diff --git a/net/net.sh b/net/net.sh new file mode 100755 index 0000000000..4c73fb84c5 --- /dev/null +++ b/net/net.sh @@ -0,0 +1,197 @@ +#!/bin/bash -e + +here=$(dirname "$0") +SOLANA_ROOT="$(cd "$here"/..; pwd)" + +# shellcheck source=net/common.sh +source "$here"/common.sh + +usage() { + exitcode=0 + if [[ -n "$1" ]]; then + exitcode=1 + echo "Error: $*" + fi + cat <> "$logFile" +} + +startLeader() { + declare ipAddress=$1 + declare logFile="$2" + echo "****************" + echo "Starting leader: $leaderIp" + + common_start_setup "$ipAddress" "$logFile" + + ( + set -x + rsync -vPrz -e "ssh ${sshOptions[*]}" ~/.cargo/bin/solana* "$ipAddress":~/.cargo/bin/ + ssh "${sshOptions[@]}" -f "$ipAddress" \ + "./solana/net/remote/remote_leader.sh" + ) >> "$logFile" +} + 
#######################################
# Prepare one remote node and launch a validator on it.
# Globals:
#   leaderIp   (read) - passed to the remote validator script
#   sshOptions (read) - ssh option array used for the remote login
# Arguments:
#   $1 - IP address of the validator node
#   $2 - local log file; stdout of the remote launch is appended to it
#######################################
startValidator() {
  declare ipAddress=$1
  declare logFile="$2"
  echo "*******************"
  # Report the node actually being started; the leader address is only
  # forwarded to the remote script below.
  echo "Starting validator: $ipAddress"
  common_start_setup "$ipAddress" "$logFile"

  (
    set -x
    # -f: ssh puts itself in the background just before running the command.
    ssh "${sshOptions[@]}" -f "$ipAddress" \
      "./solana/net/remote/remote_validator.sh $leaderIp"
  ) >> "$logFile"
}

#######################################
# Prepare one remote node and launch a client on it.
# Globals:
#   leaderIp   (read) - passed to the remote client script
#   sshOptions (read) - ssh option array used for the remote login
# Arguments:
#   $1 - IP address of the client node
#   $2 - local log file; stdout of the remote launch is appended to it
#######################################
startClient() {
  declare ipAddress=$1
  declare logFile="$2"
  echo "****************"
  echo "Starting client: $ipAddress"
  common_start_setup "$ipAddress" "$logFile"

  ssh "${sshOptions[@]}" -f "$ipAddress" \
    "./solana/net/remote/remote_client.sh $leaderIp" >> "$logFile"
}

#######################################
# Deploy the whole network: the leader first, then all validators in
# parallel, then the clients one after another, and print a timing summary.
# Globals:
#   leaderIp, validatorIpList, clientIpList, netLogDir (read)
#   SECONDS (reset between phases to time each one)
#   leaderDeployTime, validatorDeployTime, clientDeployTime (written)
# Returns:
#   non-zero if any background validator deployment failed, or if listing
#   the log directory failed; 0 otherwise.
#######################################
start() {
  declare ipAddress pid
  declare -a validatorPids=()
  declare failedValidators=0

  echo "Deployment started at $(date)"
  SECONDS=0

  startLeader "$leaderIp" "$netLogDir/leader-$leaderIp.log"
  leaderDeployTime=$SECONDS
  SECONDS=0

  for ipAddress in "${validatorIpList[@]}"; do
    startValidator "$ipAddress" "$netLogDir/validator-$ipAddress.log" &
    validatorPids+=("$!")
  done

  # A bare `wait` discards the exit status of background jobs, so a validator
  # that failed to deploy would go unnoticed.  Reap each job individually.
  for pid in "${validatorPids[@]}"; do
    wait "$pid" || failedValidators=$((failedValidators + 1))
  done
  validatorDeployTime=$SECONDS
  SECONDS=0

  # Clients are started in the foreground, so there is nothing to wait for
  # once this loop finishes.
  for ipAddress in "${clientIpList[@]}"; do
    startClient "$ipAddress" "$netLogDir/client-$ipAddress.log"
  done
  clientDeployTime=$SECONDS

  echo
  echo "================================================================="
  echo "Deployment finished at $(date)"
  echo "Leader deployment took $leaderDeployTime seconds"
  echo "Validator deployment (${#validatorIpList[@]} instances) took $validatorDeployTime seconds"
  echo "Client deployment (${#clientIpList[@]} instances) took $clientDeployTime seconds"
  echo "Logs in $netLogDir:"
  ls -l "$netLogDir" || return

  if ((failedValidators > 0)); then
    echo "Error: $failedValidators validator deployment(s) failed" >&2
    return 1
  fi
}

#######################################
# Kill the solana-related processes on one remote node.
# Failures (node unreachable, nothing to kill) are deliberately ignored so
# that stopping is best-effort.
# Globals:
#   sshOptions (read)
# Arguments:
#   $1 - IP address of the node
#######################################
stop_node() {
  local ipAddress=$1
  echo "**************"
  echo "Stopping node: $ipAddress"
  (
    set -x
    ssh "${sshOptions[@]}" "$ipAddress" "
      set -x;
      pkill -9 solana-;
      pkill -9 validator;
      pkill -9 leader;
    "
  ) || true
}

#######################################
# Stop the leader, then every validator and client node, and report how long
# it took.
# Globals:
#   leaderIp, validatorIpList, clientIpList (read)
#   SECONDS (reset to time the operation)
#######################################
stop() {
  declare ipAddress
  SECONDS=0

  stop_node "$leaderIp"

  for ipAddress in "${validatorIpList[@]}" "${clientIpList[@]}"; do
    stop_node "$ipAddress"
  done

  echo "Stopping nodes took $SECONDS seconds"
}
+mkdir -p log + +if [[ $command == "start" ]]; then + build + stop + start +elif [[ $command == "stop" ]]; then + stop +else + usage "Unknown command: $command" +fi diff --git a/net/remote/README.md b/net/remote/README.md new file mode 100644 index 0000000000..9e0aae3cbb --- /dev/null +++ b/net/remote/README.md @@ -0,0 +1 @@ +Scripts that run on the remote testnet nodes diff --git a/net/remote/remote_client.sh b/net/remote/remote_client.sh new file mode 100755 index 0000000000..9fa60e37cf --- /dev/null +++ b/net/remote/remote_client.sh @@ -0,0 +1,15 @@ +#!/bin/bash -e + +[[ -n $1 ]] || exit + +cd "$(dirname "$0")"/../.. +source net/common.sh +loadConfigFile + +PATH="$HOME"/.cargo/bin:"$PATH" +rsync -vPrz "$1":~/.cargo/bin/solana* ~/.cargo/bin/ + +numNodes=1 # TODO: Pass this in +export USE_INSTALL=1 + +multinode-demo/client.sh "$1":~/solana $numNodes --loop -s 600 --sustained >client.log 2>&1 & diff --git a/net/remote/remote_leader.sh b/net/remote/remote_leader.sh new file mode 100755 index 0000000000..79e66110b1 --- /dev/null +++ b/net/remote/remote_leader.sh @@ -0,0 +1,15 @@ +#!/bin/bash -e + +cd "$(dirname "$0")"/../.. +source net/common.sh +loadConfigFile + +PATH="$HOME"/.cargo/bin:"$PATH" + +export USE_INSTALL=1 +export SOLANA_CUDA=1 + +./fetch-perf-libs.sh +./multinode-demo/setup.sh +./multinode-demo/drone.sh >drone.log 2>&1 & +./multinode-demo/leader.sh >leader.log 2>&1 & diff --git a/net/remote/remote_validator.sh b/net/remote/remote_validator.sh new file mode 100755 index 0000000000..7f500ee317 --- /dev/null +++ b/net/remote/remote_validator.sh @@ -0,0 +1,15 @@ +#!/bin/bash -e + +[[ -n $1 ]] || exit + +cd "$(dirname "$0")"/../.. 
+source net/common.sh +loadConfigFile + +PATH="$HOME"/.cargo/bin:"$PATH" + +rsync -vPrz "$1":~/.cargo/bin/solana* ~/.cargo/bin/ + +export USE_INSTALL=1 +./multinode-demo/setup.sh +./multinode-demo/validator.sh "$1":~/solana "$1" >validator.log 2>&1 & diff --git a/net/ssh.sh b/net/ssh.sh new file mode 100755 index 0000000000..c5c7fede58 --- /dev/null +++ b/net/ssh.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +here=$(dirname "$0") +# shellcheck source=net/common.sh +source "$here"/common.sh + +usage() { + exitcode=0 + if [[ -n "$1" ]]; then + exitcode=1 + echo "Error: $*" + fi + cat < .ssh/authorized_keys; + echo \" + Host * + BatchMode yes + IdentityFile ~/.ssh/id_testnet + StrictHostKeyChecking no + \" > .ssh/config; + " + #gcloud compute scp --zone "$zone" "$publicKey" "$name":.ssh/authorized_keys + scp \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + -i "$privateKey" \ + "$privateKey" "$username@$publicIp:.ssh/id_testnet" ) done }