diff --git a/net/common.sh b/net/common.sh index 83621d24c..96423e0a0 100644 --- a/net/common.sh +++ b/net/common.sh @@ -11,25 +11,20 @@ netConfigDir="$(dirname "${BASH_SOURCE[0]}")"/config netLogDir="$(dirname "${BASH_SOURCE[0]}")"/log mkdir -p "$netConfigDir" "$netLogDir" +# shellcheck source=scripts/configure-metrics.sh +source "$(dirname "${BASH_SOURCE[0]}")"/../scripts/configure-metrics.sh + configFile="$netConfigDir/config" clientIpList=() leaderIp= +netBasename= +sshOptions=() sshPrivateKey= sshUsername= -sshOptions=() validatorIpList=() -loadConfigFile() { - [[ -r $configFile ]] || usage "Config file unreadable: $configFile" - - # shellcheck source=/dev/null - source "$configFile" - [[ -n "$leaderIp" ]] || usage "Config file invalid, leaderIp unspecified: $configFile" - [[ ${#validatorIpList[@]} -gt 0 ]] || usage "Config file invalid, validatorIpList unspecified: $configFile" - [[ -n $sshUsername ]] || usage "Config file invalid, sshUsername unspecified: $configFile" - [[ -n $sshPrivateKey ]] || usage "Config file invalid, sshPrivateKey unspecified: $configFile" - +buildSshOptions() { sshOptions=( -o "BatchMode=yes" -o "StrictHostKeyChecking=no" @@ -39,3 +34,18 @@ loadConfigFile() { -o "LogLevel=ERROR" ) } + +loadConfigFile() { + [[ -r $configFile ]] || usage "Config file unreadable: $configFile" + + # shellcheck source=/dev/null + source "$configFile" + [[ -n "$netBasename" ]] || usage "Config file invalid, netBasename unspecified: $configFile" + [[ -n "$leaderIp" ]] || usage "Config file invalid, leaderIp unspecified: $configFile" + [[ ${#validatorIpList[@]} -gt 0 ]] || usage "Config file invalid, validatorIpList unspecified: $configFile" + [[ -n $sshUsername ]] || usage "Config file invalid, sshUsername unspecified: $configFile" + [[ -n $sshPrivateKey ]] || usage "Config file invalid, sshPrivateKey unspecified: $configFile" + + buildSshOptions + configureMetrics +} diff --git a/net/gce.sh b/net/gce.sh index dd7d13639..10f997543 100755 --- a/net/gce.sh +++ b/net/gce.sh @@ -29,7 +29,7 @@ usage() { cat <> "$configFile" + echo "netBasename=$prefix" >> "$configFile" + declare sshPrivateKey="$netConfigDir/id_$prefix" rm -rf "$sshPrivateKey"{,.pub} ( @@ -117,17 +119,36 @@ writeConfigFile() { fi } + prepareInstance() { + declare name="$1" + declare publicIp="$3" + + # TODO: Make the following a requirement of $imageName + # instead of a manual install + ssh "${sshOptions[@]}" "$publicIp" " + set -ex; + sudo systemctl disable apt-daily.service # disable run when system boot + sudo systemctl disable apt-daily.timer # disable timer run + sudo apt-get --assume-yes install rsync libssl-dev; + mkdir -p ~/solana ~/.cargo/bin; + " + } + gcloud_FindInstances "name=$prefix-leader" show [[ ${#instances[@]} -eq 1 ]] || { echo "Unable to start leader" exit 1 } gcloud_FigureRemoteUsername "${instances[0]}" - echo "sshUsername=$gcloud_username" >> "$configFile" + sshUsername=$gcloud_username + echo "sshUsername=$sshUsername" >> "$configFile" + buildSshOptions + gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey" echo "leaderIp=()" >> "$configFile" gcloud_ForEachInstance recordInstanceIp leaderIp + gcloud_ForEachInstance prepareInstance gcloud_FindInstances "name~^$prefix-validator" show [[ ${#instances[@]} -gt 0 ]] || { @@ -137,12 +158,14 @@ writeConfigFile() { echo "validatorIpList=()" >> "$configFile" gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey" gcloud_ForEachInstance recordInstanceIp validatorIpList + gcloud_ForEachInstance prepareInstance echo "clientIpList=()" >> "$configFile" gcloud_FindInstances "name~^$prefix-client" show if [[ ${#instances[@]} -gt 0 ]]; then gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey" gcloud_ForEachInstance recordInstanceIp clientIpList + gcloud_ForEachInstance prepareInstance fi echo "Wrote $configFile" @@ -177,11 +200,11 @@ create) "$zone" "$imageName" "$clientMachineType" "$clientAccelerator" fi - writeConfigFile + prepareInstancesAndWriteConfigFile ;; config) - writeConfigFile + prepareInstancesAndWriteConfigFile ;; *) usage "Unknown command: $command" diff --git a/net/init-metrics.sh b/net/init-metrics.sh index f281c36d5..04da893de 100755 --- a/net/init-metrics.sh +++ b/net/init-metrics.sh @@ -11,20 +11,18 @@ usage() { echo "Error: $*" fi cat <> "$configFile" exit 0 diff --git a/net/net.sh b/net/net.sh index eb758b5e8..3f5d8d37b 100755 --- a/net/net.sh +++ b/net/net.sh @@ -15,23 +15,60 @@ usage() { cat <> "$logFile" } @@ -106,7 +147,7 @@ startValidator() { ( set -x ssh "${sshOptions[@]}" -f "$ipAddress" \ - "./solana/net/remote/remote_validator.sh $leaderIp" + "./solana/net/remote/remote_node.sh $deployMethod validator $leaderIp \"$nodeSetupArgs\" \"$RUST_LOG\"" ) >> "$logFile" } @@ -117,35 +158,67 @@ startClient() { echo "Starting client: $leaderIp" common_start_setup "$ipAddress" "$logFile" + declare expectedNodeCount=$((${#validatorIpList[@]} + 1)) + ssh "${sshOptions[@]}" -f "$ipAddress" \ - "./solana/net/remote/remote_client.sh $leaderIp" >> "$logFile" + "./solana/net/remote/remote_client.sh $deployMethod $leaderIp $expectedNodeCount \"$RUST_LOG\"" >> "$logFile" } start() { - echo "Deployment started at $(date)" - SECONDS=0 - leaderDeployTime= + [[ $command = "start" ]] || return + case $deployMethod in + snap) + if [[ -n $snapChannel ]]; then + if [[ $(uname) != Linux ]]; then + echo Error: snap channel deployment only supported in Linux + exit 1 + fi + usage "TODO: the snap download command below is probably wrong..." + snap download --"$snapChannel" solana + snapFilename=solana.snap + fi + ;; + local) + build + ;; + *) + usage "Internal error: invalid deployMethod: $deployMethod" + ;; + esac + + echo "Deployment started at $(date)" + + SECONDS=0 + declare leaderDeployTime= + declare networkVersion=unknown startLeader "$leaderIp" "$netLogDir/leader-$leaderIp.log" leaderDeployTime=$SECONDS - SECONDS=0 + SECONDS=0 for ipAddress in "${validatorIpList[@]}"; do startValidator "$ipAddress" "$netLogDir/validator-$ipAddress.log" & done - wait validatorDeployTime=$SECONDS - SECONDS=0 + SECONDS=0 for ipAddress in "${clientIpList[@]}"; do startClient "$ipAddress" "$netLogDir/client-$ipAddress.log" done - clientDeployTime=$SECONDS - SECONDS=0 wait + if [[ $deployMethod = "snap" ]]; then + IFS=\ read -r _ networkVersion _ < <( + ssh "${sshOptions[@]}" "$leaderIp" \ + "snap info solana | grep \"^installed:\"" + ) + networkVersion=${networkVersion/0+git./} + fi + + $metricsWriteDatapoint "testnet-deploy,name=$netBasename start=1,version=\"$networkVersion\"" + echo echo "=================================================================" echo "Deployment finished at $(date)" @@ -165,7 +238,11 @@ stop_node() { set -x ssh "${sshOptions[@]}" "$ipAddress" " set -x; - pkill -9 solana- remote_ oom-monitor + if snap list solana; then + sudo snap set solana mode=; + sudo snap remove solana; + fi; \ + pkill -9 solana- remote_ oom-monitor; " ) || true } @@ -173,6 +250,8 @@ stop_node() { stop() { SECONDS=0 + $metricsWriteDatapoint "testnet-deploy,name=$netBasename stop=1" + stop_node "$leaderIp" for ipAddress in "${validatorIpList[@]}" "${clientIpList[@]}"; do @@ -182,14 +261,6 @@ stop() { echo "Stopping nodes took $SECONDS seconds" } -mkdir -p log +stop +start -if [[ $command == "start" ]]; then - build - stop - start -elif [[ $command == "stop" ]]; then - stop -else - usage "Unknown command: $command" -fi diff --git a/net/remote/remote_client.sh b/net/remote/remote_client.sh index 114f10b73..5a7e4c5ee 100755 --- a/net/remote/remote_client.sh +++ b/net/remote/remote_client.sh @@ -1,18 +1,56 @@ #!/bin/bash -e -[[ -n $1 ]] || exit +deployMethod="$1" +netEntrypoint="$2" +numNodes="$3" +RUST_LOG="$4" +[[ -n $deployMethod ]] || exit +[[ -n $netEntrypoint ]] || exit +[[ -n $numNodes ]] || exit cd "$(dirname "$0")"/../.. source net/common.sh loadConfigFile -PATH="$HOME"/.cargo/bin:"$PATH" -rsync -vPrz "$1":~/.cargo/bin/solana* ~/.cargo/bin/ - -numNodes=1 # TODO: Pass this in +threadCount=$(nproc) +if [[ $threadCount -gt 4 ]]; then + threadCount=4 +fi ./script/install-earlyoom.sh + +case $deployMethod in +snap) + sudo snap install solana.snap --devmode --dangerous + rm solana.snap + + sudo snap set solana metrics-config="$SOLANA_METRICS_CONFIG" rust-log="$RUST_LOG" + solana_bench_tps=/snap/bin/solana.bench-tps + ;; +local) + PATH="$HOME"/.cargo/bin:"$PATH" + export USE_INSTALL=1 + export RUST_LOG + + rsync -vPrz "$netEntrypoint:~/.cargo/bin/solana*" ~/.cargo/bin/ + solana_bench_tps=multinode-demo/client.sh + netEntrypoint="$:~/solana" + ;; +*) + echo "Unknown deployment method: $deployMethod" + exit 1 +esac + ./scripts/oom-monitor.sh > oom-monitor.log 2>&1 & -export USE_INSTALL=1 -multinode-demo/client.sh "$1":~/solana $numNodes --loop -s 600 --sustained > client.log 2>&1 & +while true; do + echo "=== Client start: $(date)" >> client.log + clientCommand="$solana_bench_tps $netEntrypoint $numNodes --loop -s 600 --sustained -t threadCount" + echo "$ $clientCommand" >> client.log + + $clientCommand >> client.log 2>&1 + + $metricsWriteDatapoint "testnet-deploy,name=$netBasename clientexit=1" + echo Error: bench-tps should never exit | tee -a client.log +done + diff --git a/net/remote/remote_leader.sh b/net/remote/remote_leader.sh deleted file mode 100755 index 077fd7c41..000000000 --- a/net/remote/remote_leader.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -e - -cd "$(dirname "$0")"/../.. -source net/common.sh -loadConfigFile - -PATH="$HOME"/.cargo/bin:"$PATH" - -./fetch-perf-libs.sh - -./script/install-earlyoom.sh -./scripts/oom-monitor.sh > oom-monitor.log 2>&1 & - -export USE_INSTALL=1 -export SOLANA_CUDA=1 -./multinode-demo/setup.sh -./multinode-demo/drone.sh > drone.log 2>&1 & -./multinode-demo/leader.sh > leader.log 2>&1 & diff --git a/net/remote/remote_node.sh b/net/remote/remote_node.sh new file mode 100755 index 000000000..4a5096109 --- /dev/null +++ b/net/remote/remote_node.sh @@ -0,0 +1,83 @@ +#!/bin/bash -e + +deployMethod="$1" +nodeType="$2" +netEntrypoint="$3" +setupArgs="$4" +RUST_LOG="$5" + +[[ -n $deployMethod ]] || exit +[[ -n $nodeType ]] || exit +[[ -n $netEntrypoint ]] || exit + +cd "$(dirname "$0")"/../.. +source net/common.sh +loadConfigFile + +./script/install-earlyoom.sh + +case $deployMethod in +snap) + SECONDS=0 + sudo snap install solana.snap --devmode --dangerous + rm solana.snap + + commonNodeConfig="\ + rust-log=$RUST_LOG \ + metrics-config=$SOLANA_METRICS_CONFIG \ + setup-args=$setupArgs \ + enable-cuda=1 \ + " + if [[ $nodeType = leader ]]; then + nodeConfig="mode=leader+drone $commonNodeConfig" + else + nodeConfig="mode=validator leader-address=$netEntrypoint $commonNodeConfig" + fi + + logmarker="solana deploy $(date)/$RANDOM" + logger "$logmarker" + + # shellcheck disable=SC2086 # Don't want to double quote "$nodeConfig" + sudo snap set solana $nodeConfig + snap info solana + sudo snap get solana + echo Slight delay to get more syslog output + sleep 2 + sudo grep -Pzo "$logmarker(.|\\n)*" /var/log/syslog + + echo "Succeeded in ${SECONDS} seconds" + ;; +local) + PATH="$HOME"/.cargo/bin:"$PATH" + export USE_INSTALL=1 + export SOLANA_CUDA=1 + export RUST_LOG=1 + + ./fetch-perf-libs.sh + ./scripts/oom-monitor.sh > oom-monitor.log 2>&1 & + + case $nodeType in + leader) + # shellcheck disable=SC2086 # Don't want to double quote "$setupArgs" + ./multinode-demo/setup.sh -t leader -p $setupArgs + ./multinode-demo/drone.sh > drone.log 2>&1 & + ./multinode-demo/leader.sh > leader.log 2>&1 & + ;; + validator) + rsync -vPrz "$netEntrypoint:~/.cargo/bin/solana*" ~/.cargo/bin/ + + # shellcheck disable=SC2086 # Don't want to double quote "$setupArgs" + ./multinode-demo/setup.sh -t validator -p $setupArgs + ./multinode-demo/validator.sh "$netEntrypoint":~/solana "$netEntrypoint" >validator.log 2>&1 & + ;; + *) + echo "Error: unknown node type: $nodeType" + exit 1 + ;; + esac + ;; +*) + echo "Unknown deployment method: $deployMethod" + exit 1 +esac + diff --git a/net/remote/remote_validator.sh b/net/remote/remote_validator.sh deleted file mode 100755 index fd4afd9cf..000000000 --- a/net/remote/remote_validator.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -e - -[[ -n $1 ]] || exit - -cd "$(dirname "$0")"/../.. -source net/common.sh -loadConfigFile - -PATH="$HOME"/.cargo/bin:"$PATH" - -rsync -vPrz "$1":~/.cargo/bin/solana* ~/.cargo/bin/ - -./script/install-earlyoom.sh -./scripts/oom-monitor.sh > oom-monitor.log 2>&1 & - -export USE_INSTALL=1 -./multinode-demo/setup.sh -./multinode-demo/validator.sh "$1":~/solana "$1" >validator.log 2>&1 & diff --git a/scripts/configure-metrics.sh b/scripts/configure-metrics.sh index 95b6e803d..5186bccfd 100644 --- a/scripts/configure-metrics.sh +++ b/scripts/configure-metrics.sh @@ -6,12 +6,18 @@ # Example: # export SOLANA_METRICS_CONFIG="host=,db=,u=,p=" # -configure_metrics() { +# The following directive disable complaints about unused variables in this +# file: +# shellcheck disable=2034 +# +metricsWriteDatapoint="$(dirname "${BASH_SOURCE[0]}")"/metrics-write-datapoint.sh + +configureMetrics() { [[ -n $SOLANA_METRICS_CONFIG ]] || return 0 - declare metrics_params - IFS=',' read -r -a metrics_params <<< "$SOLANA_METRICS_CONFIG" - for param in "${metrics_params[@]}"; do + declare metricsParams + IFS=',' read -r -a metricsParams <<< "$SOLANA_METRICS_CONFIG" + for param in "${metricsParams[@]}"; do IFS='=' read -r -a pair <<< "$param" if [[ ${#pair[@]} != 2 ]]; then echo Error: invalid metrics parameter: "$param" >&2 @@ -42,4 +48,4 @@ configure_metrics() { fi done } -configure_metrics +configureMetrics