From 82fea9ce735844b8b4a4264a2205201f875821c7 Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Mon, 14 Oct 2019 10:33:32 -0600 Subject: [PATCH] net.sh: Add support for selecting validator GPU mode (#6326) automerge --- multinode-demo/bootstrap-leader.sh | 5 +++++ multinode-demo/validator.sh | 5 +++++ net/net.sh | 22 +++++++++++++++++++++- net/remote/remote-node.sh | 28 +++++++++++++++++++++++++++- 4 files changed, 58 insertions(+), 2 deletions(-) diff --git a/multinode-demo/bootstrap-leader.sh b/multinode-demo/bootstrap-leader.sh index 1410bdde7..dbbdcbf54 100755 --- a/multinode-demo/bootstrap-leader.sh +++ b/multinode-demo/bootstrap-leader.sh @@ -8,6 +8,11 @@ here=$(dirname "$0") # shellcheck source=multinode-demo/common.sh source "$here"/common.sh +if [[ "$SOLANA_GPU_MISSING" -eq 1 ]]; then + echo "Testnet requires GPUs, but none were found! Aborting..." + exit 1 +fi + if [[ -n $SOLANA_CUDA ]]; then program=$solana_validator_cuda else diff --git a/multinode-demo/validator.sh b/multinode-demo/validator.sh index 95cbe59dc..f06e07ac9 100755 --- a/multinode-demo/validator.sh +++ b/multinode-demo/validator.sh @@ -152,6 +152,11 @@ while [[ -n $1 ]]; do fi done +if [[ "$SOLANA_GPU_MISSING" -eq 1 ]]; then + echo "Testnet requires GPUs, but none were found! Aborting..." + exit 1 +fi + if [[ ${#positional_args[@]} -gt 1 ]]; then usage "$@" fi diff --git a/net/net.sh b/net/net.sh index a360ee822..79484d3e5 100755 --- a/net/net.sh +++ b/net/net.sh @@ -49,7 +49,13 @@ Operate a configured testnet This will start 2 bench-tps clients, and supply "--tx_count 25000" to the bench-tps client. -n NUM_FULL_NODES - Number of fullnodes to apply command to. - + --gpu-mode GPU_MODE - Specify GPU mode to launch validators with (default: $gpuMode). + MODE must be one of + on - GPU *required*, any vendor * + off - No GPU, CPU-only + auto - Use GPU if available, any vendor * + cuda - GPU *required*, Nvidia CUDA only + * Currently, Nvidia CUDA is the only supported GPU vendor --hashes-per-tick NUM_HASHES|sleep|auto - Override the default --hashes-per-tick for the cluster --no-airdrop @@ -130,6 +136,7 @@ maybeSkipLedgerVerify="" maybeDisableAirdrops="" debugBuild=false doBuild=true +gpuMode=auto command=$1 [[ -n $command ]] || usage @@ -187,6 +194,17 @@ while [[ -n $1 ]]; do elif [[ $1 = --debug ]]; then debugBuild=true shift 1 + elif [[ $1 = --gpu-mode ]]; then + gpuMode=$2 + case "$gpuMode" in + on|off|auto|cuda) + ;; + *) + echo "Unexpected GPU mode: \"$gpuMode\"" + exit 1 + ;; + esac + shift 2 else usage "Unknown long option: $1" fi @@ -424,6 +442,7 @@ startBootstrapLeader() { $numBenchExchangeClients \"$benchExchangeExtraArgs\" \ \"$genesisOptions\" \ \"$maybeNoSnapshot $maybeSkipLedgerVerify $maybeLimitLedgerSize\" \ + \"$gpuMode\" \ " ) >> "$logFile" 2>&1 || { cat "$logFile" @@ -488,6 +507,7 @@ startNode() { $numBenchExchangeClients \"$benchExchangeExtraArgs\" \ \"$genesisOptions\" \ \"$maybeNoSnapshot $maybeSkipLedgerVerify $maybeLimitLedgerSize\" \ + \"$gpuMode\" \ " ) >> "$logFile" 2>&1 & declare pid=$! diff --git a/net/remote/remote-node.sh b/net/remote/remote-node.sh index bf33f6f22..53d96923d 100755 --- a/net/remote/remote-node.sh +++ b/net/remote/remote-node.sh @@ -24,6 +24,7 @@ numBenchExchangeClients="${15}" benchExchangeExtraArgs="${16}" genesisOptions="${17}" extraNodeArgs="${18}" +gpuMode="${19:-auto}" set +x # Use a very large stake (relative to the default multinode-demo/ stake of 42) @@ -75,6 +76,28 @@ EOF chmod +x ~/solana/on-reboot echo "@reboot ~/solana/on-reboot" | crontab - +GPU_CUDA_OK=false +GPU_FAIL_IF_NONE=false +case "$gpuMode" in + on) # GPU *required*, any vendor + GPU_CUDA_OK=true + GPU_FAIL_IF_NONE=true + ;; + off) # CPU-only + ;; + auto) # Use GPU if installed, any vendor + GPU_CUDA_OK=true + ;; + cuda) # GPU *required*, CUDA-only + GPU_CUDA_OK=true + GPU_FAIL_IF_NONE=true + ;; + *) + echo "Unexpected gpuMode: \"$gpuMode\"" + exit 1 + ;; +esac + waitForNodeToInit() { echo "--- waiting for node to boot up" SECONDS= @@ -113,9 +136,12 @@ cat >> ~/solana/on-reboot < net-stats.log 2>&1 & echo \$! > net-stats.pid - if [[ -e /dev/nvidia0 ]]; then + if ${GPU_CUDA_OK} && [[ -e /dev/nvidia0 ]]; then echo Selecting solana-validator-cuda export SOLANA_CUDA=1 + elif ${GPU_FAIL_IF_NONE} ; then + echo "Expected GPU, found none!" + export SOLANA_GPU_MISSING=1 fi EOF