From dadcb632d83080b462e895a3a7c550c427c18f6c Mon Sep 17 00:00:00 2001
From: Dan Albert <dan@solana.com>
Date: Thu, 24 Oct 2019 15:12:25 -0600
Subject: [PATCH] Specify machine type without necessarily enabling GPU (#6529)

* Specify machine type without necessarily enabling GPU

* Make long arg, extend --enable-gpu to automation

* Set machine types only in one place

* Fixup

* Fixup flag in automation

* Typo

* shellcheck
---
 net/gce.sh                                    | 48 +++++++++++--------
 .../testnet-performance/colo-gpu-perf.yml     |  1 +
 ...perf.yml => gce-cpu-only-perf-10-node.yml} |  1 +
 .../gce-cpu-only-perf-5-node.yml              |  2 +
 .../gce-gpu-perf-5-node.yml                   |  1 +
 ...-gpu-perf.yml => gce-gpu-perf-50-node.yml} |  3 +-
 .../testnet-performance/testnet-automation.sh | 16 +++++--
 7 files changed, 49 insertions(+), 23 deletions(-)
 rename system-test/testnet-performance/{gce-cpu-only-perf.yml => gce-cpu-only-perf-10-node.yml} (96%)
 rename system-test/testnet-performance/{gce-gpu-perf.yml => gce-gpu-perf-50-node.yml} (88%)

diff --git a/net/gce.sh b/net/gce.sh
index 7ca2f4066..8ff8b378f 100755
--- a/net/gce.sh
+++ b/net/gce.sh
@@ -14,8 +14,6 @@ gce)
 
   cpuBootstrapLeaderMachineType="--custom-cpu 12 --custom-memory 32GB --min-cpu-platform Intel%20Skylake"
   gpuBootstrapLeaderMachineType="$cpuBootstrapLeaderMachineType --accelerator count=1,type=nvidia-tesla-p100"
-  bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType
-  validatorMachineType=$cpuBootstrapLeaderMachineType
   clientMachineType="--custom-cpu 16 --custom-memory 20GB"
   blockstreamerMachineType="--machine-type n1-standard-8"
   archiverMachineType="--custom-cpu 4 --custom-memory 16GB"
@@ -30,8 +28,6 @@ ec2)
   #       AVX-512 support.  The default, p2.xlarge, does not support
   #       AVX-512
   gpuBootstrapLeaderMachineType=p2.xlarge
-  bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType
-  validatorMachineType=$cpuBootstrapLeaderMachineType
   clientMachineType=c5.2xlarge
   blockstreamerMachineType=c5.2xlarge
   archiverMachineType=c5.xlarge
@@ -43,8 +39,6 @@ azure)
   # TODO: Dial in machine types for Azure
   cpuBootstrapLeaderMachineType=Standard_D16s_v3
   gpuBootstrapLeaderMachineType=Standard_NC12
-  bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType
-  validatorMachineType=$cpuBootstrapLeaderMachineType
   clientMachineType=Standard_D16s_v3
   blockstreamerMachineType=Standard_D16s_v3
   archiverMachineType=Standard_D4s_v3
@@ -55,8 +49,6 @@ colo)
 
   cpuBootstrapLeaderMachineType=0
   gpuBootstrapLeaderMachineType=1
-  bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType
-  validatorMachineType=$cpuBootstrapLeaderMachineType
   clientMachineType=0
   blockstreamerMachineType=0
   archiverMachineType=0
@@ -84,6 +76,7 @@ evalInfo=false
 publicNetwork=false
 letsEncryptDomainName=
 enableGpu=false
+customMachineType=
 customAddress=
 zones=()
 
@@ -131,8 +124,9 @@ Manage testnet instances
    -r [number]      - Number of archiver nodes (default: $archiverNodeCount)
    -u               - Include a Blockstreamer (default: $blockstreamer)
    -P               - Use public network IP addresses (default: $publicNetwork)
-   -g               - Enable GPU (default: $enableGpu)
-   -G               - Enable GPU, and set count/type of GPUs to use
+   -g               - Enable GPU and automatically set validator machine types to $gpuBootstrapLeaderMachineType
+                      (default: $enableGpu)
+   -G               - Enable GPU, and set custom GPU machine type to use
                       (e.g $gpuBootstrapLeaderMachineType)
    -a [address]     - Address to be be assigned to the Blockstreamer if present,
                       otherwise the bootstrap validator.
@@ -141,9 +135,14 @@ Manage testnet instances
                       * For EC2, [address] is the "allocation ID" of the desired
                         Elastic IP.
    -d [disk-type]   - Specify a boot disk type (default None) Use pd-ssd to get ssd on GCE.
-   --letsencrypt [dns name] - Attempt to generate a TLS certificate using this
-                              DNS name (useful only when the -a and -P options
-                              are also provided)
+   --letsencrypt [dns name]
+                    - Attempt to generate a TLS certificate using this
+                      DNS name (useful only when the -a and -P options
+                      are also provided)
+   --custom-machine-type
+                    - Set a custom machine type without assuming whether or not
+                      GPU is enabled.  Set this explicitly with --enable-gpu/-g to call out the presence of GPUs.
+   --enable-gpu     - Use with --custom-machine-type to specify whether or not GPUs should be used/enabled
    --validator-additional-disk-size-gb [number]
                     - Add an additional [number] GB SSD to all validators to store the config directory.
                       If not set, config will be written to the boot disk by default.
@@ -195,6 +194,12 @@ while [[ -n $1 ]]; do
     elif [[ $1 == --eval ]]; then
       evalInfo=true
       shift
+    elif [[ $1 == --enable-gpu ]]; then
+      enableGpu=true
+      shift
+    elif [[ $1 = --custom-machine-type ]]; then
+      customMachineType="$2"
+      shift 2
     else
       usage "Unknown long option: $1"
     fi
@@ -230,15 +235,10 @@ while getopts "h?p:Pn:c:r:z:gG:a:d:uxf" opt "${shortArgs[@]}"; do
     ;;
   g)
     enableGpu=true
-    bootstrapLeaderMachineType=$gpuBootstrapLeaderMachineType
-    validatorMachineType=$bootstrapLeaderMachineType
-    blockstreamerMachineType=$bootstrapLeaderMachineType
     ;;
   G)
     enableGpu=true
-    bootstrapLeaderMachineType="$OPTARG"
-    validatorMachineType=$bootstrapLeaderMachineType
-    blockstreamerMachineType=$bootstrapLeaderMachineType
+    customMachineType="$OPTARG"
     ;;
   a)
     customAddress=$OPTARG
@@ -258,6 +258,16 @@ while getopts "h?p:Pn:c:r:z:gG:a:d:uxf" opt "${shortArgs[@]}"; do
   esac
 done
 
+if [[ -n "$customMachineType" ]] ; then
+  bootstrapLeaderMachineType="$customMachineType"
+elif [[ "$enableGpu" = "true" ]] ; then
+  bootstrapLeaderMachineType="$gpuBootstrapLeaderMachineType"
+else
+  bootstrapLeaderMachineType="$cpuBootstrapLeaderMachineType"
+fi
+validatorMachineType=$bootstrapLeaderMachineType
+blockstreamerMachineType=$bootstrapLeaderMachineType
+
 [[ ${#zones[@]} -gt 0 ]] || zones+=("$(cloud_DefaultZone)")
 
 [[ -z $1 ]] || usage "Unexpected argument: $1"
diff --git a/system-test/testnet-performance/colo-gpu-perf.yml b/system-test/testnet-performance/colo-gpu-perf.yml
index 48f763561..851a1e355 100755
--- a/system-test/testnet-performance/colo-gpu-perf.yml
+++ b/system-test/testnet-performance/colo-gpu-perf.yml
@@ -5,6 +5,7 @@ steps:
       UPLOAD_RESULTS_TO_SLACK: "true"
       CLOUD_PROVIDER: "colo"
       TESTNET_TAG: "colo-edge-perf-gpu-enabled"
+      ENABLE_GPU: "true"
       RAMP_UP_TIME: 0
       TEST_DURATION_SECONDS: 600
       NUMBER_OF_VALIDATOR_NODES: 4
diff --git a/system-test/testnet-performance/gce-cpu-only-perf.yml b/system-test/testnet-performance/gce-cpu-only-perf-10-node.yml
similarity index 96%
rename from system-test/testnet-performance/gce-cpu-only-perf.yml
rename to system-test/testnet-performance/gce-cpu-only-perf-10-node.yml
index 1fc39359e..01b460857 100755
--- a/system-test/testnet-performance/gce-cpu-only-perf.yml
+++ b/system-test/testnet-performance/gce-cpu-only-perf-10-node.yml
@@ -8,6 +8,7 @@ steps:
       RAMP_UP_TIME: 60
       TEST_DURATION_SECONDS: 300
       NUMBER_OF_VALIDATOR_NODES: 10
+      ENABLE_GPU: "false"
       VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
       NUMBER_OF_CLIENT_NODES: 1
       CLIENT_OPTIONS: "bench-tps=1=--tx_count 80000 --thread-batch-sleep-ms 1000"
diff --git a/system-test/testnet-performance/gce-cpu-only-perf-5-node.yml b/system-test/testnet-performance/gce-cpu-only-perf-5-node.yml
index c72a9d6e1..f19120898 100755
--- a/system-test/testnet-performance/gce-cpu-only-perf-5-node.yml
+++ b/system-test/testnet-performance/gce-cpu-only-perf-5-node.yml
@@ -8,6 +8,8 @@ steps:
       RAMP_UP_TIME: 0
       TEST_DURATION_SECONDS: 600
       NUMBER_OF_VALIDATOR_NODES: 5
+      ENABLE_GPU: "false"
+      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
       NUMBER_OF_CLIENT_NODES: 2
       CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
       TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
diff --git a/system-test/testnet-performance/gce-gpu-perf-5-node.yml b/system-test/testnet-performance/gce-gpu-perf-5-node.yml
index 9b8e63c35..785bd4234 100755
--- a/system-test/testnet-performance/gce-gpu-perf-5-node.yml
+++ b/system-test/testnet-performance/gce-gpu-perf-5-node.yml
@@ -8,6 +8,7 @@ steps:
       RAMP_UP_TIME: 0
       TEST_DURATION_SECONDS: 600
       NUMBER_OF_VALIDATOR_NODES: 5
+      ENABLE_GPU: "true"
       VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100"
       NUMBER_OF_CLIENT_NODES: 2
       CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
diff --git a/system-test/testnet-performance/gce-gpu-perf.yml b/system-test/testnet-performance/gce-gpu-perf-50-node.yml
similarity index 88%
rename from system-test/testnet-performance/gce-gpu-perf.yml
rename to system-test/testnet-performance/gce-gpu-perf-50-node.yml
index 81163877f..e3cd2eea7 100755
--- a/system-test/testnet-performance/gce-gpu-perf.yml
+++ b/system-test/testnet-performance/gce-gpu-perf-50-node.yml
@@ -8,10 +8,11 @@ steps:
       RAMP_UP_TIME: 0
       TEST_DURATION_SECONDS: 600
       NUMBER_OF_VALIDATOR_NODES: 50
+      ENABLE_GPU: "true"
       VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100"
       NUMBER_OF_CLIENT_NODES: 2
       CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
       TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
-      ADDITIONAL_FLAGS: ""
+      ADDITIONAL_FLAGS: "--dedicated --allow-boot-failures"
     agents:
       - "queue=testnet-deploy"
diff --git a/system-test/testnet-performance/testnet-automation.sh b/system-test/testnet-performance/testnet-automation.sh
index 20c833749..6ef5cf131 100755
--- a/system-test/testnet-performance/testnet-automation.sh
+++ b/system-test/testnet-performance/testnet-automation.sh
@@ -74,16 +74,18 @@ function launchTestnet() {
   case $CLOUD_PROVIDER in
     gce)
     # shellcheck disable=SC2068
+    # shellcheck disable=SC2086
       net/gce.sh create \
         -d pd-ssd \
         -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \
-        "$maybeMachineType" "$VALIDATOR_NODE_MACHINE_TYPE" \
+        $maybeCustomMachineType $VALIDATOR_NODE_MACHINE_TYPE "$maybeEnableGpu" \
         -p "$TESTNET_TAG" ${TESTNET_CLOUD_ZONES[@]/#/"-z "} ${ADDITIONAL_FLAGS[@]/#/" "}
       ;;
     colo)
     # shellcheck disable=SC2068
+    # shellcheck disable=SC2086
       net/colo.sh create \
-        -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" -g \
+        -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" "$maybeEnableGpu" \
         -p "$TESTNET_TAG" ${ADDITIONAL_FLAGS[@]/#/" "}
       ;;
     *)
@@ -169,6 +171,13 @@ if [[ -z $NUMBER_OF_VALIDATOR_NODES ]] ; then
   exit 1
 fi
 
+if [[ -z $ENABLE_GPU ]] ; then
+  ENABLE_GPU=false
+fi
+if [[ "$ENABLE_GPU" = "true" ]] ; then
+  maybeEnableGpu="--enable-gpu"
+fi
+
 if [[ -z $NUMBER_OF_CLIENT_NODES ]] ; then
   echo NUMBER_OF_CLIENT_NODES not defined
   exit 1
@@ -193,7 +202,7 @@ source ci/upload-ci-artifact.sh
 source system-test/testnet-performance/upload_results_to_slack.sh
 
 maybeClientOptions=${CLIENT_OPTIONS:+"-c"}
-maybeMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"-G"}
+maybeCustomMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"--custom-machine-type"}
 
 IFS=, read -r -a TESTNET_CLOUD_ZONES <<<"${TESTNET_ZONES}"
 
@@ -203,6 +212,7 @@ RESULT_DETAILS="Test failed to finish"
 
 TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \
                         NUMBER_OF_VALIDATOR_NODES \
+                        ENABLE_GPU \
                         VALIDATOR_NODE_MACHINE_TYPE \
                         NUMBER_OF_CLIENT_NODES \
                         CLIENT_OPTIONS \