Specify machine type without necessarily enabling GPU (#6529)

* Specifiy machine type without necessarily enabling GPU * Make long arg, extend --enable-gpu to automation * Set machine types only in one place * Fixup * Fixup flag in automation * Typo * shellcheck
2019-10-24 15:12:25 -06:00 · 2019-10-24 15:12:25 -06:00 · dadcb632d8
parent 2de2fbd5e3
commit dadcb632d8
7 changed files with 49 additions and 23 deletions
--- a/net/gce.sh
+++ b/net/gce.sh
@ -14,8 +14,6 @@ gce)
  cpuBootstrapLeaderMachineType="--custom-cpu 12 --custom-memory 32GB --min-cpu-platform Intel%20Skylake"
  gpuBootstrapLeaderMachineType="$cpuBootstrapLeaderMachineType --accelerator count=1,type=nvidia-tesla-p100"
  bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType
  validatorMachineType=$cpuBootstrapLeaderMachineType
  clientMachineType="--custom-cpu 16 --custom-memory 20GB"
  blockstreamerMachineType="--machine-type n1-standard-8"
  archiverMachineType="--custom-cpu 4 --custom-memory 16GB"
@ -30,8 +28,6 @@ ec2)
  #       AVX-512 support.  The default, p2.xlarge, does not support
  #       AVX-512
  gpuBootstrapLeaderMachineType=p2.xlarge
  bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType
  validatorMachineType=$cpuBootstrapLeaderMachineType
  clientMachineType=c5.2xlarge
  blockstreamerMachineType=c5.2xlarge
  archiverMachineType=c5.xlarge
@ -43,8 +39,6 @@ azure)
  # TODO: Dial in machine types for Azure
  cpuBootstrapLeaderMachineType=Standard_D16s_v3
  gpuBootstrapLeaderMachineType=Standard_NC12
  bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType
  validatorMachineType=$cpuBootstrapLeaderMachineType
  clientMachineType=Standard_D16s_v3
  blockstreamerMachineType=Standard_D16s_v3
  archiverMachineType=Standard_D4s_v3
@ -55,8 +49,6 @@ colo)
  cpuBootstrapLeaderMachineType=0
  gpuBootstrapLeaderMachineType=1
  bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType
  validatorMachineType=$cpuBootstrapLeaderMachineType
  clientMachineType=0
  blockstreamerMachineType=0
  archiverMachineType=0
@ -84,6 +76,7 @@ evalInfo=false
 publicNetwork=false
 letsEncryptDomainName=
 enableGpu=false
 customMachineType=
 customAddress=
 zones=()
@ -131,8 +124,9 @@ Manage testnet instances
   -r [number]      - Number of archiver nodes (default: $archiverNodeCount)
   -u               - Include a Blockstreamer (default: $blockstreamer)
   -P               - Use public network IP addresses (default: $publicNetwork)
-   -g               - Enable GPU (default: $enableGpu)
+   -g               - Enable GPU and automatically set validator machine types to $gpuBootstrapLeaderMachineType
-   -G               - Enable GPU, and set count/type of GPUs to use
+                      (default: $enableGpu)
   -G               - Enable GPU, and set custom GPU machine type to use
                      (e.g $gpuBootstrapLeaderMachineType)
   -a [address]     - Address to be be assigned to the Blockstreamer if present,
                      otherwise the bootstrap validator.
@ -141,9 +135,14 @@ Manage testnet instances
                      * For EC2, [address] is the "allocation ID" of the desired
                        Elastic IP.
   -d [disk-type]   - Specify a boot disk type (default None) Use pd-ssd to get ssd on GCE.
-   --letsencrypt [dns name] - Attempt to generate a TLS certificate using this
+   --letsencrypt [dns name]
-                              DNS name (useful only when the -a and -P options
+                    - Attempt to generate a TLS certificate using this
-                              are also provided)
+                      DNS name (useful only when the -a and -P options
                      are also provided)
   --custom-machine-type
                    - Set a custom machine type without assuming whether or not
                      GPU is enabled.  Set this explicitly with --enable-gpu/-g to call out the presence of GPUs.
   --enable-gpu     - Use with --custom-machine-type to specify whether or not GPUs should be used/enabled
   --validator-additional-disk-size-gb [number]
                    - Add an additional [number] GB SSD to all validators to store the config directory.
                      If not set, config will be written to the boot disk by default.
@ -195,6 +194,12 @@ while [[ -n $1 ]]; do
    elif [[ $1 == --eval ]]; then
      evalInfo=true
      shift
    elif [[ $1 == --enable-gpu ]]; then
      enableGpu=true
      shift
    elif [[ $1 = --custom-machine-type ]]; then
      customMachineType="$2"
      shift 2
    else
      usage "Unknown long option: $1"
    fi
@ -230,15 +235,10 @@ while getopts "h?p:Pn:c:r:z:gG:a:d:uxf" opt "${shortArgs[@]}"; do
    ;;
  g)
    enableGpu=true
    bootstrapLeaderMachineType=$gpuBootstrapLeaderMachineType
    validatorMachineType=$bootstrapLeaderMachineType
    blockstreamerMachineType=$bootstrapLeaderMachineType
    ;;
  G)
    enableGpu=true
-    bootstrapLeaderMachineType="$OPTARG"
+    customMachineType="$OPTARG"
    validatorMachineType=$bootstrapLeaderMachineType
    blockstreamerMachineType=$bootstrapLeaderMachineType
    ;;
  a)
    customAddress=$OPTARG
@ -258,6 +258,16 @@ while getopts "h?p:Pn:c:r:z:gG:a:d:uxf" opt "${shortArgs[@]}"; do
  esac
 done
 if [[ -n "$customMachineType" ]] ; then
  bootstrapLeaderMachineType="$customMachineType"
 elif [[ "$enableGpu" = "true" ]] ; then
  bootstrapLeaderMachineType="$gpuBootstrapLeaderMachineType"
 else
  bootstrapLeaderMachineType="$cpuBootstrapLeaderMachineType"
 fi
 validatorMachineType=$bootstrapLeaderMachineType
 blockstreamerMachineType=$bootstrapLeaderMachineType
 [[ ${#zones[@]} -gt 0 ]] || zones+=("$(cloud_DefaultZone)")
 [[ -z $1 ]] || usage "Unexpected argument: $1"
--- a/system-test/testnet-performance/colo-gpu-perf.yml
+++ b/system-test/testnet-performance/colo-gpu-perf.yml
@ -5,6 +5,7 @@ steps:
      UPLOAD_RESULTS_TO_SLACK: "true"
      CLOUD_PROVIDER: "colo"
      TESTNET_TAG: "colo-edge-perf-gpu-enabled"
      ENABLE_GPU: "true"
      RAMP_UP_TIME: 0
      TEST_DURATION_SECONDS: 600
      NUMBER_OF_VALIDATOR_NODES: 4
--- a/system-test/testnet-performance/gce-cpu-only-perf-10-node.yml
+++ b/system-test/testnet-performance/gce-cpu-only-perf-10-node.yml
@ -8,6 +8,7 @@ steps:
      RAMP_UP_TIME: 60
      TEST_DURATION_SECONDS: 300
      NUMBER_OF_VALIDATOR_NODES: 10
      ENABLE_GPU: "false"
      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
      NUMBER_OF_CLIENT_NODES: 1
      CLIENT_OPTIONS: "bench-tps=1=--tx_count 80000 --thread-batch-sleep-ms 1000"
--- a/system-test/testnet-performance/gce-cpu-only-perf-5-node.yml
+++ b/system-test/testnet-performance/gce-cpu-only-perf-5-node.yml
@ -8,6 +8,8 @@ steps:
      RAMP_UP_TIME: 0
      TEST_DURATION_SECONDS: 600
      NUMBER_OF_VALIDATOR_NODES: 5
      ENABLE_GPU: "false"
      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
      NUMBER_OF_CLIENT_NODES: 2
      CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
      TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
--- a/system-test/testnet-performance/gce-gpu-perf-5-node.yml
+++ b/system-test/testnet-performance/gce-gpu-perf-5-node.yml
@ -8,6 +8,7 @@ steps:
      RAMP_UP_TIME: 0
      TEST_DURATION_SECONDS: 600
      NUMBER_OF_VALIDATOR_NODES: 5
      ENABLE_GPU: "true"
      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100"
      NUMBER_OF_CLIENT_NODES: 2
      CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
--- a/system-test/testnet-performance/gce-gpu-perf-50-node.yml
+++ b/system-test/testnet-performance/gce-gpu-perf-50-node.yml
@ -8,10 +8,11 @@ steps:
      RAMP_UP_TIME: 0
      TEST_DURATION_SECONDS: 600
      NUMBER_OF_VALIDATOR_NODES: 50
      ENABLE_GPU: "true"
      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100"
      NUMBER_OF_CLIENT_NODES: 2
      CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
      TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
-      ADDITIONAL_FLAGS: ""
+      ADDITIONAL_FLAGS: "--dedicated --allow-boot-failures"
    agents:
      - "queue=testnet-deploy"
--- a/system-test/testnet-performance/testnet-automation.sh
+++ b/system-test/testnet-performance/testnet-automation.sh
@ -74,16 +74,18 @@ function launchTestnet() {
  case $CLOUD_PROVIDER in
    gce)
    # shellcheck disable=SC2068
    # shellcheck disable=SC2086
      net/gce.sh create \
        -d pd-ssd \
        -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \
-        "$maybeMachineType" "$VALIDATOR_NODE_MACHINE_TYPE" \
+        $maybeCustomMachineType $VALIDATOR_NODE_MACHINE_TYPE "$maybeEnableGpu" \
        -p "$TESTNET_TAG" ${TESTNET_CLOUD_ZONES[@]/#/"-z "} ${ADDITIONAL_FLAGS[@]/#/" "}
      ;;
    colo)
    # shellcheck disable=SC2068
    # shellcheck disable=SC2086
      net/colo.sh create \
-        -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" -g \
+        -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" "$maybeEnableGpu" \
        -p "$TESTNET_TAG" ${ADDITIONAL_FLAGS[@]/#/" "}
      ;;
    *)
@ -169,6 +171,13 @@ if [[ -z $NUMBER_OF_VALIDATOR_NODES ]] ; then
  exit 1
 fi
 if [[ -z $ENABLE_GPU ]] ; then
  ENABLE_GPU=false
 fi
 if [[ "$ENABLE_GPU" = "true" ]] ; then
  maybeEnableGpu="--enable-gpu"
 fi
 if [[ -z $NUMBER_OF_CLIENT_NODES ]] ; then
  echo NUMBER_OF_CLIENT_NODES not defined
  exit 1
@ -193,7 +202,7 @@ source ci/upload-ci-artifact.sh
 source system-test/testnet-performance/upload_results_to_slack.sh
 maybeClientOptions=${CLIENT_OPTIONS:+"-c"}
-maybeMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"-G"}
+maybeCustomMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"--custom-machine-type"}
 IFS=, read -r -a TESTNET_CLOUD_ZONES <<<"${TESTNET_ZONES}"
@ -203,6 +212,7 @@ RESULT_DETAILS="Test failed to finish"
 TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \
                        NUMBER_OF_VALIDATOR_NODES \
                        ENABLE_GPU \
                        VALIDATOR_NODE_MACHINE_TYPE \
                        NUMBER_OF_CLIENT_NODES \
                        CLIENT_OPTIONS \