From dadcb632d83080b462e895a3a7c550c427c18f6c Mon Sep 17 00:00:00 2001 From: Dan Albert Date: Thu, 24 Oct 2019 15:12:25 -0600 Subject: [PATCH] Specify machine type without necessarily enabling GPU (#6529) * Specify machine type without necessarily enabling GPU * Make long arg, extend --enable-gpu to automation * Set machine types only in one place * Fixup * Fixup flag in automation * Typo * shellcheck --- net/gce.sh | 48 +++++++++++-------- .../testnet-performance/colo-gpu-perf.yml | 1 + ...perf.yml => gce-cpu-only-perf-10-node.yml} | 1 + .../gce-cpu-only-perf-5-node.yml | 2 + .../gce-gpu-perf-5-node.yml | 1 + ...-gpu-perf.yml => gce-gpu-perf-50-node.yml} | 3 +- .../testnet-performance/testnet-automation.sh | 16 +++++-- 7 files changed, 49 insertions(+), 23 deletions(-) rename system-test/testnet-performance/{gce-cpu-only-perf.yml => gce-cpu-only-perf-10-node.yml} (96%) rename system-test/testnet-performance/{gce-gpu-perf.yml => gce-gpu-perf-50-node.yml} (88%) diff --git a/net/gce.sh b/net/gce.sh index 7ca2f4066..8ff8b378f 100755 --- a/net/gce.sh +++ b/net/gce.sh @@ -14,8 +14,6 @@ gce) cpuBootstrapLeaderMachineType="--custom-cpu 12 --custom-memory 32GB --min-cpu-platform Intel%20Skylake" gpuBootstrapLeaderMachineType="$cpuBootstrapLeaderMachineType --accelerator count=1,type=nvidia-tesla-p100" - bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType - validatorMachineType=$cpuBootstrapLeaderMachineType clientMachineType="--custom-cpu 16 --custom-memory 20GB" blockstreamerMachineType="--machine-type n1-standard-8" archiverMachineType="--custom-cpu 4 --custom-memory 16GB" @@ -30,8 +28,6 @@ ec2) # AVX-512 support. 
The default, p2.xlarge, does not support # AVX-512 gpuBootstrapLeaderMachineType=p2.xlarge - bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType - validatorMachineType=$cpuBootstrapLeaderMachineType clientMachineType=c5.2xlarge blockstreamerMachineType=c5.2xlarge archiverMachineType=c5.xlarge @@ -43,8 +39,6 @@ azure) # TODO: Dial in machine types for Azure cpuBootstrapLeaderMachineType=Standard_D16s_v3 gpuBootstrapLeaderMachineType=Standard_NC12 - bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType - validatorMachineType=$cpuBootstrapLeaderMachineType clientMachineType=Standard_D16s_v3 blockstreamerMachineType=Standard_D16s_v3 archiverMachineType=Standard_D4s_v3 @@ -55,8 +49,6 @@ colo) cpuBootstrapLeaderMachineType=0 gpuBootstrapLeaderMachineType=1 - bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType - validatorMachineType=$cpuBootstrapLeaderMachineType clientMachineType=0 blockstreamerMachineType=0 archiverMachineType=0 @@ -84,6 +76,7 @@ evalInfo=false publicNetwork=false letsEncryptDomainName= enableGpu=false +customMachineType= customAddress= zones=() @@ -131,8 +124,9 @@ Manage testnet instances -r [number] - Number of archiver nodes (default: $archiverNodeCount) -u - Include a Blockstreamer (default: $blockstreamer) -P - Use public network IP addresses (default: $publicNetwork) - -g - Enable GPU (default: $enableGpu) - -G - Enable GPU, and set count/type of GPUs to use + -g - Enable GPU and automatically set validator machine types to $gpuBootstrapLeaderMachineType + (default: $enableGpu) + -G - Enable GPU, and set custom GPU machine type to use (e.g $gpuBootstrapLeaderMachineType) -a [address] - Address to be be assigned to the Blockstreamer if present, otherwise the bootstrap validator. @@ -141,9 +135,14 @@ Manage testnet instances * For EC2, [address] is the "allocation ID" of the desired Elastic IP. -d [disk-type] - Specify a boot disk type (default None) Use pd-ssd to get ssd on GCE. 
- --letsencrypt [dns name] - Attempt to generate a TLS certificate using this - DNS name (useful only when the -a and -P options - are also provided) + --letsencrypt [dns name] + - Attempt to generate a TLS certificate using this + DNS name (useful only when the -a and -P options + are also provided) + --custom-machine-type + - Set a custom machine type without assuming whether or not + GPU is enabled. Set this explicitly with --enable-gpu/-g to call out the presence of GPUs. + --enable-gpu - Use with --custom-machine-type to specify whether or not GPUs should be used/enabled --validator-additional-disk-size-gb [number] - Add an additional [number] GB SSD to all validators to store the config directory. If not set, config will be written to the boot disk by default. @@ -195,6 +194,12 @@ while [[ -n $1 ]]; do elif [[ $1 == --eval ]]; then evalInfo=true shift + elif [[ $1 == --enable-gpu ]]; then + enableGpu=true + shift + elif [[ $1 = --custom-machine-type ]]; then + customMachineType="$2" + shift 2 else usage "Unknown long option: $1" fi @@ -230,15 +235,10 @@ while getopts "h?p:Pn:c:r:z:gG:a:d:uxf" opt "${shortArgs[@]}"; do ;; g) enableGpu=true - bootstrapLeaderMachineType=$gpuBootstrapLeaderMachineType - validatorMachineType=$bootstrapLeaderMachineType - blockstreamerMachineType=$bootstrapLeaderMachineType ;; G) enableGpu=true - bootstrapLeaderMachineType="$OPTARG" - validatorMachineType=$bootstrapLeaderMachineType - blockstreamerMachineType=$bootstrapLeaderMachineType + customMachineType="$OPTARG" ;; a) customAddress=$OPTARG @@ -258,6 +258,16 @@ while getopts "h?p:Pn:c:r:z:gG:a:d:uxf" opt "${shortArgs[@]}"; do esac done +if [[ -n "$customMachineType" ]] ; then + bootstrapLeaderMachineType="$customMachineType" +elif [[ "$enableGpu" = "true" ]] ; then + bootstrapLeaderMachineType="$gpuBootstrapLeaderMachineType" +else + bootstrapLeaderMachineType="$cpuBootstrapLeaderMachineType" +fi +validatorMachineType=$bootstrapLeaderMachineType 
+blockstreamerMachineType=$bootstrapLeaderMachineType + [[ ${#zones[@]} -gt 0 ]] || zones+=("$(cloud_DefaultZone)") [[ -z $1 ]] || usage "Unexpected argument: $1" diff --git a/system-test/testnet-performance/colo-gpu-perf.yml b/system-test/testnet-performance/colo-gpu-perf.yml index 48f763561..851a1e355 100755 --- a/system-test/testnet-performance/colo-gpu-perf.yml +++ b/system-test/testnet-performance/colo-gpu-perf.yml @@ -5,6 +5,7 @@ steps: UPLOAD_RESULTS_TO_SLACK: "true" CLOUD_PROVIDER: "colo" TESTNET_TAG: "colo-edge-perf-gpu-enabled" + ENABLE_GPU: "true" RAMP_UP_TIME: 0 TEST_DURATION_SECONDS: 600 NUMBER_OF_VALIDATOR_NODES: 4 diff --git a/system-test/testnet-performance/gce-cpu-only-perf.yml b/system-test/testnet-performance/gce-cpu-only-perf-10-node.yml similarity index 96% rename from system-test/testnet-performance/gce-cpu-only-perf.yml rename to system-test/testnet-performance/gce-cpu-only-perf-10-node.yml index 1fc39359e..01b460857 100755 --- a/system-test/testnet-performance/gce-cpu-only-perf.yml +++ b/system-test/testnet-performance/gce-cpu-only-perf-10-node.yml @@ -8,6 +8,7 @@ steps: RAMP_UP_TIME: 60 TEST_DURATION_SECONDS: 300 NUMBER_OF_VALIDATOR_NODES: 10 + ENABLE_GPU: "false" VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16" NUMBER_OF_CLIENT_NODES: 1 CLIENT_OPTIONS: "bench-tps=1=--tx_count 80000 --thread-batch-sleep-ms 1000" diff --git a/system-test/testnet-performance/gce-cpu-only-perf-5-node.yml b/system-test/testnet-performance/gce-cpu-only-perf-5-node.yml index c72a9d6e1..f19120898 100755 --- a/system-test/testnet-performance/gce-cpu-only-perf-5-node.yml +++ b/system-test/testnet-performance/gce-cpu-only-perf-5-node.yml @@ -8,6 +8,8 @@ steps: RAMP_UP_TIME: 0 TEST_DURATION_SECONDS: 600 NUMBER_OF_VALIDATOR_NODES: 5 + ENABLE_GPU: "false" + VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16" NUMBER_OF_CLIENT_NODES: 2 CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250" TESTNET_ZONES: 
"us-west1-a,us-west1-b,us-central1-a,europe-west4-a" diff --git a/system-test/testnet-performance/gce-gpu-perf-5-node.yml b/system-test/testnet-performance/gce-gpu-perf-5-node.yml index 9b8e63c35..785bd4234 100755 --- a/system-test/testnet-performance/gce-gpu-perf-5-node.yml +++ b/system-test/testnet-performance/gce-gpu-perf-5-node.yml @@ -8,6 +8,7 @@ steps: RAMP_UP_TIME: 0 TEST_DURATION_SECONDS: 600 NUMBER_OF_VALIDATOR_NODES: 5 + ENABLE_GPU: "true" VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100" NUMBER_OF_CLIENT_NODES: 2 CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250" diff --git a/system-test/testnet-performance/gce-gpu-perf.yml b/system-test/testnet-performance/gce-gpu-perf-50-node.yml similarity index 88% rename from system-test/testnet-performance/gce-gpu-perf.yml rename to system-test/testnet-performance/gce-gpu-perf-50-node.yml index 81163877f..e3cd2eea7 100755 --- a/system-test/testnet-performance/gce-gpu-perf.yml +++ b/system-test/testnet-performance/gce-gpu-perf-50-node.yml @@ -8,10 +8,11 @@ steps: RAMP_UP_TIME: 0 TEST_DURATION_SECONDS: 600 NUMBER_OF_VALIDATOR_NODES: 50 + ENABLE_GPU: "true" VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100" NUMBER_OF_CLIENT_NODES: 2 CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250" TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a" - ADDITIONAL_FLAGS: "" + ADDITIONAL_FLAGS: "--dedicated --allow-boot-failures" agents: - "queue=testnet-deploy" diff --git a/system-test/testnet-performance/testnet-automation.sh b/system-test/testnet-performance/testnet-automation.sh index 20c833749..6ef5cf131 100755 --- a/system-test/testnet-performance/testnet-automation.sh +++ b/system-test/testnet-performance/testnet-automation.sh @@ -74,16 +74,18 @@ function launchTestnet() { case $CLOUD_PROVIDER in gce) # shellcheck disable=SC2068 + # shellcheck disable=SC2086 
net/gce.sh create \ -d pd-ssd \ -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \ - "$maybeMachineType" "$VALIDATOR_NODE_MACHINE_TYPE" \ + $maybeCustomMachineType $VALIDATOR_NODE_MACHINE_TYPE "$maybeEnableGpu" \ -p "$TESTNET_TAG" ${TESTNET_CLOUD_ZONES[@]/#/"-z "} ${ADDITIONAL_FLAGS[@]/#/" "} ;; colo) # shellcheck disable=SC2068 + # shellcheck disable=SC2086 net/colo.sh create \ - -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" -g \ + -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" "$maybeEnableGpu" \ -p "$TESTNET_TAG" ${ADDITIONAL_FLAGS[@]/#/" "} ;; *) @@ -169,6 +171,13 @@ if [[ -z $NUMBER_OF_VALIDATOR_NODES ]] ; then exit 1 fi +if [[ -z $ENABLE_GPU ]] ; then + ENABLE_GPU=false +fi +if [[ "$ENABLE_GPU" = "true" ]] ; then + maybeEnableGpu="--enable-gpu" +fi + if [[ -z $NUMBER_OF_CLIENT_NODES ]] ; then echo NUMBER_OF_CLIENT_NODES not defined exit 1 @@ -193,7 +202,7 @@ source ci/upload-ci-artifact.sh source system-test/testnet-performance/upload_results_to_slack.sh maybeClientOptions=${CLIENT_OPTIONS:+"-c"} -maybeMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"-G"} +maybeCustomMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"--custom-machine-type"} IFS=, read -r -a TESTNET_CLOUD_ZONES <<<"${TESTNET_ZONES}" @@ -203,6 +212,7 @@ RESULT_DETAILS="Test failed to finish" TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \ NUMBER_OF_VALIDATOR_NODES \ + ENABLE_GPU \ VALIDATOR_NODE_MACHINE_TYPE \ NUMBER_OF_CLIENT_NODES \ CLIENT_OPTIONS \