From 9ee65009cd54e9d6fca8f48851b3c27895137672 Mon Sep 17 00:00:00 2001 From: Dan Albert Date: Mon, 28 Oct 2019 16:43:40 -0600 Subject: [PATCH] Implement allowing validator boot failure into automation (#6589) * Pass allow boot failures through create AND start * Extend sleep timeout to all nodes * Add 100 node testcase * Reduce consistent sleep --- net/net.sh | 6 +----- .../gce-gpu-perf-100-node.yml | 19 +++++++++++++++++ .../gce-gpu-perf-50-node.yml | 3 ++- .../testnet-performance/testnet-automation.sh | 21 ++++++++++++++++--- 4 files changed, 40 insertions(+), 9 deletions(-) create mode 100755 system-test/testnet-performance/gce-gpu-perf-100-node.yml diff --git a/net/net.sh b/net/net.sh index 5be1b700a..9c61cc9b8 100755 --- a/net/net.sh +++ b/net/net.sh @@ -722,11 +722,7 @@ deploy() { # Stagger additional node start time. If too many nodes start simultaneously # the bootstrap node gets more rsync requests from the additional nodes than # it can handle. - if ((nodeIndex % 3 == 0)); then - sleep 2 - elif ((nodeIndex % 3 == 1)); then - sleep 4 - fi + sleep 2 fi done diff --git a/system-test/testnet-performance/gce-gpu-perf-100-node.yml b/system-test/testnet-performance/gce-gpu-perf-100-node.yml new file mode 100755 index 000000000..0a63d0c35 --- /dev/null +++ b/system-test/testnet-performance/gce-gpu-perf-100-node.yml @@ -0,0 +1,19 @@ +steps: + - command: "system-test/testnet-performance/testnet-automation.sh" + label: "GCE - GPU Enabled 100 Nodes" + env: + UPLOAD_RESULTS_TO_SLACK: "true" + CLOUD_PROVIDER: "gce" + TESTNET_TAG: "gce-edge-perf-gpu-enabled" + RAMP_UP_TIME: 0 + TEST_DURATION_SECONDS: 600 + NUMBER_OF_VALIDATOR_NODES: 100 + ENABLE_GPU: "true" + VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100" + NUMBER_OF_CLIENT_NODES: 2 + CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250" + TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a" + ALLOW_BOOT_FAILURES: "true" + ADDITIONAL_FLAGS: "--dedicated" + agents: + - "queue=testnet-deploy" diff --git a/system-test/testnet-performance/gce-gpu-perf-50-node.yml b/system-test/testnet-performance/gce-gpu-perf-50-node.yml index 20980b94d..7d4fd84d9 100755 --- a/system-test/testnet-performance/gce-gpu-perf-50-node.yml +++ b/system-test/testnet-performance/gce-gpu-perf-50-node.yml @@ -13,6 +13,7 @@ steps: NUMBER_OF_CLIENT_NODES: 2 CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250" TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a" - ADDITIONAL_FLAGS: "--dedicated --allow-boot-failures" + ALLOW_BOOT_FAILURES: "true" + ADDITIONAL_FLAGS: "--dedicated" agents: - "queue=testnet-deploy" diff --git a/system-test/testnet-performance/testnet-automation.sh b/system-test/testnet-performance/testnet-automation.sh index cf5371385..ab210b708 100755 --- a/system-test/testnet-performance/testnet-automation.sh +++ b/system-test/testnet-performance/testnet-automation.sh @@ -83,7 +83,9 @@ function launchTestnet() { -d pd-ssd \ -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \ $maybeCustomMachineType "$VALIDATOR_NODE_MACHINE_TYPE" $maybeEnableGpu \ - -p "$TESTNET_TAG" ${TESTNET_CLOUD_ZONES[@]/#/"-z "} ${ADDITIONAL_FLAGS[@]/#/" "} + -p "$TESTNET_TAG" $maybeCreateAllowBootFailures \ + ${TESTNET_CLOUD_ZONES[@]/#/"-z "} \ + ${ADDITIONAL_FLAGS[@]/#/" "} ;; colo) # shellcheck disable=SC2068 @@ -102,9 +104,13 @@ function launchTestnet() { echo --- start "$NUMBER_OF_VALIDATOR_NODES" node test if [[ -n $CHANNEL ]]; then - net/net.sh restart -t "$CHANNEL" "$maybeClientOptions" "$CLIENT_OPTIONS" + # shellcheck disable=SC2068 + # shellcheck disable=SC2086 + net/net.sh restart -t "$CHANNEL" "$maybeClientOptions" "$CLIENT_OPTIONS" $maybeStartAllowBootFailures else - net/net.sh restart -T solana-release*.tar.bz2 "$maybeClientOptions" "$CLIENT_OPTIONS" + # shellcheck disable=SC2068 + # shellcheck disable=SC2086 + net/net.sh restart -T solana-release*.tar.bz2 "$maybeClientOptions" "$CLIENT_OPTIONS" $maybeStartAllowBootFailures fi echo --- wait "$RAMP_UP_TIME" seconds for network throughput to stabilize @@ -196,6 +202,14 @@ if [[ -z $SOLANA_METRICS_CONFIG ]]; then fi echo "SOLANA_METRICS_CONFIG: $SOLANA_METRICS_CONFIG" +if [[ -z $ALLOW_BOOT_FAILURES ]] ; then + ALLOW_BOOT_FAILURES=false +fi +if [[ "$ALLOW_BOOT_FAILURES" = "true" ]] ; then + maybeCreateAllowBootFailures="--allow-boot-failures" + maybeStartAllowBootFailures="-F" +fi + if [[ -z $CHANNEL ]]; then echo --- downloading tar from build artifacts buildkite-agent artifact download "solana-release*.tar.bz2" . @@ -222,6 +236,7 @@ TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \ CLIENT_OPTIONS \ TESTNET_ZONES \ TEST_DURATION_SECONDS \ + ALLOW_BOOT_FAILURES \ ADDITIONAL_FLAGS) TEST_CONFIGURATION=