Allow validator boot failures in the testnet automation (#6589)

* Pass the allow-boot-failures option through both create AND start

* Extend sleep timeout to all nodes

* Add 100-node test case

* Reduce consistent sleep
This commit is contained in:
Dan Albert 2019-10-28 16:43:40 -06:00 committed by GitHub
parent 85ccba366a
commit 9ee65009cd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 40 additions and 9 deletions

View File

@ -722,11 +722,7 @@ deploy() {
# Stagger additional node start time. If too many nodes start simultaneously
# the bootstrap node gets more rsync requests from the additional nodes than
# it can handle.
if ((nodeIndex % 3 == 0)); then
sleep 2
elif ((nodeIndex % 3 == 1)); then
sleep 4
fi
sleep 2
fi
done

View File

@ -0,0 +1,19 @@
# Pipeline step that runs testnet-automation.sh against a GPU-enabled,
# 100-validator testnet on GCE. Everything under `env` is handed to that
# script as environment variables.
steps:
- command: "system-test/testnet-performance/testnet-automation.sh"
label: "GCE - GPU Enabled 100 Nodes"
env:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "gce"
TESTNET_TAG: "gce-edge-perf-gpu-enabled"
# Seconds the script waits for network throughput to stabilize after start.
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
# Validator and client counts are passed to the launch command as -n and -c.
NUMBER_OF_VALIDATOR_NODES: 100
ENABLE_GPU: "true"
VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100"
NUMBER_OF_CLIENT_NODES: 2
CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
# When "true", the script sets --allow-boot-failures for the launch command
# and -F for `net/net.sh restart`.
ALLOW_BOOT_FAILURES: "true"
# Extra flags appended to the launch command.
ADDITIONAL_FLAGS: "--dedicated"
agents:
- "queue=testnet-deploy"

View File

@ -13,6 +13,7 @@ steps:
NUMBER_OF_CLIENT_NODES: 2
CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
ADDITIONAL_FLAGS: "--dedicated --allow-boot-failures"
ALLOW_BOOT_FAILURES: "true"
ADDITIONAL_FLAGS: "--dedicated"
agents:
- "queue=testnet-deploy"

View File

@ -83,7 +83,9 @@ function launchTestnet() {
-d pd-ssd \
-n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \
$maybeCustomMachineType "$VALIDATOR_NODE_MACHINE_TYPE" $maybeEnableGpu \
-p "$TESTNET_TAG" ${TESTNET_CLOUD_ZONES[@]/#/"-z "} ${ADDITIONAL_FLAGS[@]/#/" "}
-p "$TESTNET_TAG" $maybeCreateAllowBootFailures \
${TESTNET_CLOUD_ZONES[@]/#/"-z "} \
${ADDITIONAL_FLAGS[@]/#/" "}
;;
colo)
# shellcheck disable=SC2068
@ -102,9 +104,13 @@ function launchTestnet() {
echo --- start "$NUMBER_OF_VALIDATOR_NODES" node test
if [[ -n $CHANNEL ]]; then
net/net.sh restart -t "$CHANNEL" "$maybeClientOptions" "$CLIENT_OPTIONS"
# shellcheck disable=SC2068
# shellcheck disable=SC2086
net/net.sh restart -t "$CHANNEL" "$maybeClientOptions" "$CLIENT_OPTIONS" $maybeStartAllowBootFailures
else
net/net.sh restart -T solana-release*.tar.bz2 "$maybeClientOptions" "$CLIENT_OPTIONS"
# shellcheck disable=SC2068
# shellcheck disable=SC2086
net/net.sh restart -T solana-release*.tar.bz2 "$maybeClientOptions" "$CLIENT_OPTIONS" $maybeStartAllowBootFailures
fi
echo --- wait "$RAMP_UP_TIME" seconds for network throughput to stabilize
@ -196,6 +202,14 @@ if [[ -z $SOLANA_METRICS_CONFIG ]]; then
fi
echo "SOLANA_METRICS_CONFIG: $SOLANA_METRICS_CONFIG"
# Normalize ALLOW_BOOT_FAILURES (defaults to "false" when unset or empty) and
# derive the flags that launchTestnet splices into its command lines:
#   maybeCreateAllowBootFailures - added to the testnet creation command
#   maybeStartAllowBootFailures  - added to `net/net.sh restart`
# Both flags are always initialized to empty first, so they are never left
# unset and never pick up a stray value inherited from the caller's
# environment when boot failures are not allowed.
: "${ALLOW_BOOT_FAILURES:=false}"
maybeCreateAllowBootFailures=
maybeStartAllowBootFailures=
if [[ "$ALLOW_BOOT_FAILURES" = "true" ]]; then
  maybeCreateAllowBootFailures="--allow-boot-failures"
  maybeStartAllowBootFailures="-F"
fi
if [[ -z $CHANNEL ]]; then
echo --- downloading tar from build artifacts
buildkite-agent artifact download "solana-release*.tar.bz2" .
@ -222,6 +236,7 @@ TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \
CLIENT_OPTIONS \
TESTNET_ZONES \
TEST_DURATION_SECONDS \
ALLOW_BOOT_FAILURES \
ADDITIONAL_FLAGS)
TEST_CONFIGURATION=