Implement allowing validator boot failure into automation (#6589)
* Pass allow boot failures through create AND start
* Extend sleep timeout to all nodes
* Add 100 node testcase
* Reduce consistent sleep
This commit is contained in:
parent
85ccba366a
commit
9ee65009cd
|
@ -722,11 +722,7 @@ deploy() {
|
|||
# Stagger additional node start time. If too many nodes start simultaneously
|
||||
# the bootstrap node gets more rsync requests from the additional nodes than
|
||||
# it can handle.
|
||||
if ((nodeIndex % 3 == 0)); then
|
||||
sleep 2
|
||||
elif ((nodeIndex % 3 == 1)); then
|
||||
sleep 4
|
||||
fi
|
||||
sleep 2
|
||||
fi
|
||||
done
|
||||
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
steps:
|
||||
- command: "system-test/testnet-performance/testnet-automation.sh"
|
||||
label: "GCE - GPU Enabled 100 Nodes"
|
||||
env:
|
||||
UPLOAD_RESULTS_TO_SLACK: "true"
|
||||
CLOUD_PROVIDER: "gce"
|
||||
TESTNET_TAG: "gce-edge-perf-gpu-enabled"
|
||||
RAMP_UP_TIME: 0
|
||||
TEST_DURATION_SECONDS: 600
|
||||
NUMBER_OF_VALIDATOR_NODES: 100
|
||||
ENABLE_GPU: "true"
|
||||
VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100"
|
||||
NUMBER_OF_CLIENT_NODES: 2
|
||||
CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
|
||||
TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
|
||||
ALLOW_BOOT_FAILURES: "true"
|
||||
ADDITIONAL_FLAGS: "--dedicated"
|
||||
agents:
|
||||
- "queue=testnet-deploy"
|
|
@ -13,6 +13,7 @@ steps:
|
|||
NUMBER_OF_CLIENT_NODES: 2
|
||||
CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
|
||||
TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
|
||||
ADDITIONAL_FLAGS: "--dedicated --allow-boot-failures"
|
||||
ALLOW_BOOT_FAILURES: "true"
|
||||
ADDITIONAL_FLAGS: "--dedicated"
|
||||
agents:
|
||||
- "queue=testnet-deploy"
|
||||
|
|
|
@ -83,7 +83,9 @@ function launchTestnet() {
|
|||
-d pd-ssd \
|
||||
-n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \
|
||||
$maybeCustomMachineType "$VALIDATOR_NODE_MACHINE_TYPE" $maybeEnableGpu \
|
||||
-p "$TESTNET_TAG" ${TESTNET_CLOUD_ZONES[@]/#/"-z "} ${ADDITIONAL_FLAGS[@]/#/" "}
|
||||
-p "$TESTNET_TAG" $maybeCreateAllowBootFailures \
|
||||
${TESTNET_CLOUD_ZONES[@]/#/"-z "} \
|
||||
${ADDITIONAL_FLAGS[@]/#/" "}
|
||||
;;
|
||||
colo)
|
||||
# shellcheck disable=SC2068
|
||||
|
@ -102,9 +104,13 @@ function launchTestnet() {
|
|||
|
||||
echo --- start "$NUMBER_OF_VALIDATOR_NODES" node test
|
||||
if [[ -n $CHANNEL ]]; then
|
||||
net/net.sh restart -t "$CHANNEL" "$maybeClientOptions" "$CLIENT_OPTIONS"
|
||||
# shellcheck disable=SC2068
|
||||
# shellcheck disable=SC2086
|
||||
net/net.sh restart -t "$CHANNEL" "$maybeClientOptions" "$CLIENT_OPTIONS" $maybeStartAllowBootFailures
|
||||
else
|
||||
net/net.sh restart -T solana-release*.tar.bz2 "$maybeClientOptions" "$CLIENT_OPTIONS"
|
||||
# shellcheck disable=SC2068
|
||||
# shellcheck disable=SC2086
|
||||
net/net.sh restart -T solana-release*.tar.bz2 "$maybeClientOptions" "$CLIENT_OPTIONS" $maybeStartAllowBootFailures
|
||||
fi
|
||||
|
||||
echo --- wait "$RAMP_UP_TIME" seconds for network throughput to stabilize
|
||||
|
@ -196,6 +202,14 @@ if [[ -z $SOLANA_METRICS_CONFIG ]]; then
|
|||
fi
|
||||
echo "SOLANA_METRICS_CONFIG: $SOLANA_METRICS_CONFIG"
|
||||
|
||||
if [[ -z $ALLOW_BOOT_FAILURES ]] ; then
|
||||
ALLOW_BOOT_FAILURES=false
|
||||
fi
|
||||
if [[ "$ALLOW_BOOT_FAILURES" = "true" ]] ; then
|
||||
maybeCreateAllowBootFailures="--allow-boot-failures"
|
||||
maybeStartAllowBootFailures="-F"
|
||||
fi
|
||||
|
||||
if [[ -z $CHANNEL ]]; then
|
||||
echo --- downloading tar from build artifacts
|
||||
buildkite-agent artifact download "solana-release*.tar.bz2" .
|
||||
|
@ -222,6 +236,7 @@ TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \
|
|||
CLIENT_OPTIONS \
|
||||
TESTNET_ZONES \
|
||||
TEST_DURATION_SECONDS \
|
||||
ALLOW_BOOT_FAILURES \
|
||||
ADDITIONAL_FLAGS)
|
||||
|
||||
TEST_CONFIGURATION=
|
||||
|
|
Loading…
Reference in New Issue