Add partial network outage before test functionality (#17291)
* Add partial network outage before testing functionality
  Allow a percentage of the stake to go offline before an automation test starts.
* Code review fixes
This commit is contained in:
parent
a35024123c
commit
e3f5c0005b
|
@ -705,7 +705,7 @@ stopNode() {
|
||||||
declare pid=$!
|
declare pid=$!
|
||||||
ln -sf "stop-validator-$ipAddress.log" "$netLogDir/stop-validator-$pid.log"
|
ln -sf "stop-validator-$ipAddress.log" "$netLogDir/stop-validator-$pid.log"
|
||||||
if $block; then
|
if $block; then
|
||||||
wait $pid
|
wait $pid || true
|
||||||
else
|
else
|
||||||
pids+=("$pid")
|
pids+=("$pid")
|
||||||
fi
|
fi
|
||||||
|
|
|
@ -60,7 +60,7 @@ function analyze_packet_loss {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
function wait_for_bootstrap_validator_stake_drop {
|
function wait_for_max_stake {
|
||||||
max_stake="$1"
|
max_stake="$1"
|
||||||
if [[ $max_stake -eq 100 ]]; then
|
if [[ $max_stake -eq 100 ]]; then
|
||||||
return
|
return
|
||||||
|
@ -74,6 +74,16 @@ function wait_for_bootstrap_validator_stake_drop {
|
||||||
ssh "${sshOptions[@]}" "${validatorIpList[0]}" "RUST_LOG=info \$HOME/.cargo/bin/solana wait-for-max-stake $max_stake --url http://127.0.0.1:8899"
|
ssh "${sshOptions[@]}" "${validatorIpList[0]}" "RUST_LOG=info \$HOME/.cargo/bin/solana wait-for-max-stake $max_stake --url http://127.0.0.1:8899"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function wait_for_equal_stake {
|
||||||
|
source "${REPO_ROOT}"/net/common.sh
|
||||||
|
loadConfigFile
|
||||||
|
|
||||||
|
max_stake=$((100 / ${#validatorIpList[@]} + 1))
|
||||||
|
execution_step "Waiting for max stake to fall below ${max_stake}%"
|
||||||
|
|
||||||
|
wait_for_max_stake $max_stake
|
||||||
|
}
|
||||||
|
|
||||||
function get_slot {
|
function get_slot {
|
||||||
source "${REPO_ROOT}"/net/common.sh
|
source "${REPO_ROOT}"/net/common.sh
|
||||||
loadConfigFile
|
loadConfigFile
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
steps:
|
||||||
|
- command: "system-test/testnet-automation.sh"
|
||||||
|
label: "GCE - CPU Only 5 Node - 20% network offline with 2 partitions"
|
||||||
|
env:
|
||||||
|
UPLOAD_RESULTS_TO_SLACK: "true"
|
||||||
|
CLOUD_PROVIDER: "gce"
|
||||||
|
TESTNET_TAG: "gce-perf-cpu-only"
|
||||||
|
NUMBER_OF_VALIDATOR_NODES: 4
|
||||||
|
ENABLE_GPU: "false"
|
||||||
|
VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
|
||||||
|
NUMBER_OF_CLIENT_NODES: 1
|
||||||
|
CLIENT_OPTIONS: "bench-tps=1=--tx_count 10000 --thread-batch-sleep-ms 250"
|
||||||
|
TESTNET_ZONES: "us-west1-a"
|
||||||
|
USE_PUBLIC_IP_ADDRESSES: "true"
|
||||||
|
ADDITIONAL_FLAGS: "--dedicated"
|
||||||
|
APPLY_PARTITIONS: "true"
|
||||||
|
NETEM_CONFIG_FILE: "system-test/netem-configs/complete-loss-two-partitions"
|
||||||
|
PARTITION_ACTIVE_DURATION: 30
|
||||||
|
PARTITION_INACTIVE_DURATION: 30
|
||||||
|
PARTITION_ITERATION_COUNT: 5
|
||||||
|
TEST_TYPE: "partition"
|
||||||
|
EXTRA_PRIMORDIAL_STAKES: 4
|
||||||
|
WAIT_FOR_EQUAL_STAKE: "true"
|
||||||
|
WARMUP_SLOTS_BEFORE_TEST: 400
|
||||||
|
NUMBER_OF_OFFLINE_NODES: 1
|
||||||
|
agents:
|
||||||
|
- "queue=gce-deploy"
|
|
@ -142,8 +142,12 @@ function launch_testnet() {
|
||||||
-c idle=$NUMBER_OF_CLIENT_NODES $maybeStartAllowBootFailures \
|
-c idle=$NUMBER_OF_CLIENT_NODES $maybeStartAllowBootFailures \
|
||||||
--gpu-mode $startGpuMode $maybeWarpSlot $maybeAsyncNodeInit $maybeExtraPrimordialStakes
|
--gpu-mode $startGpuMode $maybeWarpSlot $maybeAsyncNodeInit $maybeExtraPrimordialStakes
|
||||||
|
|
||||||
|
if [[ -n "$WAIT_FOR_EQUAL_STAKE" ]]; then
|
||||||
|
wait_for_equal_stake
|
||||||
|
else
|
||||||
execution_step "Waiting for bootstrap validator's stake to fall below ${BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD}%"
|
execution_step "Waiting for bootstrap validator's stake to fall below ${BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD}%"
|
||||||
wait_for_bootstrap_validator_stake_drop "$BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD"
|
wait_for_max_stake "$BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD"
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ $NUMBER_OF_CLIENT_NODES -gt 0 ]]; then
|
if [[ $NUMBER_OF_CLIENT_NODES -gt 0 ]]; then
|
||||||
execution_step "Starting ${NUMBER_OF_CLIENT_NODES} client nodes"
|
execution_step "Starting ${NUMBER_OF_CLIENT_NODES} client nodes"
|
||||||
|
@ -153,6 +157,24 @@ function launch_testnet() {
|
||||||
sleep 180
|
sleep 180
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [[ -n "$WARMUP_SLOTS_BEFORE_TEST" ]]; then
|
||||||
|
# Allow the network to run for a bit before beginning the test
|
||||||
|
while [[ "$WARMUP_SLOTS_BEFORE_TEST" -gt $(get_slot) ]]; do
|
||||||
|
sleep 5
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Stop the specified number of nodes
|
||||||
|
num_online_nodes=$(( NUMBER_OF_VALIDATOR_NODES + 1 ))
|
||||||
|
if [[ -n "$NUMBER_OF_OFFLINE_NODES" ]]; then
|
||||||
|
execution_step "Stopping $NUMBER_OF_OFFLINE_NODES nodes"
|
||||||
|
for (( i=NUMBER_OF_VALIDATOR_NODES; i>$(( NUMBER_OF_VALIDATOR_NODES - NUMBER_OF_OFFLINE_NODES )); i-- )); do
|
||||||
|
# shellcheck disable=SC2154
|
||||||
|
"${REPO_ROOT}"/net/net.sh stopnode -i "${validatorIpList[$i]}"
|
||||||
|
done
|
||||||
|
num_online_nodes=$(( num_online_nodes - NUMBER_OF_OFFLINE_NODES ))
|
||||||
|
fi
|
||||||
|
|
||||||
SECONDS=0
|
SECONDS=0
|
||||||
START_SLOT=$(get_slot)
|
START_SLOT=$(get_slot)
|
||||||
SLOT_COUNT_START_SECONDS=$SECONDS
|
SLOT_COUNT_START_SECONDS=$SECONDS
|
||||||
|
@ -170,11 +192,11 @@ function launch_testnet() {
|
||||||
for (( i=1; i<=PARTITION_ITERATION_COUNT; i++ )); do
|
for (( i=1; i<=PARTITION_ITERATION_COUNT; i++ )); do
|
||||||
execution_step "Partition Iteration $i of $PARTITION_ITERATION_COUNT"
|
execution_step "Partition Iteration $i of $PARTITION_ITERATION_COUNT"
|
||||||
execution_step "Applying netem config $NETEM_CONFIG_FILE for $PARTITION_ACTIVE_DURATION seconds"
|
execution_step "Applying netem config $NETEM_CONFIG_FILE for $PARTITION_ACTIVE_DURATION seconds"
|
||||||
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE"
|
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" -n $num_online_nodes
|
||||||
sleep "$PARTITION_ACTIVE_DURATION"
|
sleep "$PARTITION_ACTIVE_DURATION"
|
||||||
|
|
||||||
execution_step "Resolving partitions for $PARTITION_INACTIVE_DURATION seconds"
|
execution_step "Resolving partitions for $PARTITION_INACTIVE_DURATION seconds"
|
||||||
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup
|
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup -n $num_online_nodes
|
||||||
sleep "$PARTITION_INACTIVE_DURATION"
|
sleep "$PARTITION_INACTIVE_DURATION"
|
||||||
done
|
done
|
||||||
STATS_FINISH_SECONDS=$SECONDS
|
STATS_FINISH_SECONDS=$SECONDS
|
||||||
|
@ -325,6 +347,9 @@ TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \
|
||||||
ADDITIONAL_FLAGS \
|
ADDITIONAL_FLAGS \
|
||||||
APPLY_PARTITIONS \
|
APPLY_PARTITIONS \
|
||||||
NETEM_CONFIG_FILE \
|
NETEM_CONFIG_FILE \
|
||||||
|
WAIT_FOR_EQUAL_STAKE \
|
||||||
|
WARMUP_SLOTS_BEFORE_TEST \
|
||||||
|
NUMBER_OF_OFFLINE_NODES \
|
||||||
PARTITION_ACTIVE_DURATION \
|
PARTITION_ACTIVE_DURATION \
|
||||||
PARTITION_INACTIVE_DURATION \
|
PARTITION_INACTIVE_DURATION \
|
||||||
PARTITION_ITERATION_COUNT \
|
PARTITION_ITERATION_COUNT \
|
||||||
|
|
Loading…
Reference in New Issue