Implement automated partition testing (#7222)

Dan Albert 2020-01-22 13:46:50 -05:00 committed by GitHub
parent 023074650f
commit 7587656cf6
22 changed files with 192 additions and 45 deletions

View File

@@ -1009,6 +1009,7 @@ logs)
;;
netem)
if [[ -n $netemConfigFile ]]; then
remoteNetemConfigFile="$(basename "$netemConfigFile")"
if [[ $netemCommand = "add" ]]; then
for ipAddress in "${validatorIpList[@]}"; do
"$here"/scp.sh "$netemConfigFile" solana@"$ipAddress":~/solana
@@ -1016,7 +1017,7 @@ netem)
fi
for i in "${!validatorIpList[@]}"; do
"$here"/ssh.sh solana@"${validatorIpList[$i]}" 'solana/scripts/net-shaper.sh' \
"$netemCommand" ~solana/solana/"$netemConfigFile" "${#validatorIpList[@]}" "$i"
"$netemCommand" ~solana/solana/"$remoteNetemConfigFile" "${#validatorIpList[@]}" "$i"
done
else
num_nodes=$((${#validatorIpList[@]}*netemPartition/100))
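The hunk above only changes which path is handed to net-shaper.sh (the config's basename under ~solana/solana on the remote host); net-shaper.sh itself is not part of this diff. As rough orientation, a node's index can be mapped onto one of the partition buckets defined by the percentage list in the JSON config that follows — a minimal sketch under that assumption, not the script's actual implementation:

```bash
#!/usr/bin/env bash
# Illustrative sketch only: net-shaper.sh is not included in this diff, so the
# real mapping may differ. Given the "partitions" percentages, the total node
# count, and this node's index, pick the bucket the node falls into.
set -e

partitions=(34 33 33)    # from the JSON config's "partitions" array
num_nodes=${1:-4}        # e.g. ${#validatorIpList[@]} passed by net.sh
my_index=${2:-0}         # e.g. $i passed by net.sh

my_partition=-1
cumulative=0
for p in "${!partitions[@]}"; do
  cumulative=$((cumulative + partitions[p]))
  # The node belongs to the first bucket whose cumulative share covers its index
  if (( my_index < num_nodes * cumulative / 100 )); then
    my_partition=$p
    break
  fi
done
echo "node $my_index of $num_nodes -> partition $my_partition"
```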

View File

@@ -0,0 +1,39 @@
{
"partitions":[
34,
33,
33
],
"interconnects":[
{
"a":0,
"b":1,
"config":"loss 15% delay 25ms"
},
{
"a":1,
"b":0,
"config":"loss 15% delay 25ms"
},
{
"a":0,
"b":2,
"config":"loss 10% delay 15ms"
},
{
"a":2,
"b":0,
"config":"loss 10% delay 15ms"
},
{
"a":2,
"b":1,
"config":"loss 5% delay 5ms"
},
{
"a":1,
"b":2,
"config":"loss 5% delay 5ms"
}
]
}
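This new config (referenced by the pipelines below as system-test/netem-configs/partial-loss-three-partitions) splits the validators into three partitions holding roughly 34%, 33%, and 33% of the nodes. Each "interconnects" entry impairs traffic from partition "a" toward partition "b"; both directions are listed explicitly so each link is degraded symmetrically. In tc/netem terms, one "config" string corresponds roughly to the snippet below — the interface name is an assumption, and in practice the rule has to be scoped to the destination IPs of the other partition, which is net-shaper.sh's job:

```bash
# Illustrative only: a flat netem qdisc impairing all egress traffic on eth0.
sudo tc qdisc add dev eth0 root netem loss 15% delay 25ms   # apply "loss 15% delay 25ms"
sudo tc qdisc del dev eth0 root                             # undo, cf. --netem-cmd cleanup
```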

View File

@@ -0,0 +1,19 @@
steps:
- command: "system-test/testnet-automation.sh"
label: "Colo - CPU Only - Partial Loss 3 Partitions"
env:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "colo"
TESTNET_TAG: "colo-perf-cpu-only"
NUMBER_OF_VALIDATOR_NODES: 4
ENABLE_GPU: "false"
NUMBER_OF_CLIENT_NODES: 2
CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
ADDITIONAL_FLAGS: ""
APPLY_PARTITIONS: "true"
NETEM_CONFIG_FILE: "system-test/netem-configs/partial-loss-three-partitions"
PARTITION_ACTIVE_DURATION: 30
PARTITION_INACTIVE_DURATION: 30
PARTITION_ITERATION_COUNT: 5
agents:
- "queue=colo-deploy"

View File

@@ -0,0 +1,19 @@
steps:
- command: "system-test/testnet-automation.sh"
label: "Colo - CPU Only - Partial Loss 3 Partitions"
env:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "colo"
TESTNET_TAG: "colo-perf-cpu-only"
NUMBER_OF_VALIDATOR_NODES: 4
ENABLE_GPU: "false"
NUMBER_OF_CLIENT_NODES: 1
CLIENT_OPTIONS: "bench-tps=1=--tx_count 15000 --thread-batch-sleep-ms 250"
ADDITIONAL_FLAGS: ""
APPLY_PARTITIONS: "true"
NETEM_CONFIG_FILE: "system-test/netem-configs/partial-loss-three-partitions"
PARTITION_ACTIVE_DURATION: 60
PARTITION_INACTIVE_DURATION: 60
PARTITION_ITERATION_COUNT: 10
agents:
- "queue=colo-deploy"

View File

@@ -0,0 +1,19 @@
steps:
- command: "system-test/testnet-automation.sh"
label: "Colo - CPU Only - Partial Loss 3 Partitions"
env:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "colo"
TESTNET_TAG: "colo-perf-cpu-only"
NUMBER_OF_VALIDATOR_NODES: 4
ENABLE_GPU: "false"
NUMBER_OF_CLIENT_NODES: 1
CLIENT_OPTIONS: "bench-tps=1=--tx_count 15000 --thread-batch-sleep-ms 250"
ADDITIONAL_FLAGS: ""
APPLY_PARTITIONS: "true"
NETEM_CONFIG_FILE: "system-test/netem-configs/partial-loss-three-partitions"
PARTITION_ACTIVE_DURATION: 30
PARTITION_INACTIVE_DURATION: 30
PARTITION_ITERATION_COUNT: 2
agents:
- "queue=colo-deploy"

View File

@@ -0,0 +1,22 @@
steps:
- command: "system-test/testnet-automation.sh"
label: "GCE - CPU Only 5 Node - Partial Loss 3 Partitions"
env:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "gce"
TESTNET_TAG: "gce-perf-cpu-only"
NUMBER_OF_VALIDATOR_NODES: 5
ENABLE_GPU: "false"
VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
NUMBER_OF_CLIENT_NODES: 2
CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
TESTNET_ZONES: "us-west1-a"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: ""
APPLY_PARTITIONS: "true"
NETEM_CONFIG_FILE: "system-test/netem-configs/partial-loss-three-partitions"
PARTITION_ACTIVE_DURATION: 30
PARTITION_INACTIVE_DURATION: 30
PARTITION_ITERATION_COUNT: 5
agents:
- "queue=testnet-deploy"

View File

@@ -5,7 +5,6 @@ steps:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "ec2"
TESTNET_TAG: "aws-perf-cpu-only"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 10
ENABLE_GPU: "false"

View File

@@ -5,7 +5,6 @@ steps:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "ec2"
TESTNET_TAG: "aws-perf-cpu-only"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 5
ENABLE_GPU: "false"

View File

@@ -5,7 +5,6 @@ steps:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "azure"
TESTNET_TAG: "azure-perf-cpu-only"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 5
ENABLE_GPU: "false"

View File

@@ -6,7 +6,6 @@ steps:
CLOUD_PROVIDER: "colo"
TESTNET_TAG: "colo-perf-cpu-only"
ENABLE_GPU: "false"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 4
NUMBER_OF_CLIENT_NODES: 2

View File

@@ -6,7 +6,6 @@ steps:
CLOUD_PROVIDER: "colo"
TESTNET_TAG: "colo-perf-gpu-enabled"
ENABLE_GPU: "true"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 4
NUMBER_OF_CLIENT_NODES: 2

View File

@@ -6,7 +6,6 @@ steps:
CLOUD_PROVIDER: "colo"
TESTNET_TAG: "colo-perf-gpu-enabled"
ENABLE_GPU: "true"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 4
NUMBER_OF_CLIENT_NODES: 2

View File

@@ -5,7 +5,6 @@ steps:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "gce"
TESTNET_TAG: "gce-perf-cpu-only"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 10
ENABLE_GPU: "false"

View File

@@ -5,7 +5,6 @@ steps:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "gce"
TESTNET_TAG: "gce-perf-cpu-only"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 5
ENABLE_GPU: "false"
@@ -14,6 +13,6 @@ steps:
CLIENT_OPTIONS: "bench-tps=2=--tx_count 20000 --thread-batch-sleep-ms 250"
TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: "--dedicated"
ADDITIONAL_FLAGS: ""
agents:
- "queue=gce-deploy"

View File

@@ -5,7 +5,6 @@ steps:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "gce"
TESTNET_TAG: "gce-perf-gpu-enabled"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 10
ENABLE_GPU: "true"

View File

@@ -5,7 +5,6 @@ steps:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "gce"
TESTNET_TAG: "gce-perf-gpu-enabled"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 100
ENABLE_GPU: "true"

View File

@@ -5,7 +5,6 @@ steps:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "gce"
TESTNET_TAG: "gce-perf-gpu-enabled"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 25
ENABLE_GPU: "true"

View File

@@ -5,7 +5,6 @@ steps:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "gce"
TESTNET_TAG: "gce-perf-gpu-enabled"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 5
ENABLE_GPU: "true"

View File

@@ -5,7 +5,6 @@ steps:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "gce"
TESTNET_TAG: "gce-perf-gpu-enabled"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 600
NUMBER_OF_VALIDATOR_NODES: 50
ENABLE_GPU: "true"

View File

@@ -5,7 +5,6 @@ steps:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "gce"
TESTNET_TAG: "gce-stability"
RAMP_UP_TIME: 0
TEST_DURATION_SECONDS: 28800
NUMBER_OF_VALIDATOR_NODES: 5
ENABLE_GPU: "false"

View File

@@ -11,7 +11,7 @@ function collect_logs {
execution_step "Collect logs from remote nodes"
rm -rf net/log
net/net.sh logs
for logfile in net/log/* ; do
for logfile in net/log/*; do
(
new_log=net/log/"$TESTNET_TAG"_"$NUMBER_OF_VALIDATOR_NODES"-nodes_"$(basename "$logfile")"
cp "$logfile" "$new_log"
@@ -73,7 +73,7 @@ function wait_for_bootstrap_validator_stake_drop {
function cleanup_testnet {
RC=$?
if [[ $RC != 0 ]] ; then
if [[ $RC != 0 ]]; then
RESULT_DETAILS="
Test failed during step:
${STEP}
@@ -82,8 +82,8 @@ Failure occured when running the following command:
$(eval echo "$@")"
fi
FINISH_UNIX_MSECS="$(($(date +%s%N)/1000000))"
if [[ "$UPLOAD_RESULTS_TO_SLACK" = "true" ]] ; then
TESTNET_FINISH_UNIX_MSECS="$(($(date +%s%N)/1000000))"
if [[ "$UPLOAD_RESULTS_TO_SLACK" = "true" ]]; then
upload_results_to_slack
fi
@@ -118,7 +118,7 @@ function launchTestnet() {
case $CLOUD_PROVIDER in
gce)
if [[ -z $VALIDATOR_NODE_MACHINE_TYPE ]] ; then
if [[ -z $VALIDATOR_NODE_MACHINE_TYPE ]]; then
echo VALIDATOR_NODE_MACHINE_TYPE not defined
exit 1
fi
@@ -169,15 +169,15 @@ function launchTestnet() {
net/init-metrics.sh -e
execution_step "Fetch reusable testnet keypairs"
if [[ ! -d net/keypairs ]] ; then
if [[ ! -d net/keypairs ]]; then
git clone git@github.com:solana-labs/testnet-keypairs.git net/keypairs
# If we have provider-specific keys (CoLo*, GCE*, etc) use them instead of generic val*
if [[ -d net/keypairs/"${CLOUD_PROVIDER}" ]] ; then
if [[ -d net/keypairs/"${CLOUD_PROVIDER}" ]]; then
cp net/keypairs/"${CLOUD_PROVIDER}"/* net/keypairs/
fi
fi
if [[ "$CLOUD_PROVIDER" = "colo" ]] ; then
if [[ "$CLOUD_PROVIDER" = "colo" ]]; then
execution_step "Stopping Colo nodes before we start"
net/net.sh stop
fi
@@ -200,9 +200,30 @@ function launchTestnet() {
execution_step "Waiting for bootstrap validator's stake percentage to fall below $BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD %"
wait_for_bootstrap_validator_stake_drop "$BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD"
execution_step "Wait ${TEST_DURATION_SECONDS} seconds to complete test"
sleep "$TEST_DURATION_SECONDS"
if [[ -n $TEST_DURATION_SECONDS ]]; then
execution_step "Wait ${TEST_DURATION_SECONDS} seconds to complete test"
sleep "$TEST_DURATION_SECONDS"
elif [[ "$APPLY_PARTITIONS" = "true" ]]; then
STATS_START_SECONDS=$SECONDS
execution_step "Wait $PARTITION_INACTIVE_DURATION before beginning to apply partitions"
sleep "$PARTITION_INACTIVE_DURATION"
for (( i=1; i<=PARTITION_ITERATION_COUNT; i++ )); do
execution_step "Partition Iteration $i of $PARTITION_ITERATION_COUNT"
execution_step "Applying netem config $NETEM_CONFIG_FILE for $PARTITION_ACTIVE_DURATION seconds"
net/net.sh netem --config-file "$NETEM_CONFIG_FILE"
sleep "$PARTITION_ACTIVE_DURATION"
execution_step "Resolving partitions for $PARTITION_INACTIVE_DURATION seconds"
net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup
sleep "$PARTITION_INACTIVE_DURATION"
done
STATS_FINISH_SECONDS=$SECONDS
TEST_DURATION_SECONDS=$((STATS_FINISH_SECONDS - STATS_START_SECONDS))
else
# We should never get here
echo Test duration and partition config not defined
fi
execution_step "Collect statistics about run"
declare q_mean_tps='
SELECT ROUND(MEAN("median_sum")) as "mean_tps" FROM (
@@ -239,9 +260,16 @@ function launchTestnet() {
FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
declare q_max_tower_distance_observed='
SELECT MAX("tower_distance") as "max_tower_distance" FROM (
SELECT last("slot") - last("root") as "tower_distance"
FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s
GROUP BY time(1s), host_id)'
curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
--data-urlencode "db=${TESTNET_TAG}" \
--data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation" |
--data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation;$q_max_tower_distance_observed" |
python system-test/testnet-automation-json-parser.py >>"$RESULT_FILE"
execution_step "Writing test results to ${RESULT_FILE}"
@@ -259,26 +287,21 @@ cd "$(dirname "$0")/.."
[[ -n $INFLUX_HOST ]] || INFLUX_HOST=https://metrics.solana.com:8086
[[ -n $BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD ]] || BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD=66
if [[ -z $TEST_DURATION_SECONDS ]] ; then
echo TEST_DURATION_SECONDS not defined
exit 1
fi
if [[ -z $NUMBER_OF_VALIDATOR_NODES ]] ; then
if [[ -z $NUMBER_OF_VALIDATOR_NODES ]]; then
echo NUMBER_OF_VALIDATOR_NODES not defined
exit 1
fi
startGpuMode="off"
if [[ -z $ENABLE_GPU ]] ; then
if [[ -z $ENABLE_GPU ]]; then
ENABLE_GPU=false
fi
if [[ "$ENABLE_GPU" = "true" ]] ; then
if [[ "$ENABLE_GPU" = "true" ]]; then
maybeEnableGpu="--enable-gpu"
startGpuMode="on"
fi
if [[ -z $NUMBER_OF_CLIENT_NODES ]] ; then
if [[ -z $NUMBER_OF_CLIENT_NODES ]]; then
echo NUMBER_OF_CLIENT_NODES not defined
exit 1
fi
@@ -292,23 +315,36 @@ if [[ -z $SOLANA_METRICS_CONFIG ]]; then
fi
echo "SOLANA_METRICS_CONFIG: $SOLANA_METRICS_CONFIG"
if [[ -z $ALLOW_BOOT_FAILURES ]] ; then
if [[ -z $ALLOW_BOOT_FAILURES ]]; then
ALLOW_BOOT_FAILURES=false
fi
if [[ "$ALLOW_BOOT_FAILURES" = "true" ]] ; then
if [[ "$ALLOW_BOOT_FAILURES" = "true" ]]; then
maybeCreateAllowBootFailures="--allow-boot-failures"
maybeStartAllowBootFailures="-F"
fi
if [[ -z $USE_PUBLIC_IP_ADDRESSES ]] ; then
if [[ -z $USE_PUBLIC_IP_ADDRESSES ]]; then
USE_PUBLIC_IP_ADDRESSES=false
fi
if [[ "$USE_PUBLIC_IP_ADDRESSES" = "true" ]] ; then
if [[ "$USE_PUBLIC_IP_ADDRESSES" = "true" ]]; then
maybePublicIpAddresses="-P"
fi
: "${CLIENT_DELAY_START:=0}"
if [[ -z $APPLY_PARTITIONS ]]; then
APPLY_PARTITIONS=false
fi
if [[ "$APPLY_PARTITIONS" = "true" ]]; then
if [[ -n $TEST_DURATION_SECONDS ]]; then
echo Cannot accept TEST_DURATION_SECONDS and a partition looping config
exit 1
fi
elif [[ -z $TEST_DURATION_SECONDS ]]; then
echo TEST_DURATION_SECONDS not defined
exit 1
fi
if [[ -z $CHANNEL ]]; then
execution_step "Downloading tar from build artifacts"
buildkite-agent artifact download "solana-release*.tar.bz2" .
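The APPLY_PARTITIONS check above makes the two modes mutually exclusive: a run is driven either by a fixed TEST_DURATION_SECONDS or by the partition variables, never both. A hedged sketch of each invocation follows; only the variables involved in that check are shown, and the other settings the script requires (CLOUD_PROVIDER, node counts, client options, and so on, as in the pipelines above) are omitted:

```bash
# Partition-driven run: leave TEST_DURATION_SECONDS unset.
APPLY_PARTITIONS=true \
NETEM_CONFIG_FILE=system-test/netem-configs/partial-loss-three-partitions \
PARTITION_ACTIVE_DURATION=30 \
PARTITION_INACTIVE_DURATION=30 \
PARTITION_ITERATION_COUNT=5 \
  system-test/testnet-automation.sh

# Fixed-duration run: leave the partition variables unset.
TEST_DURATION_SECONDS=600 system-test/testnet-automation.sh
```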
@@ -337,15 +373,21 @@ TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \
TEST_DURATION_SECONDS \
USE_PUBLIC_IP_ADDRESSES \
ALLOW_BOOT_FAILURES \
ADDITIONAL_FLAGS)
ADDITIONAL_FLAGS \
APPLY_PARTITIONS \
NETEM_CONFIG_FILE \
PARTITION_ACTIVE_DURATION \
PARTITION_INACTIVE_DURATION \
PARTITION_ITERATION_COUNT \
)
TEST_CONFIGURATION=
for i in "${TEST_PARAMS_TO_DISPLAY[@]}" ; do
if [[ -n ${!i} ]] ; then
for i in "${TEST_PARAMS_TO_DISPLAY[@]}"; do
if [[ -n ${!i} ]]; then
TEST_CONFIGURATION+="${i} = ${!i} | "
fi
done
START_UNIX_MSECS="$(($(date +%s%N)/1000000))"
TESTNET_START_UNIX_MSECS="$(($(date +%s%N)/1000000))"
launchTestnet

View File

@@ -19,7 +19,7 @@ upload_results_to_slack() {
BUILDKITE_BUILD_URL="https://buildkite.com/solana-labs/"
fi
GRAFANA_URL="https://metrics.solana.com:3000/d/testnet-${CHANNEL:-edge}/testnet-monitor-${CHANNEL:-edge}?var-testnet=${TESTNET_TAG:-testnet-automation}&from=${START_UNIX_MSECS:-0}&to=${FINISH_UNIX_MSECS:-0}"
GRAFANA_URL="https://metrics.solana.com:3000/d/testnet-${CHANNEL:-edge}/testnet-monitor-${CHANNEL:-edge}?var-testnet=${TESTNET_TAG:-testnet-automation}&from=${TESTNET_START_UNIX_MSECS:-0}&to=${TESTNET_FINISH_UNIX_MSECS:-0}"
[[ -n $RESULT_DETAILS ]] || RESULT_DETAILS="Undefined"
[[ -n $TEST_CONFIGURATION ]] || TEST_CONFIGURATION="Undefined"