From c56fb0f014786e5280c7dac17a43db9b8d071635 Mon Sep 17 00:00:00 2001
From: Ashwin Sekar
Date: Mon, 8 Nov 2021 06:51:57 -0800
Subject: [PATCH] Add system test to measure recovery after partition (#20902)

* Add system test to measure recovery after partition

* shellcheck

* increase partition length until failure

* adjust parameters and output

* different stopping condition
---
 system-test/automation_utils.sh               | 17 ++++
 .../gce-partition-recovery.yml                | 22 +++++
 .../measure-partition-recovery.sh             | 90 +++++++++++++++++++
 system-test/testnet-automation-json-parser.py | 10 ++-
 4 files changed, 136 insertions(+), 3 deletions(-)
 create mode 100755 system-test/partition-testcases/gce-partition-recovery.yml
 create mode 100755 system-test/partition-testcases/measure-partition-recovery.sh

diff --git a/system-test/automation_utils.sh b/system-test/automation_utils.sh
index 332b084344..f4a8d40561 100755
--- a/system-test/automation_utils.sh
+++ b/system-test/automation_utils.sh
@@ -110,6 +110,23 @@ function get_current_stake {
     '$HOME/.cargo/bin/solana --url http://127.0.0.1:8899 validators --output=json | grep -o "totalCurrentStake\": [0-9]*" | cut -d: -f2'
 }
 
+# Sets the global mean_confirmation_ms to the rounded mean of "duration_ms" from the
+# validator-confirmation measurement over the last $1 seconds (empty when no series is returned).
+function get_validator_confirmation_time {
+  SINCE=$1
+  declare q_mean_confirmation='
+    SELECT ROUND(MEAN("duration_ms")) as "mean_confirmation_ms"
+    FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
+    WHERE time > now() - '"$SINCE"'s'
+
+  mean_confirmation_ms=$( \
+    curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
+      --data-urlencode "db=${TESTNET_TAG}" \
+      --data-urlencode "q=$q_mean_confirmation" |
+    python3 "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py --empty_error |
+    cut -d' ' -f2)
+}
+
 function collect_performance_statistics {
   execution_step "Collect performance statistics about run"
   declare q_mean_tps='
diff --git a/system-test/partition-testcases/gce-partition-recovery.yml b/system-test/partition-testcases/gce-partition-recovery.yml
new file mode 100755
index 0000000000..2645fc88fe
--- /dev/null
+++ b/system-test/partition-testcases/gce-partition-recovery.yml
@@ -0,0 +1,22 @@
+steps:
+  - command: "system-test/testnet-automation.sh"
+    label: "Partition recovery on GCE"
+    env:
+      UPLOAD_RESULTS_TO_SLACK: "true"
+      CLOUD_PROVIDER: "gce"
+      ENABLE_GPU: "false"
+      NUMBER_OF_VALIDATOR_NODES: 9
+      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
+      NUMBER_OF_CLIENT_NODES: 1
+      ADDITIONAL_FLAGS: "--dedicated"
+      SKIP_PERF_RESULTS: "true"
+      EXTRA_PRIMORDIAL_STAKES: 4
+      TEST_TYPE: "script"
+      WARMUP_SLOTS_BEFORE_TEST: 400
+      PRE_PARTITION_DURATION: 120
+      PARTITION_DURATION: 360
+      PARTITION_INCREMENT: 60
+      NETEM_CONFIG_FILE: "system-test/netem-configs/complete-loss-two-partitions"
+      CUSTOM_SCRIPT: "system-test/partition-testcases/measure-partition-recovery.sh"
+    agents:
+      - "queue=gce-deploy"
diff --git a/system-test/partition-testcases/measure-partition-recovery.sh b/system-test/partition-testcases/measure-partition-recovery.sh
new file mode 100755
index 0000000000..3c1df03bde
--- /dev/null
+++ b/system-test/partition-testcases/measure-partition-recovery.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+#
+# Measures how validator confirmation time recovers after a network partition.
+# A partition is applied with net.sh netem for PARTITION_DURATION seconds and,
+# after each successful recovery, lengthened by PARTITION_INCREMENT until the
+# cluster no longer recovers. Results are appended to the file given as $1.
+
+set -ex
+
+# shellcheck disable=SC1090
+# shellcheck disable=SC1091
+source "$(dirname "$0")"/../automation_utils.sh
+
+RESULT_FILE="$1"
+
+[[ -n $TESTNET_TAG ]] || TESTNET_TAG=${CLOUD_PROVIDER}-testnet-automation
+
+if [[ -z $NETEM_CONFIG_FILE ]]; then
+  echo "Error: For this test NETEM_CONFIG_FILE must be specified"
+  exit 1
+fi
+
+if [[ -z $PRE_PARTITION_DURATION ]]; then
+  PRE_PARTITION_DURATION=60
+fi
+
+if [[ -z $PARTITION_DURATION ]]; then
+  PARTITION_DURATION=300
+fi
+
+if [[ -z $PARTITION_INCREMENT ]]; then
+  PARTITION_INCREMENT=60
+fi
+
+num_online_nodes=$(( NUMBER_OF_VALIDATOR_NODES + 1 ))
+if [[ -n "$NUMBER_OF_OFFLINE_NODES" ]]; then
+  num_online_nodes=$(( num_online_nodes - NUMBER_OF_OFFLINE_NODES ))
+fi
+
+execution_step "Measuring validator confirmation time for $PRE_PARTITION_DURATION seconds"
+sleep "$PRE_PARTITION_DURATION"
+get_validator_confirmation_time "$PRE_PARTITION_DURATION"
+# shellcheck disable=SC2154
+execution_step "Pre partition validator confirmation time is $mean_confirmation_ms ms"
+echo "Pre partition validator confirmation time: $mean_confirmation_ms ms" >> "$RESULT_FILE"
+target=$mean_confirmation_ms
+
+while true; do
+  execution_step "Applying partition config $NETEM_CONFIG_FILE for $PARTITION_DURATION seconds"
+  echo "Partitioning for $PARTITION_DURATION seconds" >> "$RESULT_FILE"
+  "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" -n "$num_online_nodes"
+  sleep "$PARTITION_DURATION"
+
+  execution_step "Resolving partition"
+  "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup -n "$num_online_nodes"
+
+  get_validator_confirmation_time 10
+  SECONDS=0
+
+  # This happens when we haven't confirmed anything recently so the query returns an empty string
+  while [[ -z $mean_confirmation_ms ]]; do
+    sleep 5
+    get_validator_confirmation_time 10
+    if [[ $SECONDS -gt $PARTITION_DURATION ]]; then
+      echo " No confirmations seen after $SECONDS seconds" >> "$RESULT_FILE"
+      exit 0
+    fi
+  done
+  echo " Validator confirmation is $mean_confirmation_ms ms $SECONDS seconds after resolving the partition" >> "$RESULT_FILE"
+
+  last=""
+  while [[ -z $mean_confirmation_ms || $mean_confirmation_ms -gt $target ]]; do
+    sleep 5
+
+    # Give up when the newest sample is more than 20% above the previous one,
+    # or when more than PARTITION_DURATION seconds have passed since SECONDS
+    # was reset. The threshold is computed with integer arithmetic because
+    # [[ -gt ]] rejects decimal operands such as "120.0".
+    if [[ -n $mean_confirmation_ms && -n $last && $mean_confirmation_ms -gt $(( last * 12 / 10 )) || $SECONDS -gt $PARTITION_DURATION ]]; then
+      echo " Unable to make progress after $SECONDS seconds. Last confirmation time was $mean_confirmation_ms ms" >> "$RESULT_FILE"
+      exit 0
+    fi
+    last=$mean_confirmation_ms
+    get_validator_confirmation_time 10
+  done
+
+  echo " Recovered in $SECONDS seconds: validator confirmation to fall to $mean_confirmation_ms ms" >> "$RESULT_FILE"
+
+  PARTITION_DURATION=$(( PARTITION_DURATION + PARTITION_INCREMENT ))
+done
diff --git a/system-test/testnet-automation-json-parser.py b/system-test/testnet-automation-json-parser.py
index 10f82b13e6..37959b0258 100755
--- a/system-test/testnet-automation-json-parser.py
+++ b/system-test/testnet-automation-json-parser.py
@@ -1,5 +1,9 @@
 #!/usr/bin/env python3
-import sys, json
+import sys, json, argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--empty_error", action="store_true", help="If present, do not print error message")
+args = parser.parse_args()
 
 data=json.load(sys.stdin)
 
@@ -7,7 +11,7 @@ if 'results' in data:
     for result in data['results']:
         if 'series' in result:
             print(result['series'][0]['columns'][1] + ': ' + str(result['series'][0]['values'][0][1]))
-        else:
+        elif not args.empty_error:
             print("An expected result from CURL request is missing")
-else:
+elif not args.empty_error:
     print("No results returned from CURL request")