Add system test to measure recovery after partition (#20902)
* Add system test to measure recovery after partition * shellcheck * increase partition length until failure * adjust parameters and output * different stopping condition
This commit is contained in:
parent
2df4e7eea3
commit
c56fb0f014
|
@ -110,6 +110,21 @@ function get_current_stake {
|
|||
'$HOME/.cargo/bin/solana --url http://127.0.0.1:8899 validators --output=json | grep -o "totalCurrentStake\": [0-9]*" | cut -d: -f2'
|
||||
}
|
||||
|
||||
# Query InfluxDB for the mean validator confirmation time over a recent window.
#
# Globals:
#   INFLUX_HOST, TESTNET_TAG, REPO_ROOT (read)
#   mean_confirmation_ms (written) - rounded mean of "duration_ms"; left empty
#     when the JSON parser prints nothing (it is run with --empty_error, so a
#     missing result produces no output instead of an error message)
# Arguments:
#   $1 - length of the window in seconds, counted back from now()
function get_validator_confirmation_time {
  # Function-scoped so the argument does not leak into (or clobber) a
  # caller's global of the same name.
  declare since=$1
  declare q_mean_confirmation='
    SELECT ROUND(MEAN("duration_ms")) as "mean_confirmation_ms"
      FROM "'"$TESTNET_TAG"'"."autogen"."validator-confirmation"
      WHERE time > now() - '"$since"'s'

  # Deliberately global: callers read the result from this variable.
  # The parser prints "<column>: <value>"; cut keeps only the value.
  mean_confirmation_ms=$( \
    curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
      --data-urlencode "db=${TESTNET_TAG}" \
      --data-urlencode "q=$q_mean_confirmation" |
    python3 "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py --empty_error |
    cut -d' ' -f2)
}
|
||||
|
||||
function collect_performance_statistics {
|
||||
execution_step "Collect performance statistics about run"
|
||||
declare q_mean_tps='
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
# CI pipeline with a single step, labelled "Partition recovery on GCE", that
# runs system-test/testnet-automation.sh with TEST_TYPE "script" and the
# partition-recovery test case as CUSTOM_SCRIPT.
steps:
|
||||
- command: "system-test/testnet-automation.sh"
|
||||
label: "Partition recovery on GCE"
|
||||
# Everything under env is exported to the command above.
env:
|
||||
UPLOAD_RESULTS_TO_SLACK: "true"
|
||||
CLOUD_PROVIDER: "gce"
|
||||
ENABLE_GPU: "false"
|
||||
NUMBER_OF_VALIDATOR_NODES: 9
|
||||
VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
|
||||
NUMBER_OF_CLIENT_NODES: 1
|
||||
ADDITIONAL_FLAGS: "--dedicated"
|
||||
SKIP_PERF_RESULTS: "true"
|
||||
EXTRA_PRIMORDIAL_STAKES: 4
|
||||
TEST_TYPE: "script"
|
||||
WARMUP_SLOTS_BEFORE_TEST: 400
|
||||
# Timing knobs, in seconds: baseline measurement window, length of the first
# partition, and how much longer each subsequent partition lasts.
# NOTE(review): presumably consumed by CUSTOM_SCRIPT, which falls back to
# 60 / 300 / 60 when these are unset — confirm.
PRE_PARTITION_DURATION: 120
|
||||
PARTITION_DURATION: 360
|
||||
PARTITION_INCREMENT: 60
|
||||
# NOTE(review): presumably the netem config the test applies to split the
# cluster; the recovery script refuses to run without it — confirm.
NETEM_CONFIG_FILE: "system-test/netem-configs/complete-loss-two-partitions"
|
||||
CUSTOM_SCRIPT: "system-test/partition-testcases/measure-partition-recovery.sh"
|
||||
agents:
|
||||
- "queue=gce-deploy"
|
|
@ -0,0 +1,81 @@
|
|||
#!/usr/bin/env bash
#
# Partition-recovery system test.
#
# Arguments:
#   $1 - file that result lines are appended to
# Environment:
#   NETEM_CONFIG_FILE        required; netem configuration used for the partition
#   TESTNET_TAG              defaults to "${CLOUD_PROVIDER}-testnet-automation"
#   PRE_PARTITION_DURATION   seconds of baseline measurement (default 60)
#   PARTITION_DURATION       seconds the first partition lasts (default 300)
#   PARTITION_INCREMENT      seconds added to each later partition (default 60)
#   NUMBER_OF_VALIDATOR_NODES, NUMBER_OF_OFFLINE_NODES
#                            used to derive the number of online nodes

set -ex

# shellcheck disable=SC1090,SC1091
source "$(dirname "$0")"/../automation_utils.sh

RESULT_FILE="$1"

# Fill in the tag only when it is unset or empty.
: "${TESTNET_TAG:=${CLOUD_PROVIDER}-testnet-automation}"

[[ -n $NETEM_CONFIG_FILE ]] || {
  echo "Error: For this test NETEM_CONFIG_FILE must be specified"
  exit 1
}

# Defaults apply when the variable is unset or empty.
: "${PRE_PARTITION_DURATION:=60}"
: "${PARTITION_DURATION:=300}"
: "${PARTITION_INCREMENT:=60}"

# One node on top of the validators, minus any offline nodes; an unset or
# empty NUMBER_OF_OFFLINE_NODES evaluates to 0 in shell arithmetic.
num_online_nodes=$(( NUMBER_OF_VALIDATOR_NODES + 1 - NUMBER_OF_OFFLINE_NODES ))

# Let the cluster run undisturbed, then take the mean confirmation time over
# exactly that window as the recovery target.
execution_step "Measuring validator confirmation time for $PRE_PARTITION_DURATION seconds"
sleep "$PRE_PARTITION_DURATION"
get_validator_confirmation_time "$PRE_PARTITION_DURATION"

# shellcheck disable=SC2154
target=$mean_confirmation_ms
execution_step "Pre partition validator confirmation time is $mean_confirmation_ms ms"
echo "Pre partition validator confirmation time: $mean_confirmation_ms ms" >> "$RESULT_FILE"
|
||||
|
||||
# Partition the cluster for longer and longer periods until it stops
# recovering. Each round applies the netem config, holds the partition for
# PARTITION_DURATION seconds, removes it, and then polls the 10-second mean
# confirmation time until it is back at or below the pre-partition target.
# The only way out of the loop is one of the `exit 0` branches below, taken
# when a round fails to recover; every outcome is appended to RESULT_FILE.
while true; do
  execution_step "Applying partition config $NETEM_CONFIG_FILE for $PARTITION_DURATION seconds"
  echo "Partitioning for $PARTITION_DURATION seconds" >> "$RESULT_FILE"
  "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" -n "$num_online_nodes"
  sleep "$PARTITION_DURATION"

  execution_step "Resolving partition"
  "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup -n "$num_online_nodes"

  get_validator_confirmation_time 10
  # SECONDS is bash's built-in timer; from here on it counts the time elapsed
  # since the partition was resolved.
  SECONDS=0

  # This happens when we haven't confirmed anything recently so the query returns an empty string
  while [[ -z $mean_confirmation_ms ]]; do
    sleep 5
    get_validator_confirmation_time 10
    if [[ $SECONDS -gt $PARTITION_DURATION ]]; then
      echo " No confirmations seen after $SECONDS seconds" >> "$RESULT_FILE"
      exit 0
    fi
  done
  echo " Validator confirmation is $mean_confirmation_ms ms $SECONDS seconds after resolving the partition" >> "$RESULT_FILE"

  last=""
  while [[ -z $mean_confirmation_ms || $mean_confirmation_ms -gt $target ]]; do
    sleep 5

    # Give up when the latest sample is more than 20% worse than the previous
    # one, or when recovery has taken longer than the partition itself.
    # The 20% test is done in integer arithmetic (cur * 10 > last * 12):
    # bash comparisons only accept integers, so comparing against the decimal
    # output of `bc` would raise an arithmetic error and never be true.
    if { [[ -n $mean_confirmation_ms && -n $last ]] &&
         (( mean_confirmation_ms * 10 > last * 12 )); } ||
       [[ $SECONDS -gt $PARTITION_DURATION ]]; then
      echo " Unable to make progress after $SECONDS seconds. Last confirmation time was $mean_confirmation_ms ms" >> "$RESULT_FILE"
      exit 0
    fi
    last=$mean_confirmation_ms
    get_validator_confirmation_time 10
  done

  echo " Recovered in $SECONDS seconds: validator confirmation to fall to $mean_confirmation_ms ms" >> "$RESULT_FILE"

  # Recovered: make the next partition longer.
  PARTITION_DURATION=$(( PARTITION_DURATION + PARTITION_INCREMENT ))
done
|
|
@ -1,5 +1,9 @@
|
|||
#!/usr/bin/env python3
|
||||
import sys, json
|
||||
import sys, json, argparse
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--empty_error", action="store_true", help="If present, do not print error message")
|
||||
args = parser.parse_args()
|
||||
|
||||
data=json.load(sys.stdin)
|
||||
|
||||
|
@ -7,7 +11,7 @@ if 'results' in data:
|
|||
for result in data['results']:
|
||||
if 'series' in result:
|
||||
print(result['series'][0]['columns'][1] + ': ' + str(result['series'][0]['values'][0][1]))
|
||||
else:
|
||||
elif not args.empty_error:
|
||||
print("An expected result from CURL request is missing")
|
||||
else:
|
||||
elif not args.empty_error:
|
||||
print("No results returned from CURL request")
|
||||
|
|
Loading…
Reference in New Issue