solana/system-test/testnet-automation.sh

381 lines
12 KiB
Bash
Executable File

#!/usr/bin/env bash
set -e
# shellcheck disable=SC1090
# shellcheck disable=SC1091
source "$(dirname "$0")"/automation_utils.sh
function cleanup_testnet {
RC=$?
if [[ $RC != 0 ]]; then
RESULT_DETAILS="
Test failed during step:
${STEP}
Failure occured when running the following command:
$*"
fi
# shellcheck disable=SC2034
TESTNET_FINISH_UNIX_MSECS="$(($(date +%s%N)/1000000))"
if [[ "$UPLOAD_RESULTS_TO_SLACK" = "true" ]]; then
upload_results_to_slack
fi
if [[ "$UPLOAD_RESULTS_TO_DISCORD" = "true" ]]; then
upload_results_to_discord
fi
(
execution_step "Collecting Logfiles from Nodes"
collect_logs
) || echo "Error from collecting logs"
(
execution_step "Stop Network Software"
"${REPO_ROOT}"/net/net.sh stop
) || echo "Error from stopping nodes"
(
analyze_packet_loss
) || echo "Error from packet loss analysis"
execution_step "Deleting Testnet"
if test -f "${REPO_ROOT}"/net/"${CLOUD_PROVIDER}".sh; then
"${REPO_ROOT}"/net/"${CLOUD_PROVIDER}".sh delete -p "${TESTNET_TAG}"
fi
}
trap 'cleanup_testnet $BASH_COMMAND' EXIT
function launch_testnet() {
set -x
# shellcheck disable=SC2068
execution_step "Create ${NUMBER_OF_VALIDATOR_NODES} ${CLOUD_PROVIDER} nodes"
case $CLOUD_PROVIDER in
gce)
if [[ -z $VALIDATOR_NODE_MACHINE_TYPE ]]; then
echo VALIDATOR_NODE_MACHINE_TYPE not defined
exit 1
fi
# shellcheck disable=SC2068
# shellcheck disable=SC2086
"${REPO_ROOT}"/net/gce.sh create \
-d pd-ssd \
-n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \
$maybeCustomMachineType "$VALIDATOR_NODE_MACHINE_TYPE" $maybeEnableGpu \
-p "$TESTNET_TAG" $maybeCreateAllowBootFailures $maybePublicIpAddresses \
${TESTNET_CLOUD_ZONES[@]/#/"-z "} \
--self-destruct-hours 0 \
${ADDITIONAL_FLAGS[@]/#/" "}
;;
ec2)
# shellcheck disable=SC2068
# shellcheck disable=SC2086
"${REPO_ROOT}"/net/ec2.sh create \
-n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \
$maybeCustomMachineType "$VALIDATOR_NODE_MACHINE_TYPE" $maybeEnableGpu \
-p "$TESTNET_TAG" $maybeCreateAllowBootFailures $maybePublicIpAddresses \
${TESTNET_CLOUD_ZONES[@]/#/"-z "} \
${ADDITIONAL_FLAGS[@]/#/" "}
;;
azure)
# shellcheck disable=SC2068
# shellcheck disable=SC2086
"${REPO_ROOT}"/net/azure.sh create \
-n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \
$maybeCustomMachineType "$VALIDATOR_NODE_MACHINE_TYPE" $maybeEnableGpu \
-p "$TESTNET_TAG" $maybeCreateAllowBootFailures $maybePublicIpAddresses \
${TESTNET_CLOUD_ZONES[@]/#/"-z "} \
${ADDITIONAL_FLAGS[@]/#/" "}
;;
colo)
"${REPO_ROOT}"/net/colo.sh delete --reclaim-preemptible-reservations
# shellcheck disable=SC2068
# shellcheck disable=SC2086
"${REPO_ROOT}"/net/colo.sh create \
-n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" $maybeEnableGpu \
-p "$TESTNET_TAG" $maybePublicIpAddresses --dedicated \
${ADDITIONAL_FLAGS[@]/#/" "}
;;
bare)
;;
*)
echo "Error: Unsupported cloud provider: $CLOUD_PROVIDER"
;;
esac
execution_step "Configure database"
"${REPO_ROOT}"/net/init-metrics.sh -e
execution_step "Fetch reusable testnet keypairs"
if [[ ! -d "${REPO_ROOT}"/net/keypairs ]]; then
# git clone https://github.com/solana-labs/testnet-keypairs.git "${REPO_ROOT}"/net/keypairs
git clone git@github.com:solana-labs/testnet-keypairs.git "${REPO_ROOT}"/net/keypairs
# If we have provider-specific keys (CoLo*, GCE*, etc) use them instead of generic val*
if [[ -d "${REPO_ROOT}"/net/keypairs/"${CLOUD_PROVIDER}" ]]; then
cp "${REPO_ROOT}"/net/keypairs/"${CLOUD_PROVIDER}"/* "${REPO_ROOT}"/net/keypairs/
fi
fi
if [[ "$CLOUD_PROVIDER" = "colo" ]]; then
execution_step "Stopping Colo nodes before we start"
"${REPO_ROOT}"/net/net.sh stop
fi
execution_step "Starting bootstrap node and ${NUMBER_OF_VALIDATOR_NODES} validator nodes"
declare -g version_args
get_net_launch_software_version_launch_args "$CHANNEL" "solana-release" version_args
declare maybeWarpSlot
if [[ -n "$WARP_SLOT" ]]; then
maybeWarpSlot="--warp-slot $WARP_SLOT"
fi
declare maybeAsyncNodeInit
if [[ "$ASYNC_NODE_INIT" = "true" ]]; then
maybeAsyncNodeInit="--async-node-init"
fi
declare maybeExtraPrimordialStakes
if [[ -n "$EXTRA_PRIMORDIAL_STAKES" ]]; then
maybeExtraPrimordialStakes="--extra-primordial-stakes $EXTRA_PRIMORDIAL_STAKES"
fi
# shellcheck disable=SC2068
# shellcheck disable=SC2086
"${REPO_ROOT}"/net/net.sh start $version_args \
-c idle=$NUMBER_OF_CLIENT_NODES $maybeStartAllowBootFailures \
--gpu-mode $startGpuMode $maybeWarpSlot $maybeAsyncNodeInit \
$maybeExtraPrimordialStakes
if [[ -n "$WAIT_FOR_EQUAL_STAKE" ]]; then
wait_for_equal_stake
else
execution_step "Waiting for bootstrap validator's stake to fall below ${BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD}%"
wait_for_max_stake "$BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD"
fi
echo "NUMBER_OF_CLIENT_NODES is : &NUMBER_OF_CLIENT_NODES"
if [[ $NUMBER_OF_CLIENT_NODES -gt 0 ]]; then
execution_step "Starting ${NUMBER_OF_CLIENT_NODES} client nodes"
"${REPO_ROOT}"/net/net.sh startclients "$maybeClientOptions" "$CLIENT_OPTIONS"
# It takes roughly 3 minutes from the time the client nodes return from starting to when they have finished loading the
# accounts file and actually start sending transactions
sleep 180
fi
if [[ -n "$WARMUP_SLOTS_BEFORE_TEST" ]]; then
# Allow the network to run for a bit before beginning the test
while [[ "$WARMUP_SLOTS_BEFORE_TEST" -gt $(get_slot) ]]; do
sleep 5
done
fi
# Stop the specified number of nodes
num_online_nodes=$(( NUMBER_OF_VALIDATOR_NODES + 1 ))
if [[ -n "$NUMBER_OF_OFFLINE_NODES" ]]; then
execution_step "Stopping $NUMBER_OF_OFFLINE_NODES nodes"
for (( i=NUMBER_OF_VALIDATOR_NODES; i>$(( NUMBER_OF_VALIDATOR_NODES - NUMBER_OF_OFFLINE_NODES )); i-- )); do
# shellcheck disable=SC2154
"${REPO_ROOT}"/net/net.sh stopnode -i "${validatorIpList[$i]}"
done
num_online_nodes=$(( num_online_nodes - NUMBER_OF_OFFLINE_NODES ))
fi
SECONDS=0
START_SLOT=$(get_slot)
SLOT_COUNT_START_SECONDS=$SECONDS
execution_step "Marking beginning of slot rate test - Slot: $START_SLOT, Seconds: $SLOT_COUNT_START_SECONDS"
case $TEST_TYPE in
fixed_duration)
execution_step "Wait ${TEST_DURATION_SECONDS} seconds to complete test"
sleep "$TEST_DURATION_SECONDS"
;;
partition)
STATS_START_SECONDS=$SECONDS
execution_step "Wait $PARTITION_INACTIVE_DURATION before beginning to apply partitions"
sleep "$PARTITION_INACTIVE_DURATION"
for (( i=1; i<=PARTITION_ITERATION_COUNT; i++ )); do
execution_step "Partition Iteration $i of $PARTITION_ITERATION_COUNT"
execution_step "Applying netem config $NETEM_CONFIG_FILE for $PARTITION_ACTIVE_DURATION seconds"
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" -n $num_online_nodes
sleep "$PARTITION_ACTIVE_DURATION"
execution_step "Resolving partitions for $PARTITION_INACTIVE_DURATION seconds"
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup -n $num_online_nodes
sleep "$PARTITION_INACTIVE_DURATION"
done
STATS_FINISH_SECONDS=$SECONDS
TEST_DURATION_SECONDS=$((STATS_FINISH_SECONDS - STATS_START_SECONDS))
;;
script)
execution_step "Running custom script: ${REPO_ROOT}/${CUSTOM_SCRIPT}"
"$REPO_ROOT"/"$CUSTOM_SCRIPT" "$RESULT_FILE"
;;
*)
echo "Error: Unsupported test type: $TEST_TYPE"
;;
esac
END_SLOT=$(get_slot)
SLOT_COUNT_END_SECONDS=$SECONDS
execution_step "Marking end of slot rate test - Slot: $END_SLOT, Seconds: $SLOT_COUNT_END_SECONDS"
SLOTS_PER_SECOND="$(bc <<< "scale=3; ($END_SLOT - $START_SLOT)/($SLOT_COUNT_END_SECONDS - $SLOT_COUNT_START_SECONDS)")"
execution_step "Average slot rate: $SLOTS_PER_SECOND slots/second over $((SLOT_COUNT_END_SECONDS - SLOT_COUNT_START_SECONDS)) seconds"
if [[ "$SKIP_PERF_RESULTS" = "false" ]]; then
declare -g dropped_vote_hash_count
collect_performance_statistics
echo "slots_per_second: $SLOTS_PER_SECOND" >>"$RESULT_FILE"
if [[ $dropped_vote_hash_count -gt 0 ]]; then
execution_step "Checking for dropped vote hash count"
exit 1
fi
fi
RESULT_DETAILS=$(<"$RESULT_FILE")
upload-ci-artifact "$RESULT_FILE"
}
# shellcheck disable=SC2034
RESULT_DETAILS=
STEP=
execution_step "Initialize Environment"
[[ -n $TESTNET_TAG ]] || TESTNET_TAG=${CLOUD_PROVIDER}-testnet-automation
[[ -n $INFLUX_HOST ]] || INFLUX_HOST=https://internal-metrics.solana.com:8086
[[ -n $BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD ]] || BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD=66
[[ -n $SKIP_PERF_RESULTS ]] || SKIP_PERF_RESULTS=false
if [[ -z $NUMBER_OF_VALIDATOR_NODES ]]; then
echo NUMBER_OF_VALIDATOR_NODES not defined
exit 1
fi
startGpuMode="off"
if [[ -z $ENABLE_GPU ]]; then
ENABLE_GPU=false
fi
if [[ "$ENABLE_GPU" = "true" ]]; then
maybeEnableGpu="--enable-gpu"
startGpuMode="on"
fi
if [[ -z $NUMBER_OF_CLIENT_NODES ]]; then
echo NUMBER_OF_CLIENT_NODES not defined
exit 1
fi
if [[ -z $SOLANA_METRICS_CONFIG ]]; then
if [[ -z $SOLANA_METRICS_PARTIAL_CONFIG ]]; then
echo SOLANA_METRICS_PARTIAL_CONFIG not defined
exit 1
fi
export SOLANA_METRICS_CONFIG="db=$TESTNET_TAG,host=$INFLUX_HOST,$SOLANA_METRICS_PARTIAL_CONFIG"
fi
echo "SOLANA_METRICS_CONFIG: $SOLANA_METRICS_CONFIG"
if [[ -z $ALLOW_BOOT_FAILURES ]]; then
ALLOW_BOOT_FAILURES=false
fi
if [[ "$ALLOW_BOOT_FAILURES" = "true" ]]; then
maybeCreateAllowBootFailures="--allow-boot-failures"
maybeStartAllowBootFailures="-F"
fi
if [[ -z $USE_PUBLIC_IP_ADDRESSES ]]; then
USE_PUBLIC_IP_ADDRESSES=false
fi
if [[ "$USE_PUBLIC_IP_ADDRESSES" = "true" ]]; then
maybePublicIpAddresses="-P"
fi
execution_step "Checking for required parameters"
testTypeRequiredParameters=
case $TEST_TYPE in
fixed_duration)
testTypeRequiredParameters=(
TEST_DURATION_SECONDS \
)
;;
partition)
testTypeRequiredParameters=(
NETEM_CONFIG_FILE \
PARTITION_ACTIVE_DURATION \
PARTITION_INACTIVE_DURATION \
PARTITION_ITERATION_COUNT \
)
;;
script)
testTypeRequiredParameters=(
CUSTOM_SCRIPT \
)
;;
*)
echo "Error: Unsupported test type: $TEST_TYPE"
;;
esac
missingParameters=
for i in "${testTypeRequiredParameters[@]}"; do
if [[ -z ${!i} ]]; then
missingParameters+="${i}, "
fi
done
if [[ -n $missingParameters ]]; then
echo "Error: For test type $TEST_TYPE, the following required parameters are missing: ${missingParameters[*]}"
exit 1
fi
maybeClientOptions=${CLIENT_OPTIONS:+"-c"}
maybeCustomMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"--custom-machine-type"}
IFS=, read -r -a TESTNET_CLOUD_ZONES <<<"${TESTNET_ZONES}"
RESULT_FILE="$TESTNET_TAG"_SUMMARY_STATS_"$NUMBER_OF_VALIDATOR_NODES".log
rm -f "$RESULT_FILE"
TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \
NUMBER_OF_VALIDATOR_NODES \
ENABLE_GPU \
VALIDATOR_NODE_MACHINE_TYPE \
NUMBER_OF_CLIENT_NODES \
CLIENT_OPTIONS \
CLIENT_DELAY_START \
TESTNET_ZONES \
TEST_DURATION_SECONDS \
USE_PUBLIC_IP_ADDRESSES \
ALLOW_BOOT_FAILURES \
ADDITIONAL_FLAGS \
APPLY_PARTITIONS \
NETEM_CONFIG_FILE \
WAIT_FOR_EQUAL_STAKE \
WARMUP_SLOTS_BEFORE_TEST \
NUMBER_OF_OFFLINE_NODES \
PARTITION_ACTIVE_DURATION \
PARTITION_INACTIVE_DURATION \
PARTITION_ITERATION_COUNT \
TEST_TYPE \
CUSTOM_SCRIPT \
)
TEST_CONFIGURATION=
for i in "${TEST_PARAMS_TO_DISPLAY[@]}"; do
if [[ -n ${!i} ]]; then
TEST_CONFIGURATION+="${i} = ${!i} | "
fi
done
# shellcheck disable=SC2034
TESTNET_START_UNIX_MSECS="$(($(date +%s%N)/1000000))"
launch_testnet