381 lines
12 KiB
Bash
Executable File
381 lines
12 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
set -e
|
|
|
|
# shellcheck disable=SC1090
|
|
# shellcheck disable=SC1091
|
|
source "$(dirname "$0")"/automation_utils.sh
|
|
|
|
function cleanup_testnet {
|
|
RC=$?
|
|
if [[ $RC != 0 ]]; then
|
|
RESULT_DETAILS="
|
|
Test failed during step:
|
|
${STEP}
|
|
|
|
Failure occured when running the following command:
|
|
$*"
|
|
fi
|
|
|
|
# shellcheck disable=SC2034
|
|
TESTNET_FINISH_UNIX_MSECS="$(($(date +%s%N)/1000000))"
|
|
if [[ "$UPLOAD_RESULTS_TO_SLACK" = "true" ]]; then
|
|
upload_results_to_slack
|
|
fi
|
|
|
|
if [[ "$UPLOAD_RESULTS_TO_DISCORD" = "true" ]]; then
|
|
upload_results_to_discord
|
|
fi
|
|
|
|
(
|
|
execution_step "Collecting Logfiles from Nodes"
|
|
collect_logs
|
|
) || echo "Error from collecting logs"
|
|
|
|
(
|
|
execution_step "Stop Network Software"
|
|
"${REPO_ROOT}"/net/net.sh stop
|
|
) || echo "Error from stopping nodes"
|
|
|
|
(
|
|
analyze_packet_loss
|
|
) || echo "Error from packet loss analysis"
|
|
|
|
execution_step "Deleting Testnet"
|
|
if test -f "${REPO_ROOT}"/net/"${CLOUD_PROVIDER}".sh; then
|
|
"${REPO_ROOT}"/net/"${CLOUD_PROVIDER}".sh delete -p "${TESTNET_TAG}"
|
|
fi
|
|
}
|
|
trap 'cleanup_testnet $BASH_COMMAND' EXIT
|
|
|
|
function launch_testnet() {
|
|
set -x
|
|
|
|
# shellcheck disable=SC2068
|
|
execution_step "Create ${NUMBER_OF_VALIDATOR_NODES} ${CLOUD_PROVIDER} nodes"
|
|
|
|
case $CLOUD_PROVIDER in
|
|
gce)
|
|
if [[ -z $VALIDATOR_NODE_MACHINE_TYPE ]]; then
|
|
echo VALIDATOR_NODE_MACHINE_TYPE not defined
|
|
exit 1
|
|
fi
|
|
# shellcheck disable=SC2068
|
|
# shellcheck disable=SC2086
|
|
"${REPO_ROOT}"/net/gce.sh create \
|
|
-d pd-ssd \
|
|
-n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \
|
|
$maybeCustomMachineType "$VALIDATOR_NODE_MACHINE_TYPE" $maybeEnableGpu \
|
|
-p "$TESTNET_TAG" $maybeCreateAllowBootFailures $maybePublicIpAddresses \
|
|
${TESTNET_CLOUD_ZONES[@]/#/"-z "} \
|
|
--self-destruct-hours 0 \
|
|
${ADDITIONAL_FLAGS[@]/#/" "}
|
|
;;
|
|
ec2)
|
|
# shellcheck disable=SC2068
|
|
# shellcheck disable=SC2086
|
|
"${REPO_ROOT}"/net/ec2.sh create \
|
|
-n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \
|
|
$maybeCustomMachineType "$VALIDATOR_NODE_MACHINE_TYPE" $maybeEnableGpu \
|
|
-p "$TESTNET_TAG" $maybeCreateAllowBootFailures $maybePublicIpAddresses \
|
|
${TESTNET_CLOUD_ZONES[@]/#/"-z "} \
|
|
${ADDITIONAL_FLAGS[@]/#/" "}
|
|
;;
|
|
azure)
|
|
# shellcheck disable=SC2068
|
|
# shellcheck disable=SC2086
|
|
"${REPO_ROOT}"/net/azure.sh create \
|
|
-n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \
|
|
$maybeCustomMachineType "$VALIDATOR_NODE_MACHINE_TYPE" $maybeEnableGpu \
|
|
-p "$TESTNET_TAG" $maybeCreateAllowBootFailures $maybePublicIpAddresses \
|
|
${TESTNET_CLOUD_ZONES[@]/#/"-z "} \
|
|
${ADDITIONAL_FLAGS[@]/#/" "}
|
|
;;
|
|
colo)
|
|
"${REPO_ROOT}"/net/colo.sh delete --reclaim-preemptible-reservations
|
|
# shellcheck disable=SC2068
|
|
# shellcheck disable=SC2086
|
|
"${REPO_ROOT}"/net/colo.sh create \
|
|
-n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" $maybeEnableGpu \
|
|
-p "$TESTNET_TAG" $maybePublicIpAddresses --dedicated \
|
|
${ADDITIONAL_FLAGS[@]/#/" "}
|
|
;;
|
|
bare)
|
|
;;
|
|
*)
|
|
echo "Error: Unsupported cloud provider: $CLOUD_PROVIDER"
|
|
;;
|
|
esac
|
|
|
|
execution_step "Configure database"
|
|
"${REPO_ROOT}"/net/init-metrics.sh -e
|
|
|
|
execution_step "Fetch reusable testnet keypairs"
|
|
if [[ ! -d "${REPO_ROOT}"/net/keypairs ]]; then
|
|
# git clone https://github.com/solana-labs/testnet-keypairs.git "${REPO_ROOT}"/net/keypairs
|
|
git clone git@github.com:solana-labs/testnet-keypairs.git "${REPO_ROOT}"/net/keypairs
|
|
# If we have provider-specific keys (CoLo*, GCE*, etc) use them instead of generic val*
|
|
if [[ -d "${REPO_ROOT}"/net/keypairs/"${CLOUD_PROVIDER}" ]]; then
|
|
cp "${REPO_ROOT}"/net/keypairs/"${CLOUD_PROVIDER}"/* "${REPO_ROOT}"/net/keypairs/
|
|
fi
|
|
fi
|
|
|
|
if [[ "$CLOUD_PROVIDER" = "colo" ]]; then
|
|
execution_step "Stopping Colo nodes before we start"
|
|
"${REPO_ROOT}"/net/net.sh stop
|
|
fi
|
|
|
|
execution_step "Starting bootstrap node and ${NUMBER_OF_VALIDATOR_NODES} validator nodes"
|
|
|
|
declare -g version_args
|
|
get_net_launch_software_version_launch_args "$CHANNEL" "solana-release" version_args
|
|
|
|
declare maybeWarpSlot
|
|
if [[ -n "$WARP_SLOT" ]]; then
|
|
maybeWarpSlot="--warp-slot $WARP_SLOT"
|
|
fi
|
|
|
|
declare maybeAsyncNodeInit
|
|
if [[ "$ASYNC_NODE_INIT" = "true" ]]; then
|
|
maybeAsyncNodeInit="--async-node-init"
|
|
fi
|
|
|
|
declare maybeExtraPrimordialStakes
|
|
if [[ -n "$EXTRA_PRIMORDIAL_STAKES" ]]; then
|
|
maybeExtraPrimordialStakes="--extra-primordial-stakes $EXTRA_PRIMORDIAL_STAKES"
|
|
fi
|
|
|
|
# shellcheck disable=SC2068
|
|
# shellcheck disable=SC2086
|
|
"${REPO_ROOT}"/net/net.sh start $version_args \
|
|
-c idle=$NUMBER_OF_CLIENT_NODES $maybeStartAllowBootFailures \
|
|
--gpu-mode $startGpuMode $maybeWarpSlot $maybeAsyncNodeInit \
|
|
$maybeExtraPrimordialStakes
|
|
|
|
if [[ -n "$WAIT_FOR_EQUAL_STAKE" ]]; then
|
|
wait_for_equal_stake
|
|
else
|
|
execution_step "Waiting for bootstrap validator's stake to fall below ${BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD}%"
|
|
wait_for_max_stake "$BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD"
|
|
fi
|
|
|
|
echo "NUMBER_OF_CLIENT_NODES is : &NUMBER_OF_CLIENT_NODES"
|
|
if [[ $NUMBER_OF_CLIENT_NODES -gt 0 ]]; then
|
|
execution_step "Starting ${NUMBER_OF_CLIENT_NODES} client nodes"
|
|
"${REPO_ROOT}"/net/net.sh startclients "$maybeClientOptions" "$CLIENT_OPTIONS"
|
|
# It takes roughly 3 minutes from the time the client nodes return from starting to when they have finished loading the
|
|
# accounts file and actually start sending transactions
|
|
sleep 180
|
|
fi
|
|
|
|
if [[ -n "$WARMUP_SLOTS_BEFORE_TEST" ]]; then
|
|
# Allow the network to run for a bit before beginning the test
|
|
while [[ "$WARMUP_SLOTS_BEFORE_TEST" -gt $(get_slot) ]]; do
|
|
sleep 5
|
|
done
|
|
fi
|
|
|
|
# Stop the specified number of nodes
|
|
num_online_nodes=$(( NUMBER_OF_VALIDATOR_NODES + 1 ))
|
|
if [[ -n "$NUMBER_OF_OFFLINE_NODES" ]]; then
|
|
execution_step "Stopping $NUMBER_OF_OFFLINE_NODES nodes"
|
|
for (( i=NUMBER_OF_VALIDATOR_NODES; i>$(( NUMBER_OF_VALIDATOR_NODES - NUMBER_OF_OFFLINE_NODES )); i-- )); do
|
|
# shellcheck disable=SC2154
|
|
"${REPO_ROOT}"/net/net.sh stopnode -i "${validatorIpList[$i]}"
|
|
done
|
|
num_online_nodes=$(( num_online_nodes - NUMBER_OF_OFFLINE_NODES ))
|
|
fi
|
|
|
|
SECONDS=0
|
|
START_SLOT=$(get_slot)
|
|
SLOT_COUNT_START_SECONDS=$SECONDS
|
|
execution_step "Marking beginning of slot rate test - Slot: $START_SLOT, Seconds: $SLOT_COUNT_START_SECONDS"
|
|
|
|
case $TEST_TYPE in
|
|
fixed_duration)
|
|
execution_step "Wait ${TEST_DURATION_SECONDS} seconds to complete test"
|
|
sleep "$TEST_DURATION_SECONDS"
|
|
;;
|
|
partition)
|
|
STATS_START_SECONDS=$SECONDS
|
|
execution_step "Wait $PARTITION_INACTIVE_DURATION before beginning to apply partitions"
|
|
sleep "$PARTITION_INACTIVE_DURATION"
|
|
for (( i=1; i<=PARTITION_ITERATION_COUNT; i++ )); do
|
|
execution_step "Partition Iteration $i of $PARTITION_ITERATION_COUNT"
|
|
execution_step "Applying netem config $NETEM_CONFIG_FILE for $PARTITION_ACTIVE_DURATION seconds"
|
|
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" -n $num_online_nodes
|
|
sleep "$PARTITION_ACTIVE_DURATION"
|
|
|
|
execution_step "Resolving partitions for $PARTITION_INACTIVE_DURATION seconds"
|
|
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup -n $num_online_nodes
|
|
sleep "$PARTITION_INACTIVE_DURATION"
|
|
done
|
|
STATS_FINISH_SECONDS=$SECONDS
|
|
TEST_DURATION_SECONDS=$((STATS_FINISH_SECONDS - STATS_START_SECONDS))
|
|
;;
|
|
script)
|
|
execution_step "Running custom script: ${REPO_ROOT}/${CUSTOM_SCRIPT}"
|
|
"$REPO_ROOT"/"$CUSTOM_SCRIPT" "$RESULT_FILE"
|
|
;;
|
|
*)
|
|
echo "Error: Unsupported test type: $TEST_TYPE"
|
|
;;
|
|
esac
|
|
|
|
END_SLOT=$(get_slot)
|
|
SLOT_COUNT_END_SECONDS=$SECONDS
|
|
execution_step "Marking end of slot rate test - Slot: $END_SLOT, Seconds: $SLOT_COUNT_END_SECONDS"
|
|
|
|
SLOTS_PER_SECOND="$(bc <<< "scale=3; ($END_SLOT - $START_SLOT)/($SLOT_COUNT_END_SECONDS - $SLOT_COUNT_START_SECONDS)")"
|
|
execution_step "Average slot rate: $SLOTS_PER_SECOND slots/second over $((SLOT_COUNT_END_SECONDS - SLOT_COUNT_START_SECONDS)) seconds"
|
|
|
|
if [[ "$SKIP_PERF_RESULTS" = "false" ]]; then
|
|
declare -g dropped_vote_hash_count
|
|
|
|
collect_performance_statistics
|
|
echo "slots_per_second: $SLOTS_PER_SECOND" >>"$RESULT_FILE"
|
|
|
|
if [[ $dropped_vote_hash_count -gt 0 ]]; then
|
|
execution_step "Checking for dropped vote hash count"
|
|
exit 1
|
|
fi
|
|
fi
|
|
|
|
RESULT_DETAILS=$(<"$RESULT_FILE")
|
|
upload-ci-artifact "$RESULT_FILE"
|
|
}
|
|
|
|
# shellcheck disable=SC2034
|
|
RESULT_DETAILS=
|
|
STEP=
|
|
execution_step "Initialize Environment"
|
|
|
|
[[ -n $TESTNET_TAG ]] || TESTNET_TAG=${CLOUD_PROVIDER}-testnet-automation
|
|
[[ -n $INFLUX_HOST ]] || INFLUX_HOST=https://internal-metrics.solana.com:8086
|
|
[[ -n $BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD ]] || BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD=66
|
|
[[ -n $SKIP_PERF_RESULTS ]] || SKIP_PERF_RESULTS=false
|
|
|
|
if [[ -z $NUMBER_OF_VALIDATOR_NODES ]]; then
|
|
echo NUMBER_OF_VALIDATOR_NODES not defined
|
|
exit 1
|
|
fi
|
|
|
|
startGpuMode="off"
|
|
if [[ -z $ENABLE_GPU ]]; then
|
|
ENABLE_GPU=false
|
|
fi
|
|
if [[ "$ENABLE_GPU" = "true" ]]; then
|
|
maybeEnableGpu="--enable-gpu"
|
|
startGpuMode="on"
|
|
fi
|
|
|
|
if [[ -z $NUMBER_OF_CLIENT_NODES ]]; then
|
|
echo NUMBER_OF_CLIENT_NODES not defined
|
|
exit 1
|
|
fi
|
|
|
|
if [[ -z $SOLANA_METRICS_CONFIG ]]; then
|
|
if [[ -z $SOLANA_METRICS_PARTIAL_CONFIG ]]; then
|
|
echo SOLANA_METRICS_PARTIAL_CONFIG not defined
|
|
exit 1
|
|
fi
|
|
export SOLANA_METRICS_CONFIG="db=$TESTNET_TAG,host=$INFLUX_HOST,$SOLANA_METRICS_PARTIAL_CONFIG"
|
|
fi
|
|
echo "SOLANA_METRICS_CONFIG: $SOLANA_METRICS_CONFIG"
|
|
|
|
if [[ -z $ALLOW_BOOT_FAILURES ]]; then
|
|
ALLOW_BOOT_FAILURES=false
|
|
fi
|
|
if [[ "$ALLOW_BOOT_FAILURES" = "true" ]]; then
|
|
maybeCreateAllowBootFailures="--allow-boot-failures"
|
|
maybeStartAllowBootFailures="-F"
|
|
fi
|
|
|
|
if [[ -z $USE_PUBLIC_IP_ADDRESSES ]]; then
|
|
USE_PUBLIC_IP_ADDRESSES=false
|
|
fi
|
|
if [[ "$USE_PUBLIC_IP_ADDRESSES" = "true" ]]; then
|
|
maybePublicIpAddresses="-P"
|
|
fi
|
|
|
|
execution_step "Checking for required parameters"
|
|
testTypeRequiredParameters=
|
|
case $TEST_TYPE in
|
|
fixed_duration)
|
|
testTypeRequiredParameters=(
|
|
TEST_DURATION_SECONDS \
|
|
)
|
|
;;
|
|
partition)
|
|
testTypeRequiredParameters=(
|
|
NETEM_CONFIG_FILE \
|
|
PARTITION_ACTIVE_DURATION \
|
|
PARTITION_INACTIVE_DURATION \
|
|
PARTITION_ITERATION_COUNT \
|
|
)
|
|
;;
|
|
script)
|
|
testTypeRequiredParameters=(
|
|
CUSTOM_SCRIPT \
|
|
)
|
|
;;
|
|
*)
|
|
echo "Error: Unsupported test type: $TEST_TYPE"
|
|
;;
|
|
esac
|
|
|
|
missingParameters=
|
|
for i in "${testTypeRequiredParameters[@]}"; do
|
|
if [[ -z ${!i} ]]; then
|
|
missingParameters+="${i}, "
|
|
fi
|
|
done
|
|
|
|
if [[ -n $missingParameters ]]; then
|
|
echo "Error: For test type $TEST_TYPE, the following required parameters are missing: ${missingParameters[*]}"
|
|
exit 1
|
|
fi
|
|
|
|
maybeClientOptions=${CLIENT_OPTIONS:+"-c"}
|
|
maybeCustomMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"--custom-machine-type"}
|
|
|
|
IFS=, read -r -a TESTNET_CLOUD_ZONES <<<"${TESTNET_ZONES}"
|
|
|
|
RESULT_FILE="$TESTNET_TAG"_SUMMARY_STATS_"$NUMBER_OF_VALIDATOR_NODES".log
|
|
rm -f "$RESULT_FILE"
|
|
|
|
TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \
|
|
NUMBER_OF_VALIDATOR_NODES \
|
|
ENABLE_GPU \
|
|
VALIDATOR_NODE_MACHINE_TYPE \
|
|
NUMBER_OF_CLIENT_NODES \
|
|
CLIENT_OPTIONS \
|
|
CLIENT_DELAY_START \
|
|
TESTNET_ZONES \
|
|
TEST_DURATION_SECONDS \
|
|
USE_PUBLIC_IP_ADDRESSES \
|
|
ALLOW_BOOT_FAILURES \
|
|
ADDITIONAL_FLAGS \
|
|
APPLY_PARTITIONS \
|
|
NETEM_CONFIG_FILE \
|
|
WAIT_FOR_EQUAL_STAKE \
|
|
WARMUP_SLOTS_BEFORE_TEST \
|
|
NUMBER_OF_OFFLINE_NODES \
|
|
PARTITION_ACTIVE_DURATION \
|
|
PARTITION_INACTIVE_DURATION \
|
|
PARTITION_ITERATION_COUNT \
|
|
TEST_TYPE \
|
|
CUSTOM_SCRIPT \
|
|
)
|
|
|
|
TEST_CONFIGURATION=
|
|
for i in "${TEST_PARAMS_TO_DISPLAY[@]}"; do
|
|
if [[ -n ${!i} ]]; then
|
|
TEST_CONFIGURATION+="${i} = ${!i} | "
|
|
fi
|
|
done
|
|
|
|
# shellcheck disable=SC2034
|
|
TESTNET_START_UNIX_MSECS="$(($(date +%s%N)/1000000))"
|
|
|
|
launch_testnet
|