Implement nightly performance tests (#6140)
* Implement nightly performance tests on colo
This commit is contained in:
parent
10cf728e11
commit
60e8cf5a47
|
@ -1,13 +0,0 @@
|
||||||
# Buildkite pipeline: run the automated testnet, then delete it.
steps:
  - command: "ci/testnet-automation.sh"
    label: "run testnet"
    agents:
      - "queue=testnet-deploy"

  # Proceed to the delete step even when the step above failed.
  - wait: ~
    continue_on_failure: true

  - command: "ci/testnet-automation-cleanup.sh"
    label: "delete testnet"
    agents:
      - "queue=testnet-deploy"
|
|
|
@ -1,10 +0,0 @@
|
||||||
#!/usr/bin/env bash
#
# Looks up and then deletes the "testnet-automation" testnet via net/gce.sh.
set -e

# All paths below are relative to the parent of this script's directory.
cd "$(dirname "$0")/.."

echo --- find testnet configuration
net/gce.sh config -p testnet-automation

echo --- delete testnet
net/gce.sh delete -p testnet-automation
|
|
|
@ -1,96 +0,0 @@
|
||||||
#!/usr/bin/env bash
#
# Preamble: fetch the release tarball (unless a prebuilt channel is used),
# load the artifact-upload helper and apply default settings.
set -e

# All paths below are relative to the parent of this script's directory.
cd "$(dirname "$0")/.."

if [[ -z $USE_PREBUILT_CHANNEL_TARBALL ]]; then
  echo --- downloading tar from build artifacts
  buildkite-agent artifact download "solana-release*.tar.bz2" .
fi

# shellcheck disable=SC1091
source ci/upload-ci-artifact.sh

# Defaults apply when a setting is unset or empty.
: "${ITERATION_WAIT:=300}"
: "${NUMBER_OF_NODES:=10 25 50 100}"
: "${LEADER_CPU_MACHINE_TYPE:=--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100}"
: "${CLIENT_COUNT:=2}"
: "${TESTNET_TAG:=testnet-automation}"
: "${TESTNET_ZONES:=us-west1-b}"
: "${CHANNEL:=beta}"
: "${ADDITIONAL_FLAGS:=}"

# Split the comma-separated TESTNET_ZONES into an array; the trailing comma
# appended to the input terminates the final element.
TESTNET_CLOUD_ZONES=()
while read -r -d, ; do
  TESTNET_CLOUD_ZONES+=( "$REPLY" )
done <<< "${TESTNET_ZONES},"
|
|
||||||
|
|
||||||
#######################################
# Creates a GCE testnet with the given node count, starts it, waits
# ITERATION_WAIT seconds, then queries InfluxDB for TPS/confirmation
# statistics and uploads the parsed result as a CI artifact.
# Globals (read):
#   CLIENT_COUNT, LEADER_CPU_MACHINE_TYPE, TESTNET_TAG, TESTNET_CLOUD_ZONES,
#   ADDITIONAL_FLAGS, USE_PREBUILT_CHANNEL_TARBALL, CHANNEL, ITERATION_WAIT,
#   INFLUX_HOST
# Arguments: $1 - number of nodes to create
#######################################
launchTestnet() {
  local nodeCount=$1
  echo --- setup "$nodeCount" node test

  # Build optional arguments as arrays so an empty ADDITIONAL_FLAGS adds no
  # argument instead of an empty-string positional argument.
  local -a zone_args=() extra_create_args=()
  local zone
  for zone in "${TESTNET_CLOUD_ZONES[@]}"; do
    zone_args+=(-z "$zone")
  done
  # NOTE(review): ADDITIONAL_FLAGS is forwarded as a single word, as before;
  # confirm whether several flags are expected to be split.
  if [[ -n $ADDITIONAL_FLAGS ]]; then
    extra_create_args=("$ADDITIONAL_FLAGS")
  fi

  net/gce.sh create \
    -d pd-ssd \
    -n "$nodeCount" -c "$CLIENT_COUNT" \
    -G "$LEADER_CPU_MACHINE_TYPE" \
    -p "$TESTNET_TAG" "${zone_args[@]}" "${extra_create_args[@]}"

  echo --- configure database
  net/init-metrics.sh -e

  echo --- start "$nodeCount" node test
  if [[ -n $USE_PREBUILT_CHANNEL_TARBALL ]]; then
    net/net.sh start -o noValidatorSanity -t "$CHANNEL"
  else
    # The glob is intentionally unquoted: it resolves to the downloaded tarball.
    net/net.sh start -o noValidatorSanity -T solana-release*.tar.bz2
  fi

  echo --- wait "$ITERATION_WAIT" seconds to complete test
  sleep "$ITERATION_WAIT"

  set -x

  # The queries below read the fixed "testnet-automation" database over a
  # fixed 300 second window.
  local q_mean_tps='
    SELECT round(mean("sum_count")) AS "mean_tps" FROM (
      SELECT sum("count") AS "sum_count"
        FROM "testnet-automation"."autogen"."banking_stage-record_transactions"
        WHERE time > now() - 300s GROUP BY time(1s)
    )'

  local q_max_tps='
    SELECT round(max("sum_count")) AS "max_tps" FROM (
      SELECT sum("count") AS "sum_count"
        FROM "testnet-automation"."autogen"."banking_stage-record_transactions"
        WHERE time > now() - 300s GROUP BY time(1s)
    )'

  local q_mean_confirmation='
    SELECT round(mean("duration_ms")) as "mean_confirmation"
      FROM "testnet-automation"."autogen"."validator-confirmation"
      WHERE time > now() - 300s'

  local q_max_confirmation='
    SELECT round(max("duration_ms")) as "max_confirmation"
      FROM "testnet-automation"."autogen"."validator-confirmation"
      WHERE time > now() - 300s'

  local q_99th_confirmation='
    SELECT round(percentile("duration_ms", 99)) as "99th_confirmation"
      FROM "testnet-automation"."autogen"."validator-confirmation"
      WHERE time > now() - 300s'

  # All five queries go out in one request; the JSON reply is piped through
  # the parser script and appended to the per-node-count log.
  curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
    --data-urlencode "db=testnet-automation" \
    --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation" |
    python ci/testnet-automation-json-parser.py >>TPS"$nodeCount".log

  upload-ci-artifact TPS"$nodeCount".log
}
|
|
||||||
|
|
||||||
# Buildkite doesn't let us define an array of numbers, so NUMBER_OF_NODES
# arrives as a space-separated string and is word-split into an array here.
# shellcheck disable=SC2206
nodes_count_array=($NUMBER_OF_NODES)

# Run one complete testnet iteration per requested node count.
for n in "${nodes_count_array[@]}"; do
  launchTestnet "$n"
done
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
# Buildkite pipeline: performance testnet on the colo provider.
steps:
  - command: "system-test/testnet-performance/testnet-automation.sh"
    label: "COLO performance testnet GPU enabled"
    # Settings read by testnet-automation.sh.
    env:
      CLOUD_PROVIDER: "colo"
      TESTNET_TAG: "colo-edge-perf-gpu-enabled"
      RAMP_UP_TIME: 60
      TEST_DURATION: 300
      NUMBER_OF_VALIDATOR_NODES: 4
      NUMBER_OF_CLIENT_NODES: 2
      CLIENT_OPTIONS: "bench-tps=2=--tx_count 80000 --thread-batch-sleep-ms 1000"
      ADDITIONAL_FLAGS: ""
    agents:
      - "queue=colo-deploy"
|
|
@ -0,0 +1,16 @@
|
||||||
|
# Buildkite pipeline: performance testnet on GCE without GPU accelerators.
steps:
  - command: "system-test/testnet-performance/testnet-automation.sh"
    label: "GCE performance testnets CPU ONLY"
    # Settings read by testnet-automation.sh.
    env:
      CLOUD_PROVIDER: "gce"
      TESTNET_TAG: "gce-edge-perf-cpu-only"
      RAMP_UP_TIME: 60
      TEST_DURATION: 300
      NUMBER_OF_VALIDATOR_NODES: 10
      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
      NUMBER_OF_CLIENT_NODES: 1
      CLIENT_OPTIONS: "bench-tps=1=--tx_count 80000 --thread-batch-sleep-ms 1000"
      TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
      ADDITIONAL_FLAGS: ""
    agents:
      - "queue=testnet-deploy"
|
|
@ -0,0 +1,16 @@
|
||||||
|
# Buildkite pipeline: performance testnet on GCE with GPU accelerators.
steps:
  - command: "system-test/testnet-performance/testnet-automation.sh"
    label: "GCE performance testnets GPU ENABLED"
    # Settings read by testnet-automation.sh.
    env:
      CLOUD_PROVIDER: "gce"
      TESTNET_TAG: "gce-edge-perf-gpu-enabled"
      RAMP_UP_TIME: 60
      TEST_DURATION: 300
      NUMBER_OF_VALIDATOR_NODES: 10
      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100"
      NUMBER_OF_CLIENT_NODES: 1
      CLIENT_OPTIONS: "bench-tps=1=--tx_count 80000 --thread-batch-sleep-ms 1000"
      TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
      ADDITIONAL_FLAGS: ""
    agents:
      - "queue=testnet-deploy"
|
|
@ -0,0 +1,178 @@
|
||||||
|
#!/usr/bin/env bash
#
# Testnet performance test: default settings. Each default applies when the
# corresponding variable is unset or empty.
set -e

# TODO: Make sure a dB named $TESTNET_TAG exists in the influxDB host, or can be created
: "${TESTNET_TAG:=testnet-automation}"
: "${INFLUX_HOST:=https://metrics.solana.com:8086}"

# TODO: Remove all default values, force explicitness in the testcase definition
: "${TEST_DURATION:=300}"
: "${RAMP_UP_TIME:=60}"
: "${NUMBER_OF_VALIDATOR_NODES:=2}"
: "${NUMBER_OF_CLIENT_NODES:=1}"
: "${TESTNET_ZONES:=us-west1-a}"
|
||||||
|
|
||||||
|
#######################################
# Fetches logs from the remote nodes via net/net.sh and uploads each one as a
# CI artifact under a name prefixed with the testnet tag and node count.
# Globals (read): TESTNET_TAG, NUMBER_OF_VALIDATOR_NODES
# Arguments: none
#######################################
function collect_logs {
  echo --- collect logs from remote nodes
  # Start from an empty directory so only logs from this fetch are uploaded.
  rm -rf net/log
  net/net.sh logs

  local logfile
  for logfile in net/log/* ; do
    # With no logs the glob stays as the literal pattern; skip it.
    [[ -e $logfile ]] || continue
    (
      new_log=net/log/"$TESTNET_TAG"_"$NUMBER_OF_VALIDATOR_NODES"-nodes_"$(basename "$logfile")"
      cp "$logfile" "$new_log"
      upload-ci-artifact "$new_log"
    )
  done
}
|
||||||
|
|
||||||
|
#######################################
# EXIT-trap handler: collects logs, stops the network software, then uploads
# a Buildkite pipeline fragment whose step deletes the testnet.
# Globals (read): CLOUD_PROVIDER, TESTNET_TAG
# Arguments: none
#######################################
function cleanup_testnet {
  # Each stage runs in its own subshell with errexit off, so a failure in one
  # stage does not stop the following stages.
  (
    set +e
    collect_logs
  )

  (
    set +e
    echo --- Stop Network Software
    net/net.sh stop
  )

  # The two providers differ only in the delete script and the agent queue.
  local delete_script agent_queue
  case $CLOUD_PROVIDER in
  gce)
    delete_script=net/gce.sh
    agent_queue=testnet-deploy
    ;;
  colo)
    delete_script=net/colo.sh
    agent_queue=colo-deploy
    ;;
  *)
    echo "Error: Unsupported cloud provider: $CLOUD_PROVIDER" >&2
    return
    ;;
  esac

  # The wait step has continue_on_failure set so the delete step still runs
  # when earlier steps failed.
  buildkite-agent pipeline upload <<EOF
- wait: ~
  continue_on_failure: true

- command: "${delete_script} delete -p ${TESTNET_TAG}"
  label: "Delete Testnet"
  agents:
    - "queue=${agent_queue}"
EOF
}
|
||||||
|
trap cleanup_testnet EXIT
|
||||||
|
|
||||||
|
#######################################
# Creates the testnet nodes on the selected provider, starts the network,
# waits RAMP_UP_TIME then TEST_DURATION seconds, queries InfluxDB for
# TPS/confirmation statistics and uploads the parsed result as a CI artifact.
# Globals (read):
#   CLOUD_PROVIDER, NUMBER_OF_VALIDATOR_NODES, NUMBER_OF_CLIENT_NODES,
#   VALIDATOR_NODE_MACHINE_TYPE, TESTNET_TAG, TESTNET_CLOUD_ZONES,
#   ADDITIONAL_FLAGS, CHANNEL, CLIENT_OPTIONS, RAMP_UP_TIME, TEST_DURATION,
#   INFLUX_HOST
# Globals (written): RESULTS_FILE
# Arguments: none
# Exits:     1 when CLOUD_PROVIDER is neither "gce" nor "colo"
#######################################
launchTestnet() {
  set -x

  # Optional settings become argument arrays so that an empty setting adds no
  # argument at all, rather than an empty-string positional argument.
  local -a machine_type_args=() zone_args=() extra_create_args=() client_args=()
  local zone
  if [[ -n $VALIDATOR_NODE_MACHINE_TYPE ]]; then
    machine_type_args=(-G "$VALIDATOR_NODE_MACHINE_TYPE")
  fi
  for zone in "${TESTNET_CLOUD_ZONES[@]}"; do
    zone_args+=(-z "$zone")
  done
  # NOTE(review): ADDITIONAL_FLAGS is forwarded as a single word, as before;
  # confirm whether several flags are expected to be split.
  if [[ -n $ADDITIONAL_FLAGS ]]; then
    extra_create_args=("$ADDITIONAL_FLAGS")
  fi
  if [[ -n $CLIENT_OPTIONS ]]; then
    client_args=(-c "$CLIENT_OPTIONS")
  fi

  echo --- create "$NUMBER_OF_VALIDATOR_NODES" nodes

  case $CLOUD_PROVIDER in
  gce)
    net/gce.sh create \
      -d pd-ssd \
      -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \
      "${machine_type_args[@]}" \
      -p "$TESTNET_TAG" "${zone_args[@]}" "${extra_create_args[@]}"
    ;;
  colo)
    net/colo.sh create \
      -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" -g \
      -p "$TESTNET_TAG" "${extra_create_args[@]}"
    ;;
  *)
    # Nothing was created, so do not go on to start a network.
    echo "Error: Unsupported cloud provider: $CLOUD_PROVIDER" >&2
    exit 1
    ;;
  esac

  echo --- configure database
  net/init-metrics.sh -e

  echo --- start "$NUMBER_OF_VALIDATOR_NODES" node test
  if [[ -n $CHANNEL ]]; then
    net/net.sh start -t "$CHANNEL" "${client_args[@]}"
  else
    # The glob is intentionally unquoted: it resolves to the downloaded tarball.
    net/net.sh start -T solana-release*.tar.bz2 "${client_args[@]}"
  fi

  echo --- wait "$RAMP_UP_TIME" seconds for network throughput to stabilize
  sleep "$RAMP_UP_TIME"

  echo --- wait "$TEST_DURATION" seconds to complete test
  sleep "$TEST_DURATION"

  echo --- collect statistics about run

  # Every query reads the database named after TESTNET_TAG and covers the
  # last TEST_DURATION seconds.
  local tps_series confirmation_series time_window
  printf -v tps_series '"%s"."autogen"."banking_stage-record_transactions"' "$TESTNET_TAG"
  printf -v confirmation_series '"%s"."autogen"."validator-confirmation"' "$TESTNET_TAG"
  time_window="time > now() - ${TEST_DURATION}s"

  local q_mean_tps='
    SELECT round(mean("sum_count")) AS "mean_tps" FROM (
      SELECT sum("count") AS "sum_count"
        FROM '"$tps_series"'
        WHERE '"$time_window"' GROUP BY time(1s)
    )'

  local q_max_tps='
    SELECT round(max("sum_count")) AS "max_tps" FROM (
      SELECT sum("count") AS "sum_count"
        FROM '"$tps_series"'
        WHERE '"$time_window"' GROUP BY time(1s)
    )'

  local q_mean_confirmation='
    SELECT round(mean("duration_ms")) as "mean_confirmation"
      FROM '"$confirmation_series"'
      WHERE '"$time_window"

  local q_max_confirmation='
    SELECT round(max("duration_ms")) as "max_confirmation"
      FROM '"$confirmation_series"'
      WHERE '"$time_window"

  local q_99th_confirmation='
    SELECT round(percentile("duration_ms", 99)) as "99th_confirmation"
      FROM '"$confirmation_series"'
      WHERE '"$time_window"

  RESULTS_FILE="$TESTNET_TAG"_SUMMARY_STATS_"$NUMBER_OF_VALIDATOR_NODES".log
  # All five queries go out in one request; the JSON reply is piped through
  # the parser script and appended to the results file.
  # NOTE(review): the query credentials are hard-coded in the URL and echoed
  # by set -x; confirm they are meant to be public.
  curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
    --data-urlencode "db=${TESTNET_TAG}" \
    --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation" |
    python system-test/testnet-performance/testnet-automation-json-parser.py >>"$RESULTS_FILE"

  upload-ci-artifact "$RESULTS_FILE"
}
|
||||||
|
|
||||||
|
# All relative paths used by this script resolve from two directory levels
# above the script's own directory.
cd "$(dirname "$0")/../.."

# Assemble SOLANA_METRICS_CONFIG from the testnet tag, the InfluxDB host and
# the caller-supplied partial config, unless a full config was given.
if [[ -z $SOLANA_METRICS_CONFIG ]]; then
  if [[ -z $SOLANA_METRICS_PARTIAL_CONFIG ]]; then
    echo SOLANA_METRICS_PARTIAL_CONFIG not defined >&2
    exit 1
  fi
  export SOLANA_METRICS_CONFIG="db=$TESTNET_TAG,host=$INFLUX_HOST,$SOLANA_METRICS_PARTIAL_CONFIG"
fi
echo "SOLANA_METRICS_CONFIG: $SOLANA_METRICS_CONFIG"

# With no CHANNEL set, the release tarball from the build artifacts is used.
if [[ -z $CHANNEL ]]; then
  echo --- downloading tar from build artifacts
  buildkite-agent artifact download "solana-release*.tar.bz2" .
fi

# shellcheck disable=SC1091
source ci/upload-ci-artifact.sh

# Each flag expands to itself only when its value is set and non-empty.
maybeClientOptions=${CLIENT_OPTIONS:+"-c"}
maybeMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"-G"}

# Split the comma-separated zone list into an array.
IFS=, read -r -a TESTNET_CLOUD_ZONES <<<"${TESTNET_ZONES}"

launchTestnet
|
Loading…
Reference in New Issue