diff --git a/net/common.sh b/net/common.sh index ebb703f00..14d2d3369 100644 --- a/net/common.sh +++ b/net/common.sh @@ -113,11 +113,14 @@ clear_config_dir() { SECONDARY_DISK_MOUNT_POINT=/mnt/extra-disk setup_secondary_mount() { # If there is a secondary disk, symlink the config/ dir there - if [[ -d $SECONDARY_DISK_MOUNT_POINT ]] && \ - [[ -w $SECONDARY_DISK_MOUNT_POINT ]]; then - mkdir -p $SECONDARY_DISK_MOUNT_POINT/config - rm -rf "$SOLANA_CONFIG_DIR" - ln -sfT $SECONDARY_DISK_MOUNT_POINT/config "$SOLANA_CONFIG_DIR" - fi + ( + set -x + if [[ -d $SECONDARY_DISK_MOUNT_POINT ]] && \ + [[ -w $SECONDARY_DISK_MOUNT_POINT ]]; then + mkdir -p $SECONDARY_DISK_MOUNT_POINT/config + rm -rf "$SOLANA_CONFIG_DIR" + ln -sfT $SECONDARY_DISK_MOUNT_POINT/config "$SOLANA_CONFIG_DIR" + fi + ) } diff --git a/net/remote/remote-node.sh b/net/remote/remote-node.sh index abd106571..aa23d6d0f 100755 --- a/net/remote/remote-node.sh +++ b/net/remote/remote-node.sh @@ -152,6 +152,7 @@ EOF set -x if [[ $skipSetup != true ]]; then clear_config_dir "$SOLANA_CONFIG_DIR" + setup_secondary_mount if [[ -n $internalNodesLamports ]]; then echo "---" >> config/fullnode-balances.yml @@ -248,6 +249,7 @@ EOF fi if [[ $skipSetup != true ]]; then clear_config_dir "$SOLANA_CONFIG_DIR" + setup_secondary_mount [[ -z $internalNodesLamports ]] || net/scripts/rsync-retry.sh -vPrc \ "$entrypointIp":~/solana/config/fullnode-"$nodeIndex"-identity.json config/fullnode-identity.json fi diff --git a/system-test/testnet-performance/colo-gpu-perf.yml b/system-test/testnet-performance/colo-gpu-perf.yml index 7e2f982fc..07a926025 100755 --- a/system-test/testnet-performance/colo-gpu-perf.yml +++ b/system-test/testnet-performance/colo-gpu-perf.yml @@ -2,13 +2,14 @@ steps: - command: "system-test/testnet-performance/testnet-automation.sh" label: "COLO performance testnet GPU enabled" env: + UPLOAD_RESULTS_TO_SLACK: "true" CLOUD_PROVIDER: "colo" TESTNET_TAG: "colo-edge-perf-gpu-enabled" - RAMP_UP_TIME: 60 - TEST_DURATION: 
300 + RAMP_UP_TIME: 0 + TEST_DURATION: 600 NUMBER_OF_VALIDATOR_NODES: 4 NUMBER_OF_CLIENT_NODES: 2 - CLIENT_OPTIONS: "bench-tps=2=--tx_count 80000 --thread-batch-sleep-ms 1000" + CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250" ADDITIONAL_FLAGS: "" agents: - "queue=colo-deploy" diff --git a/system-test/testnet-performance/gce-cpu-only-perf.yml b/system-test/testnet-performance/gce-cpu-only-perf.yml index 1047712cd..f23b79f60 100755 --- a/system-test/testnet-performance/gce-cpu-only-perf.yml +++ b/system-test/testnet-performance/gce-cpu-only-perf.yml @@ -2,6 +2,7 @@ steps: - command: "system-test/testnet-performance/testnet-automation.sh" label: "GCE performance testnets CPU ONLY" env: + UPLOAD_RESULTS_TO_SLACK: "true" CLOUD_PROVIDER: "gce" TESTNET_TAG: "gce-edge-perf-cpu-only" RAMP_UP_TIME: 60 diff --git a/system-test/testnet-performance/gce-gpu-perf.yml b/system-test/testnet-performance/gce-gpu-perf.yml index 755cd6ca7..1c22e9283 100755 --- a/system-test/testnet-performance/gce-gpu-perf.yml +++ b/system-test/testnet-performance/gce-gpu-perf.yml @@ -2,14 +2,15 @@ steps: - command: "system-test/testnet-performance/testnet-automation.sh" label: "GCE performance testnets GPU ENABLED" env: + UPLOAD_RESULTS_TO_SLACK: "true" CLOUD_PROVIDER: "gce" TESTNET_TAG: "gce-edge-perf-gpu-enabled" - RAMP_UP_TIME: 60 - TEST_DURATION: 300 - NUMBER_OF_VALIDATOR_NODES: 10 + RAMP_UP_TIME: 0 + TEST_DURATION: 600 + NUMBER_OF_VALIDATOR_NODES: 50 VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100" - NUMBER_OF_CLIENT_NODES: 1 - CLIENT_OPTIONS: "bench-tps=1=--tx_count 80000 --thread-batch-sleep-ms 1000" + NUMBER_OF_CLIENT_NODES: 2 + CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250" TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a" ADDITIONAL_FLAGS: "" agents: diff --git a/system-test/testnet-performance/testnet-automation-json-parser.py 
b/system-test/testnet-performance/testnet-automation-json-parser.py index 2ff8d5e17..128071612 100755 --- a/system-test/testnet-performance/testnet-automation-json-parser.py +++ b/system-test/testnet-performance/testnet-automation-json-parser.py @@ -2,6 +2,9 @@ import sys, json data=json.load(sys.stdin) -print[\ - ([result['series'][0]['columns'][1].encode(), result['series'][0]['values'][0][1]]) \ - for result in data['results']] + +if 'results' in data: + for result in data['results']: + print result['series'][0]['columns'][1].encode() + ': ' + str(result['series'][0]['values'][0][1]) +else: + print "No results returned from CURL request" diff --git a/system-test/testnet-performance/testnet-automation.sh b/system-test/testnet-performance/testnet-automation.sh index f7e93a841..ef67a8bad 100755 --- a/system-test/testnet-performance/testnet-automation.sh +++ b/system-test/testnet-performance/testnet-automation.sh @@ -7,10 +7,9 @@ set -e # TODO: Remove all default values, force explicitness in the testcase definition [[ -n $TEST_DURATION ]] || TEST_DURATION=300 -[[ -n $RAMP_UP_TIME ]] || RAMP_UP_TIME=60 +[[ -n $RAMP_UP_TIME ]] || RAMP_UP_TIME=0 [[ -n $NUMBER_OF_VALIDATOR_NODES ]] || NUMBER_OF_VALIDATOR_NODES=2 [[ -n $NUMBER_OF_CLIENT_NODES ]] || NUMBER_OF_CLIENT_NODES=1 -[[ -n $TESTNET_ZONES ]] || TESTNET_ZONES="us-west1-a" function collect_logs { echo --- collect logs from remote nodes @@ -26,6 +25,11 @@ function collect_logs { } function cleanup_testnet { + FINISH_UNIX_MSECS="$(($(date +%s%N)/1000000))" + if [[ -n $UPLOAD_RESULTS_TO_SLACK ]] ; then + upload_results_to_slack + fi + ( set +e collect_logs @@ -101,9 +105,9 @@ launchTestnet() { echo --- start "$NUMBER_OF_VALIDATOR_NODES" node test if [[ -n $CHANNEL ]]; then - net/net.sh start -t "$CHANNEL" "$maybeClientOptions" "$CLIENT_OPTIONS" + net/net.sh restart -t "$CHANNEL" "$maybeClientOptions" "$CLIENT_OPTIONS" else - net/net.sh start -T solana-release*.tar.bz2 "$maybeClientOptions" "$CLIENT_OPTIONS" + 
net/net.sh restart -T solana-release*.tar.bz2 "$maybeClientOptions" "$CLIENT_OPTIONS" fi echo --- wait "$RAMP_UP_TIME" seconds for network throughput to stabilize @@ -128,27 +132,27 @@ launchTestnet() { )' declare q_mean_confirmation=' - SELECT round(mean("duration_ms")) as "mean_confirmation" + SELECT round(mean("duration_ms")) as "mean_confirmation_ms" FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation" WHERE time > now() - '"$TEST_DURATION"'s' declare q_max_confirmation=' - SELECT round(max("duration_ms")) as "max_confirmation" + SELECT round(max("duration_ms")) as "max_confirmation_ms" FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation" WHERE time > now() - '"$TEST_DURATION"'s' declare q_99th_confirmation=' - SELECT round(percentile("duration_ms", 99)) as "99th_confirmation" + SELECT round(percentile("duration_ms", 99)) as "99th_percentile_confirmation_ms" FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation" WHERE time > now() - '"$TEST_DURATION"'s' - RESULTS_FILE="$TESTNET_TAG"_SUMMARY_STATS_"$NUMBER_OF_VALIDATOR_NODES".log curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \ --data-urlencode "db=${TESTNET_TAG}" \ --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation" | - python system-test/testnet-performance/testnet-automation-json-parser.py >>"$RESULTS_FILE" + python system-test/testnet-performance/testnet-automation-json-parser.py >>"$RESULT_FILE" - upload-ci-artifact "$RESULTS_FILE" + RESULT_DETAILS=$(<"$RESULT_FILE") + upload-ci-artifact "$RESULT_FILE" } cd "$(dirname "$0")/../.." 
@@ -169,10 +173,33 @@ fi # shellcheck disable=SC1091 source ci/upload-ci-artifact.sh +source system-test/testnet-performance/upload_results_to_slack.sh maybeClientOptions=${CLIENT_OPTIONS:+"-c"} maybeMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"-G"} IFS=, read -r -a TESTNET_CLOUD_ZONES <<<"${TESTNET_ZONES}" +RESULT_FILE="$TESTNET_TAG"_SUMMARY_STATS_"$NUMBER_OF_VALIDATOR_NODES".log +rm -f $RESULT_FILE +RESULT_DETAILS="Test failed to finish" + +TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \ + NUMBER_OF_VALIDATOR_NODES \ + VALIDATOR_NODE_MACHINE_TYPE \ + NUMBER_OF_CLIENT_NODES \ + CLIENT_OPTIONS \ + TESTNET_ZONES \ + TEST_DURATION \ + ADDITIONAL_FLAGS) + +TEST_CONFIGURATION= +for i in "${TEST_PARAMS_TO_DISPLAY[@]}" ; do + if [[ -n ${!i} ]] ; then + TEST_CONFIGURATION+="${i} = ${!i} | " + fi +done + +START_UNIX_MSECS="$(($(date +%s%N)/1000000))" + launchTestnet diff --git a/system-test/testnet-performance/upload_results_to_slack.sh b/system-test/testnet-performance/upload_results_to_slack.sh new file mode 100755 index 000000000..910b6bbe9 --- /dev/null +++ b/system-test/testnet-performance/upload_results_to_slack.sh @@ -0,0 +1,102 @@ +upload_results_to_slack() { + echo --- Uploading results to Slack Performance Results App + + if [[ -z $SLACK_WEBHOOK_URL ]] ; then + echo "SLACK_WEBHOOK_URL undefined" + exit 1 + fi + + [[ -n $BUILDKITE_MESSAGE ]] || BUILDKITE_MESSAGE="Message not defined" + + if [[ -n $BUILDKITE_COMMIT ]] ; then + COMMIT_BUTTON_TEXT="$(echo "$BUILDKITE_COMMIT" | head -c 8)" + COMMIT_URL="https://github.com/solana-labs/solana/commit/${BUILDKITE_COMMIT}" + else + COMMIT_BUTTON_TEXT="Commit not defined" + COMMIT_URL="https://github.com/solana-labs/solana/commits/master" + fi + + if [[ -n $BUILDKITE_BUILD_URL ]] ; then + BUILD_BUTTON_TEXT="Build Kite Job" + else + BUILD_BUTTON_TEXT="Build URL not defined" + BUILDKITE_BUILD_URL="https://buildkite.com/solana-labs/" + fi + 
GRAFANA_URL="https://metrics.solana.com:3000/d/testnet-${CHANNEL:-edge}/testnet-monitor-${CHANNEL:-edge}?var-testnet=${TESTNET_TAG:-testnet-automation}&from=${START_UNIX_MSECS:-0}&to=${FINISH_UNIX_MSECS:-0}" + + [[ -n $RESULT_DETAILS ]] || RESULT_DETAILS="Undefined" + [[ -n $TEST_CONFIGURATION ]] || TEST_CONFIGURATION="Undefined" + + payLoad="$(cat <