Push perf test results to slack app (#6371)

* Add script to publish testnet results to slack
* Obscure webhook URL
* fixup
* Replace read with cat redirection
* Turn back on net restart
* Pick nits
* Make symlink before trying to delete its contents
* Display test config in slack and pick Trents nit not to maybe rm -rf /*
* Clean up results print
* Minor nits
* Turn the test settings back up to 11
* typo
* Shellcheck
* Just a few more fields
* fix payload formatting
* Del clear-config.sh
* Mount secondary
* Add commit SHA link and Grafana time range URL
* Add fancy buttons instead of text URLs
* Tighten up test config display
* Fixup display nits
* chellsheck
* Rebase and fix typo
2019-10-21 20:00:17 -04:00 · 2019-10-21 20:00:17 -04:00 · 00809a67c0
parent d1b18a5060
commit 00809a67c0
8 changed files with 167 additions and 27 deletions
--- a/net/common.sh
+++ b/net/common.sh
@@ -113,11 +113,14 @@ clear_config_dir() {
 SECONDARY_DISK_MOUNT_POINT=/mnt/extra-disk
 setup_secondary_mount() {
  # If there is a secondary disk, symlink the config/ dir there
-  if [[ -d $SECONDARY_DISK_MOUNT_POINT ]] && \
-    [[ -w $SECONDARY_DISK_MOUNT_POINT ]]; then
-    mkdir -p $SECONDARY_DISK_MOUNT_POINT/config
-    rm -rf "$SOLANA_CONFIG_DIR"
-    ln -sfT $SECONDARY_DISK_MOUNT_POINT/config "$SOLANA_CONFIG_DIR"
-  fi
+  (
+    set -x
+    if [[ -d $SECONDARY_DISK_MOUNT_POINT ]] && \
+      [[ -w $SECONDARY_DISK_MOUNT_POINT ]]; then
+      mkdir -p $SECONDARY_DISK_MOUNT_POINT/config
+      rm -rf "$SOLANA_CONFIG_DIR"
+      ln -sfT $SECONDARY_DISK_MOUNT_POINT/config "$SOLANA_CONFIG_DIR"
+    fi
+  )
 }

--- a/net/remote/remote-node.sh
+++ b/net/remote/remote-node.sh
@@ -152,6 +152,7 @@ EOF
    set -x
    if [[ $skipSetup != true ]]; then
      clear_config_dir "$SOLANA_CONFIG_DIR"
+      setup_secondary_mount

      if [[ -n $internalNodesLamports ]]; then
        echo "---" >> config/fullnode-balances.yml
@@ -248,6 +249,7 @@ EOF
    fi
    if [[ $skipSetup != true ]]; then
      clear_config_dir "$SOLANA_CONFIG_DIR"
+      setup_secondary_mount
      [[ -z $internalNodesLamports ]] || net/scripts/rsync-retry.sh -vPrc \
      "$entrypointIp":~/solana/config/fullnode-"$nodeIndex"-identity.json config/fullnode-identity.json
    fi
--- a/system-test/testnet-performance/colo-gpu-perf.yml
+++ b/system-test/testnet-performance/colo-gpu-perf.yml
@@ -2,13 +2,14 @@ steps:
  - command: "system-test/testnet-performance/testnet-automation.sh"
    label: "COLO performance testnet GPU enabled"
    env:
+      UPLOAD_RESULTS_TO_SLACK: "true"
      CLOUD_PROVIDER: "colo"
      TESTNET_TAG: "colo-edge-perf-gpu-enabled"
-      RAMP_UP_TIME: 60
-      TEST_DURATION: 300
+      RAMP_UP_TIME: 0
+      TEST_DURATION: 600
      NUMBER_OF_VALIDATOR_NODES: 4
      NUMBER_OF_CLIENT_NODES: 2
-      CLIENT_OPTIONS: "bench-tps=2=--tx_count 80000 --thread-batch-sleep-ms 1000"
+      CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
      ADDITIONAL_FLAGS: ""
    agents:
      - "queue=colo-deploy"
--- a/system-test/testnet-performance/gce-cpu-only-perf.yml
+++ b/system-test/testnet-performance/gce-cpu-only-perf.yml
@@ -2,6 +2,7 @@ steps:
  - command: "system-test/testnet-performance/testnet-automation.sh"
    label: "GCE performance testnets CPU ONLY"
    env:
+      UPLOAD_RESULTS_TO_SLACK: "true"
      CLOUD_PROVIDER: "gce"
      TESTNET_TAG: "gce-edge-perf-cpu-only"
      RAMP_UP_TIME: 60
--- a/system-test/testnet-performance/gce-gpu-perf.yml
+++ b/system-test/testnet-performance/gce-gpu-perf.yml
@@ -2,14 +2,15 @@ steps:
  - command: "system-test/testnet-performance/testnet-automation.sh"
    label: "GCE performance testnets GPU ENABLED"
    env:
+      UPLOAD_RESULTS_TO_SLACK: "true"
      CLOUD_PROVIDER: "gce"
      TESTNET_TAG: "gce-edge-perf-gpu-enabled"
-      RAMP_UP_TIME: 60
-      TEST_DURATION: 300
-      NUMBER_OF_VALIDATOR_NODES: 10
+      RAMP_UP_TIME: 0
+      TEST_DURATION: 600
+      NUMBER_OF_VALIDATOR_NODES: 50
      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100"
-      NUMBER_OF_CLIENT_NODES: 1
-      CLIENT_OPTIONS: "bench-tps=1=--tx_count 80000 --thread-batch-sleep-ms 1000"
+      NUMBER_OF_CLIENT_NODES: 2
+      CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
      TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
      ADDITIONAL_FLAGS: ""
    agents:
--- a/system-test/testnet-performance/testnet-automation-json-parser.py
+++ b/system-test/testnet-performance/testnet-automation-json-parser.py
@@ -2,6 +2,9 @@
 import sys, json

 data=json.load(sys.stdin)
-print[\
-   ([result['series'][0]['columns'][1].encode(), result['series'][0]['values'][0][1]]) \
-   for result in data['results']]
+
+if 'results' in data:
+   for result in data['results']:
+      print result['series'][0]['columns'][1].encode() + ': ' + str(result['series'][0]['values'][0][1])
+else:
+   print "No results returned from CURL request"
--- a/system-test/testnet-performance/testnet-automation.sh
+++ b/system-test/testnet-performance/testnet-automation.sh
@@ -7,10 +7,9 @@ set -e

 # TODO: Remove all default values, force explicitness in the testcase definition
 [[ -n $TEST_DURATION ]] || TEST_DURATION=300
-[[ -n $RAMP_UP_TIME ]] || RAMP_UP_TIME=60
+[[ -n $RAMP_UP_TIME ]] || RAMP_UP_TIME=0
 [[ -n $NUMBER_OF_VALIDATOR_NODES ]] || NUMBER_OF_VALIDATOR_NODES=2
 [[ -n $NUMBER_OF_CLIENT_NODES ]] || NUMBER_OF_CLIENT_NODES=1
-[[ -n $TESTNET_ZONES ]] || TESTNET_ZONES="us-west1-a"

 function collect_logs {
  echo --- collect logs from remote nodes
@@ -26,6 +25,11 @@ function collect_logs {
 }

 function cleanup_testnet {
+  FINISH_UNIX_MSECS="$(($(date +%s%N)/1000000))"
+  if [[ -n $UPLOAD_RESULTS_TO_SLACK ]] ; then
+    upload_results_to_slack
+  fi
+
  (
    set +e
    collect_logs
@@ -101,9 +105,9 @@ launchTestnet() {

  echo --- start "$NUMBER_OF_VALIDATOR_NODES" node test
  if [[ -n $CHANNEL ]]; then
-    net/net.sh start -t "$CHANNEL" "$maybeClientOptions" "$CLIENT_OPTIONS"
+    net/net.sh restart -t "$CHANNEL" "$maybeClientOptions" "$CLIENT_OPTIONS"
  else
-    net/net.sh start -T solana-release*.tar.bz2 "$maybeClientOptions" "$CLIENT_OPTIONS"
+    net/net.sh restart -T solana-release*.tar.bz2 "$maybeClientOptions" "$CLIENT_OPTIONS"
  fi

  echo --- wait "$RAMP_UP_TIME" seconds for network throughput to stabilize
@@ -128,27 +132,27 @@ launchTestnet() {
    )'

  declare q_mean_confirmation='
-    SELECT round(mean("duration_ms")) as "mean_confirmation"
+    SELECT round(mean("duration_ms")) as "mean_confirmation_ms"
      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
      WHERE time > now() - '"$TEST_DURATION"'s'

  declare q_max_confirmation='
-    SELECT round(max("duration_ms")) as "max_confirmation"
+    SELECT round(max("duration_ms")) as "max_confirmation_ms"
      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
      WHERE time > now() - '"$TEST_DURATION"'s'

  declare q_99th_confirmation='
-    SELECT round(percentile("duration_ms", 99)) as "99th_confirmation"
+    SELECT round(percentile("duration_ms", 99)) as "99th_percentile_confirmation_ms"
      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
      WHERE time > now() - '"$TEST_DURATION"'s'

-  RESULTS_FILE="$TESTNET_TAG"_SUMMARY_STATS_"$NUMBER_OF_VALIDATOR_NODES".log
  curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
    --data-urlencode "db=${TESTNET_TAG}" \
    --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation" |
-    python system-test/testnet-performance/testnet-automation-json-parser.py >>"$RESULTS_FILE"
+    python system-test/testnet-performance/testnet-automation-json-parser.py >>"$RESULT_FILE"

-  upload-ci-artifact "$RESULTS_FILE"
+  RESULT_DETAILS=$(<"$RESULT_FILE")
+  upload-ci-artifact "$RESULT_FILE"
 }

 cd "$(dirname "$0")/../.."
@@ -169,10 +173,33 @@ fi

 # shellcheck disable=SC1091
 source ci/upload-ci-artifact.sh
+source system-test/testnet-performance/upload_results_to_slack.sh

 maybeClientOptions=${CLIENT_OPTIONS:+"-c"}
 maybeMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"-G"}

 IFS=, read -r -a TESTNET_CLOUD_ZONES <<<"${TESTNET_ZONES}"

+RESULT_FILE="$TESTNET_TAG"_SUMMARY_STATS_"$NUMBER_OF_VALIDATOR_NODES".log
+rm -f $RESULT_FILE
+RESULT_DETAILS="Test failed to finish"
+
+TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \
+                        NUMBER_OF_VALIDATOR_NODES \
+                        VALIDATOR_NODE_MACHINE_TYPE \
+                        NUMBER_OF_CLIENT_NODES \
+                        CLIENT_OPTIONS \
+                        TESTNET_ZONES \
+                        TEST_DURATION \
+                        ADDITIONAL_FLAGS)
+
+TEST_CONFIGURATION=
+for i in "${TEST_PARAMS_TO_DISPLAY[@]}" ; do
+  if [[ -n ${!i} ]] ; then
+    TEST_CONFIGURATION+="${i} = ${!i} | "
+  fi
+done
+
+START_UNIX_MSECS="$(($(date +%s%N)/1000000))"
+
 launchTestnet
--- a/system-test/testnet-performance/upload_results_to_slack.sh
+++ b/system-test/testnet-performance/upload_results_to_slack.sh
@@ -0,0 +1,102 @@
+upload_results_to_slack() {
+  echo --- Uploading results to Slack Performance Results App
+
+  if [[ -z $SLACK_WEBHOOK_URL ]] ; then
+    echo "SLACK_WEBHOOOK_URL undefined"
+    exit 1
+  fi
+
+  [[ -n $BUILDKITE_MESSAGE ]] || BUILDKITE_MESSAGE="Message not defined"
+
+  if [[ -n $BUILDKITE_COMMIT ]] ; then
+    COMMIT_BUTTON_TEXT="$(echo "$BUILDKITE_COMMIT" | head -c 8)"
+    COMMIT_URL="https://github.com/solana-labs/solana/commit/${BUILDKITE_COMMIT}"
+  else
+    COMMIT_BUTTON_TEXT="Commit not defined"
+    COMMIT_URL="https://github.com/solana-labs/solana/commits/master"
+  fi
+
+  if [[ -n $BUILDKITE_BUILD_URL ]] ; then
+    BUILD_BUTTON_TEXT="Build Kite Job"
+  else
+    BUILD_BUTTON_TEXT="Build URL not defined"
+    BUILDKITE_BUILD_URL="https://buildkite.com/solana-labs/"
+  fi
+
+  GRAFANA_URL="https://metrics.solana.com:3000/d/testnet-${CHANNEL:-edge}/testnet-monitor-${CHANNEL:-edge}?var-testnet=${TESTNET_TAG:-testnet-automation}&from=${START_UNIX_MSECS:-0}&to=${FINISH_UNIX_MSECS:-0}"
+
+  [[ -n $RESULT_DETAILS ]] || RESULT_DETAILS="Undefined"
+  [[ -n $TEST_CONFIGURATION ]] || TEST_CONFIGURATION="Undefined"
+
+  payLoad="$(cat <<EOF
+{
+"blocks": [
+ 		{
+			"type": "section",
+			"text": {
+				"type": "mrkdwn",
+				"text": "*New Build: $BUILDKITE_MESSAGE*"
+			}
+		},
+    {
+			"type": "actions",
+			"elements": [
+				{
+					"type": "button",
+					"text": {
+						"type": "plain_text",
+						"text": "$COMMIT_BUTTON_TEXT",
+						"emoji": true
+					},
+					"url": "$COMMIT_URL"
+				},
+        {
+					"type": "button",
+					"text": {
+						"type": "plain_text",
+						"text": "$BUILD_BUTTON_TEXT",
+						"emoji": true
+					},
+					"url": "$BUILDKITE_BUILD_URL"
+				},
+        {
+					"type": "button",
+					"text": {
+						"type": "plain_text",
+						"text": "Grafana",
+						"emoji": true
+					},
+					"url": "$GRAFANA_URL"
+				}
+			]
+		},
+		{
+			"type": "divider"
+    },
+    {
+			"type": "section",
+			"text": {
+				"type": "mrkdwn",
+				"text": "Test Configuration: \n\`\`\`$TEST_CONFIGURATION\`\`\`"
+			}
+		},
+		{
+			"type": "divider"
+		},
+ 		{
+			"type": "section",
+			"text": {
+				"type": "mrkdwn",
+				"text": "Result Details: \n\`\`\`$RESULT_DETAILS\`\`\`"
+			}
+		}
+	]
+}
+EOF
+)"
+
+  curl -X POST \
+  -H 'Content-type: application/json' \
+  --data "$payLoad" \
+  "$SLACK_WEBHOOK_URL"
+}