From 66ff6026593d239f88a1b6bbb28aad0fa9932a1f Mon Sep 17 00:00:00 2001 From: Michael Vines Date: Thu, 6 Sep 2018 10:34:24 -0700 Subject: [PATCH] Rewrite ci/testnet-{deploy,sanity}.sh in terms of net/ primitives --- ci/testnet-deploy.sh | 549 +++++++------------------------------------ ci/testnet-sanity.sh | 88 ++----- net/gce.sh | 2 +- 3 files changed, 106 insertions(+), 533 deletions(-) diff --git a/ci/testnet-deploy.sh b/ci/testnet-deploy.sh index c0189bc76..d430fe479 100755 --- a/ci/testnet-deploy.sh +++ b/ci/testnet-deploy.sh @@ -1,484 +1,103 @@ #!/bin/bash -e -# -# Deploys the Solana software running on the testnet full nodes -# -# This script must be run by a user/machine that has successfully authenticated -# with GCP and has sufficient permission. -# -here=$(dirname "$0") -metrics_write_datapoint="$here"/../scripts/metrics-write-datapoint.sh +cd "$(dirname "$0")"/.. -# shellcheck source=scripts/gcloud.sh -source "$here"/../scripts/gcloud.sh +zone= +leaderAddress= +clientNodeCount=0 +publicNetwork=false +snapChannel=edge +delete=false -# TODO: Switch over to rolling updates -ROLLING_UPDATE=false -#ROLLING_UPDATE=true +usage() { + exitcode=0 + if [[ -n "$1" ]]; then + exitcode=1 + echo "Error: $*" + fi + cat <> "log-$vmName.txt" 2>&1 & - declare pid=$! - - # Rename log file so it can be discovered later by $pid - mv "log-$vmName.txt" "log-$pid.txt" - pids+=("$pid") -} - - -echo "Validator nodes (unverified):" -findVms validator "name~^$SOLANA_NET_NAME-validator-" -pids=() -vm_foreach_in_class validator delete_unreachable_validators -wait_for_pids validator sanity check -vmlist=() - -echo "Leader node:" -findVms leader "name=$SOLANA_NET_NAME" -[[ ${#vmlist[@]} = 1 ]] || { - echo "Unable to find $SOLANA_NET_NAME" - exit 1 -} - -echo "Client node(s):" -findVms client "name~^$SOLANA_NET_NAME-client" - -echo "Validator nodes:" -findVms validator "name~^$SOLANA_NET_NAME-validator-" - -fullnode_count=0 -inc_fullnode_count() { - fullnode_count=$((fullnode_count + 1)) -} -vm_foreach_in_class leader inc_fullnode_count -vm_foreach_in_class validator inc_fullnode_count - -# Add "network stopping" datapoint -$metrics_write_datapoint "testnet-deploy,name=$netBasename stop=1" - -client_start() { - declare vmName=$1 - declare vmZone=$2 - declare vmPublicIp=$3 - declare count=$4 - - nodeConfig="\ - rust-log=$RUST_LOG \ - default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \ - metrics-config=$SOLANA_METRICS_CONFIG \ - setup-args=$SOLANA_SETUP_ARGS \ - leader-ip=$publicIp \ - " - - vm_exec "$vmName" "$vmZone" "$vmPublicIp" \ - "Starting client $count:" \ - "\ - set -x; - sudo snap set solana $nodeConfig; \ - snap info solana; \ - sudo snap get solana; \ - threadCount=\$(nproc); \ - if [[ \$threadCount -gt 4 ]]; then threadCount=4; fi; \ - tmux kill-session -t solana; \ - tmux new -s solana -d \" \ - set -x; \ - sudo rm /tmp/solana.log; \ - while : ; do \ - /snap/bin/solana.bench-tps --num-nodes $fullnode_count --seconds 600 --sustained --threads \$threadCount 2>&1 | tee -a /tmp/solana.log; \ - echo 'https://metrics.solana.com:8086/write?db=${INFLUX_DATABASE}&u=${INFLUX_USERNAME}&p=${INFLUX_PASSWORD}' \ - | xargs curl --max-time 5 -XPOST --data-binary 'testnet-deploy,name=$netBasename clientexit=1'; \ - echo Error: bench-tps should never exit | tee -a /tmp/solana.log; \ - done; \ - bash \ - \"; \ - sleep 2; \ - tmux capture-pane -t solana -p -S -100; \ - tail /tmp/solana.log; \ - " -} - -client_stop() { - declare vmName=$1 - declare vmZone=$2 - declare vmPublicIp=$3 - declare count=$4 - - touch "log-$vmName.txt" - ( - SECONDS=0 - vm_exec "$vmName" "$vmZone" "$vmPublicIp" \ - "Stopping client $vmName ($count):" \ - "\ - set -x; - tmux list-sessions; \ - tmux capture-pane -t solana -p; \ - tmux kill-session -t solana; \ - $SNAP_INSTALL_CMD; \ - sudo snap set solana metrics-config=$SOLANA_METRICS_CONFIG \ - rust-log=$RUST_LOG \ - default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \ - ; \ - " - echo "Client stopped in ${SECONDS} seconds" - ) >> "log-$vmName.txt" 2>&1 & - declare pid=$! - - # Rename log file so it can be discovered later by $pid - mv "log-$vmName.txt" "log-$pid.txt" - pids+=("$pid") -} - -fullnode_start() { - declare class=$1 - declare vmName=$2 - declare vmZone=$3 - declare vmPublicIp=$4 - declare count=$5 - - touch "log-$vmName.txt" - ( - SECONDS=0 - commonNodeConfig="\ - rust-log=$RUST_LOG \ - default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \ - metrics-config=$SOLANA_METRICS_CONFIG \ - setup-args=$SOLANA_SETUP_ARGS \ - leader-ip=$publicIp \ - " - if [[ $class = leader ]]; then - nodeConfig="mode=leader+drone $commonNodeConfig" - if [[ -n $SOLANA_CUDA ]]; then - nodeConfig="$nodeConfig enable-cuda=1" - fi - else - nodeConfig="mode=validator $commonNodeConfig" - fi - - vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Starting $class $count:" \ - "\ - set -ex; \ - logmarker='solana deploy $(date)/$RANDOM'; \ - logger \"\$logmarker\"; \ - $SNAP_INSTALL_CMD; \ - sudo snap set solana $nodeConfig; \ - snap info solana; \ - sudo snap get solana; \ - echo Slight delay to get more syslog output; \ - sleep 2; \ - sudo grep -Pzo \"\$logmarker(.|\\n)*\" /var/log/syslog \ - " - echo "Succeeded in ${SECONDS} seconds" - ) >> "log-$vmName.txt" 2>&1 & - declare pid=$! - - # Rename log file so it can be discovered later by $pid - mv "log-$vmName.txt" "log-$pid.txt" - - pids+=("$pid") -} - -leader_start() { - fullnode_start leader "$@" -} - -validator_start() { - fullnode_start validator "$@" -} - -fullnode_stop() { - declare vmName=$1 - declare vmZone=$2 - declare vmPublicIp=$3 - declare count=$4 - - touch "log-$vmName.txt" - ( - SECONDS=0 - # Try to ping the machine first. When a machine (validator) is restarted, - # there can be a delay between when the instance is reported as RUNNING and when - # it's reachable over the network - timeout 30s bash -c "set -o pipefail; until ping -c 3 $vmPublicIp | tr - _; do echo .; done" - vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Shutting down" "\ - if snap list solana; then \ - sudo snap set solana mode=; \ - fi" - echo "Succeeded in ${SECONDS} seconds" - ) >> "log-$vmName.txt" 2>&1 & - declare pid=$! - - # Rename log file so it can be discovered later by $pid - mv "log-$vmName.txt" "log-$pid.txt" - - pids+=("$pid") -} - -if [[ -n $LOCAL_SNAP ]]; then - echo "--- Transferring $LOCAL_SNAP to node(s)" - - transfer_local_snap() { - declare vmName=$1 - declare vmZone=$2 - declare vmPublicIp=$3 - declare vmClass=$4 - declare count=$5 - - echo "--- $vmName in zone $vmZone ($count)" - SECONDS=0 - scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ - "$LOCAL_SNAP" testnet-deploy@"$vmPublicIp":solana_local.snap - echo "Succeeded in ${SECONDS} seconds" - } - vm_foreach transfer_local_snap -fi - -echo "--- Stopping client node(s)" -pids=() -vm_foreach_in_class client client_stop -client_stop_pids=("${pids[@]}") - -if ! $ROLLING_UPDATE; then - pids=() - echo "--- Shutting down all full nodes" - vm_foreach_in_class leader fullnode_stop - vm_foreach_in_class validator fullnode_stop - wait_for_pids fullnode shutdown -fi - -pids=() -echo --- Starting leader node -vm_foreach_in_class leader leader_start -wait_for_pids leader - -pids=() -echo --- Starting validator nodes -vm_foreach_in_class validator validator_start -wait_for_pids validators - -echo "--- $publicUrl sanity test" -if [[ -z $CI ]]; then - # TODO: ssh into a node and run testnet-sanity.sh there. It's not safe to - # assume the correct Snap is installed on the current non-CI machine - echo Skipped for non-CI deploy - snapVersion=unknown -else - ( - set -x - USE_SNAP=1 ci/testnet-sanity.sh $publicUrl $fullnode_count - ) - IFS=\ read -r _ snapVersion _ < <(snap info solana | grep "^installed:") - snapVersion=${snapVersion/0+git./} -fi - -pids=("${client_stop_pids[@]}") -wait_for_pids client shutdown -vm_foreach_in_class client client_start - -# Add "network started" datapoint -$metrics_write_datapoint "testnet-deploy,name=$netBasename start=1,version=\"$snapVersion\"" +time net/gce.sh create "${gce_create_args[@]}" +net/init-metrics.sh -e +time net/net.sh start -s "$snapChannel" exit 0 diff --git a/ci/testnet-sanity.sh b/ci/testnet-sanity.sh index 213973480..77ff1648c 100755 --- a/ci/testnet-sanity.sh +++ b/ci/testnet-sanity.sh @@ -1,78 +1,32 @@ #!/bin/bash -e -# -# Perform a quick sanity test on the specific testnet -# cd "$(dirname "$0")/.." -source scripts/configure-metrics.sh -NET_URL=$1 -if [[ -z $NET_URL ]]; then - NET_URL=testnet.solana.com -fi - -EXPECTED_NODE_COUNT=$2 -if [[ -z $EXPECTED_NODE_COUNT ]]; then - EXPECTED_NODE_COUNT=50 -fi - -echo "--- $NET_URL: verify ledger" -if [[ -z $NO_LEDGER_VERIFY ]]; then - if [[ -d /var/snap/solana/current/config/ledger ]]; then - # Note: here we assume this script is actually running on the leader node... - ( - set -x - rm -rf /var/tmp/ledger-verify - cp -r /var/snap/solana/current/config/ledger /var/tmp/ledger-verify - solana.ledger-tool --ledger /var/tmp/ledger-verify verify - ) - else - echo "^^^ +++" - echo "Ledger verify skipped" +usage() { + exitcode=0 + if [[ -n "$1" ]]; then + exitcode=1 + echo "Error: $*" fi -else - echo "^^^ +++" - echo "Ledger verify skipped (NO_LEDGER_VERIFY defined)" -fi + cat <&1 | tee validator.log - ) - wc -l validator.log - if grep -C100 panic validator.log; then - echo "^^^ +++" - echo "Panic observed" - exit 1 - else - echo "Validator log looks ok" - fi -else - echo "^^^ +++" - echo "Validator sanity disabled (NO_VALIDATOR_SANITY defined)" -fi +netName=$1 +[[ -n $netName ]] || usage "" + +set -x +net/gce.sh config -p "$netName" +net/init-metrics.sh -e +net/net.sh sanity ${NO_LEDGER_VERIFY:+-o noLedgerVerify} exit 0 diff --git a/net/gce.sh b/net/gce.sh index 1aaf0923e..1f2dbc7a3 100755 --- a/net/gce.sh +++ b/net/gce.sh @@ -215,7 +215,7 @@ create) gcloud_CreateInstances "$prefix-validator" "$validatorNodeCount" "$zone" \ "$imageName" "$validatorMachineType" "$validatorBootDiskSize" "$validatorAccelerator" \ "$here/remote/remote-startup.sh" "" - if [[ -n $clientNodeCount ]]; then + if [[ $clientNodeCount -gt 0 ]]; then gcloud_CreateInstances "$prefix-client" "$clientNodeCount" "$zone" \ "$imageName" "$clientMachineType" "$clientBootDiskSize" "$clientAccelerator" \ "$here/remote/remote-startup.sh" ""