diff --git a/ci/testnet-deploy.sh b/ci/testnet-deploy.sh index f5e3952cf4..d6e9267f98 100755 --- a/ci/testnet-deploy.sh +++ b/ci/testnet-deploy.sh @@ -51,120 +51,144 @@ echo "Network entrypoint URL: $publicUrl ($publicIp)" echo "Snap channel: $SOLANA_SNAP_CHANNEL" leaderName=${publicUrl//./-} -vmlist=() + +vmlist=() # Each array element is the triple "class:vmName:vmZone" + +# +# vm_foreach_in_class [class] [cmd] +# where +# class - the desired VM class to operate on +# cmd - the command to execute on each VM in the desired class. +# The command will receive three arguments: +# vmName - GCP name of the VM +# vmZone - The GCP zone the VM is located in +# count - Monotonically increasing count for each +# invocation of cmd, starting at 1 +# +# +vm_foreach_in_class() { + declare class=$1 + declare cmd=$2 + + declare count=1 + for info in "${vmlist[@]}"; do + declare vmClass vmName vmZone + IFS=: read -r vmClass vmName vmZone < <(echo "$info") + + if [[ $class = "$vmClass" ]]; then + eval "$cmd" "$vmName" "$vmZone" "$count" + count=$((count + 1)) + fi + done +} + findVms() { - declare filter="$1" + declare class="$1" + declare filter="$2" gcloud compute instances list --filter="$filter" while read -r vmName vmZone status; do if [[ $status != RUNNING ]]; then echo "Warning: $vmName is not RUNNING, ignoring it." continue fi - vmlist+=("$vmName:$vmZone") + vmlist+=("$class:$vmName:$vmZone") done < <(gcloud compute instances list --filter="$filter" --format 'value(name,zone,status)') } -wait_for_node() { - declare pid=$1 - - declare ok=true - wait "$pid" || ok=false - cat "log-$pid.txt" - if ! $ok; then - echo ^^^ +++ - exit 1 - fi -} - echo "Leader node:" -findVms "name=$leaderName" +findVms leader "name=$leaderName" [[ ${#vmlist[@]} = 1 ]] || { echo "Unable to find $leaderName" exit 1 } -echo "Client node:" -findVms "name=$leaderName-client" -clientVm= -if [[ ${#vmlist[@]} = 2 ]]; then - clientVm=${vmlist[1]} - unset 'vmlist[1]' -fi +echo "Client node(s):" +findVms client "name~^$leaderName-client" echo "Validator nodes:" -findVms "name~^$leaderName-validator-" - -if ! $ROLLING_UPDATE; then - count=1 - for info in "${vmlist[@]}"; do - nodePosition="($count/${#vmlist[*]})" - vmName=${info%:*} - vmZone=${info#*:} - echo "--- Shutting down $vmName in zone $vmZone $nodePosition" - gcloud compute ssh "$vmName" --zone "$vmZone" \ - --ssh-flag="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \ - --command="sudo snap remove solana" & - - if [[ $((count % 5)) = 0 ]]; then - # Slow down deployment to avoid triggering GCP login - # quota limits (each |ssh| counts as a login) - sleep 3 - fi - - count=$((count + 1)) - done - - wait -fi +findVms validator "name~^$leaderName-validator-" # Add "network stopping" datapoint netName=${SOLANA_NET_URL%testnet.solana.com} netName=${netName:0:8} ci/metrics_write_datapoint.sh "testnet-deploy,name=\"$netName\" stop=1" -client_run() { - declare message=$1 - declare cmd=$2 - [[ -n $clientVm ]] || return 0; - vmName=${clientVm%:*} - vmZone=${clientVm#*:} +gcp_vm_exec() { + declare vmName=$1 + declare vmZone=$2 + declare message=$3 + declare cmd=$4 + echo "--- $message $vmName in zone $vmZone" gcloud compute ssh "$vmName" --zone "$vmZone" \ --ssh-flag="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \ --command="$cmd" } -client_run \ - "Shutting down" \ - "\ - set -x; - tmux list-sessions; \ - tmux capture-pane -t solana -p; \ - tmux kill-session -t solana; \ - sudo snap remove solana; \ +gcp_login_quota_workaround() { + declare count=$1 + + if [[ $((count % 5)) = 0 ]]; then + # Slow down deployment to avoid triggering GCP login + # quota limits (each |ssh| counts as a login) + sleep 3 + fi +} + +client_start() { + declare vmName=$1 + declare vmZone=$2 + declare count=$3 + + gcp_vm_exec "$vmName" "$vmZone" \ + "Starting client $count:" \ + "\ + set -x; + sudo snap install solana --$SOLANA_SNAP_CHANNEL --devmode; \ + sudo snap set solana metrics-config=$SOLANA_METRICS_CONFIG; \ + snap info solana; \ + tmux new -s solana -d \" \ + /snap/bin/solana.bench-tps $SOLANA_NET_URL ${#vmlist[@]} --loop 2>&1 | tee /tmp/solana.log; \ + echo Error: bench-tps should never exit; \ + bash \ + \"; \ + sleep 2; \ + tmux capture-pane -t solana -p -S -100; \ + tail /tmp/solana.log; \ " +} -echo "--- Refreshing leader" -leader=true -pids=() -count=1 -for info in "${vmlist[@]}"; do - nodePosition="($count/${#vmlist[*]})" +client_stop() { + declare vmName=$1 + declare vmZone=$2 + declare count=$3 - vmName=${info%:*} - vmZone=${info#*:} - echo "Starting refresh for $vmName $nodePosition" + gcp_vm_exec "$vmName" "$vmZone" \ + "Stopping client $count:" \ + "\ + set -x; + tmux list-sessions; \ + tmux capture-pane -t solana -p; \ + tmux kill-session -t solana; \ + sudo snap remove solana; \ + " +} + +fullnode_start() { + declare class=$1 + declare vmName=$2 + declare vmZone=$3 + declare count=$4 ( SECONDS=0 - echo "--- $vmName in zone $vmZone $nodePosition" commonNodeConfig="\ rust-log=$RUST_LOG \ default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \ metrics-config=$SOLANA_METRICS_CONFIG \ " - if $leader; then + if [[ $class = leader ]]; then nodeConfig="mode=leader+drone $commonNodeConfig" if [[ -n $SOLANA_CUDA ]]; then nodeConfig="$nodeConfig enable-cuda=1" @@ -174,9 +198,8 @@ for info in "${vmlist[@]}"; do fi set -x - gcloud compute ssh "$vmName" --zone "$vmZone" \ - --ssh-flag="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -t" \ - --command="\ + gcp_vm_exec "$vmName" "$vmZone" "Starting $class $count:" \ + "\ set -ex; \ logmarker='solana deploy $(date)/$RANDOM'; \ sudo snap remove solana; \ @@ -190,55 +213,96 @@ for info in "${vmlist[@]}"; do " echo "Succeeded in ${SECONDS} seconds" ) > "log-$vmName.txt" 2>&1 & - pid=$! + declare pid=$! + # Rename log file so it can be discovered later by $pid + while [[ ! -f "log-$vmName.txt" ]]; do + sleep 1 + done mv "log-$vmName.txt" "log-$pid.txt" - if $leader; then - echo Waiting for leader... - # Wait for the leader to initialize before starting the validators - # TODO: Remove this limitation eventually. - wait_for_node "$pid" + gcp_login_quota_workaround "$count" + pids+=("$pid") +} - echo "--- Refreshing validators" - else - # Slow down deployment to ~20 machines a minute to avoid triggering GCP login - # quota limits (each |ssh| counts as a login) - sleep 3 +leader_start() { + fullnode_start leader "$@" +} - pids+=("$pid") - fi - leader=false - count=$((count + 1)) -done +validator_start() { + fullnode_start validator "$@" +} -echo --- Waiting for validators -for pid in "${pids[@]}"; do - wait_for_node "$pid" -done +fullnode_stop() { + declare vmName=$1 + declare vmZone=$2 + declare count=$3 + + ( + SECONDS=0 + gcp_vm_exec "$vmName" "$vmZone" "Shutting down" "sudo snap remove solana" + echo "Succeeded in ${SECONDS} seconds" + ) > "log-$vmName.txt" 2>&1 & + declare pid=$! + + # Rename log file so it can be discovered later by $pid + while [[ ! -f "log-$vmName.txt" ]]; do + sleep 1 + done + mv "log-$vmName.txt" "log-$pid.txt" + + gcp_login_quota_workaround "$count" + pids+=("$pid") +} + +wait_for_pids() { + echo "--- Waiting for $*" + for pid in "${pids[@]}"; do + declare ok=true + wait "$pid" || ok=false + cat "log-$pid.txt" + if ! $ok; then + echo ^^^ +++ + exit 1 + fi + done +} + + +vm_foreach_in_class client client_stop + +if ! $ROLLING_UPDATE; then + pids=() + echo "--- Shutting down all full nodes" + vm_foreach_in_class leader fullnode_stop + vm_foreach_in_class validator fullnode_stop + wait_for_pids fullnode shutdown +fi + +pids=() +echo --- Starting leader node +vm_foreach_in_class leader leader_start +wait_for_pids leader + +pids=() +echo --- Starting validator nodes +vm_foreach_in_class validator validator_start +wait_for_pids validators echo "--- $publicUrl sanity test" ( + fullnode_count=0 + inc_fullnode_count() { + fullnode_count=$((fullnode_count + 1)) + } + vm_foreach_in_class leader inc_fullnode_count + vm_foreach_in_class validator inc_fullnode_count + set -x - USE_SNAP=1 ci/testnet-sanity.sh $publicUrl ${#vmlist[@]} + USE_SNAP=1 ci/testnet-sanity.sh $publicUrl $fullnode_count ) -client_run \ - "Starting client on " \ - "\ - set -x; - sudo snap install solana --$SOLANA_SNAP_CHANNEL --devmode; \ - sudo snap set solana metrics-config=$SOLANA_METRICS_CONFIG; \ - snap info solana; \ - tmux new -s solana -d \" \ - /snap/bin/solana.bench-tps $SOLANA_NET_URL ${#vmlist[@]} --loop 2>&1 | tee /tmp/solana.log; \ - echo Error: bench-tps should never exit; \ - bash \ - \"; \ - sleep 2; \ - tmux capture-pane -t solana -p -S -100; \ - tail /tmp/solana.log; \ - " +vm_foreach_in_class client client_start # Add "network started" datapoint ci/metrics_write_datapoint.sh "testnet-deploy,name=\"$netName\" start=1"