#!/usr/bin/env bash
#
# Helpers for deploying, sanity-checking, updating and stopping the validator,
# blockstreamer and client machines of a test network.  Every function drives
# the remote hosts over ssh/scp/rsync using the `sshOptions` array.
#
# Names such as startCommon, initLogDir, syncScripts, annotateBlockexplorerUrl,
# build, sshOptions, the *IpList arrays and $metricsWriteDatapoint are not
# defined in the visible text — presumably they come from common.sh or from
# parts of this file that are missing from this view.
#
# NOTE(review): this text appears to have lost two spans (see the notes in
# usage() and checkPremptibleInstances()); the option parsing and the head of
# the top-level command dispatcher are not visible.
set -e

# Directory containing this script, and the repository root one level above it.
here=$(dirname "$0")
SOLANA_ROOT="$(cd "$here"/..; pwd)"

# shellcheck source=net/common.sh
source "$here"/common.sh

#######################################
# Reports an optional error and builds the client-options help text.
# Globals:   exitcode (written), CLIENT_OPTIONS (written)
# Arguments: $* - optional error message; when $1 is non-empty the message is
#                 echoed as "Error: ..." and exitcode becomes 1 (otherwise 0)
#######################################
usage() {
  exitcode=0
  if [[ -n "$1" ]]; then
    exitcode=1
    echo "Error: $*"
  fi

  # Help text for the -c option, captured into a (global) variable.
  CLIENT_OPTIONS=$(cat << EOM
-c clientType=numClients=extraArgs - Number of clientTypes to start. This options can be specified
                                     more than once. Defaults to bench-tps for all clients if not
                                     specified.
                                     Valid client types are:
                                         idle
                                         bench-tps
                                         bench-exchange
                                     User can optionally provide extraArgs that are transparently
                                     supplied to the client program as command line parameters.
                                     For example,
                                         -c bench-tps=2="--tx_count 25000"
                                     This will start 2 bench-tps clients, and supply "--tx_count 25000"
                                     to the bench-tps client.
EOM
)
  # NOTE(review): the text between "cat <" and "/dev/null" looks like it was
  # stripped (probably a here-document with the usage text, followed by more
  # of the file).  As written, nothing is printed here and neither exitcode
  # nor CLIENT_OPTIONS is consumed — confirm against the upstream file.
  cat < /dev/null
}

#######################################
# Deploys binaries to the bootstrap validator and starts it, synchronously.
# All output of the deployment is appended to the given log file; on failure
# the log is dumped to stdout and the script exits with status 1.
# Arguments: $1 - address of the bootstrap validator
#            $2 - node index passed through to remote-node.sh
#            $3 - path of the local log file
#######################################
startBootstrapLeader() {
  declare ipAddress=$1
  declare nodeIndex="$2"
  declare logFile="$3"
  echo "--- Starting bootstrap validator: $ipAddress"
  echo "start log: $logFile"

  # Deploy local binaries to bootstrap validator.  Other validators and clients later fetch the
  # binaries from it
  (
    set -x
    startCommon "$ipAddress" || exit 1
    # Only copy the external primordial accounts file when one was configured.
    [[ -z "$externalPrimordialAccountsFile" ]] || rsync -vPrc -e "ssh ${sshOptions[*]}" "$externalPrimordialAccountsFile" \
      "$ipAddress:$remoteExternalPrimordialAccountsFile"
    case $deployMethod in
    tar)
      # Binaries and version.yml come from the unpacked release tarball.
      rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/solana-release/bin/* "$ipAddress:~/.cargo/bin/"
      rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/solana-release/version.yml "$ipAddress:~/"
      ;;
    local)
      # Binaries come from the local farf/bin directory; the remote
      # version.yml is replaced by an empty file.
      rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/"
      ssh "${sshOptions[@]}" -n "$ipAddress" "rm -f ~/version.yml; touch ~/version.yml"
      ;;
    skip)
      ;;
    *)
      usage "Internal error: invalid deployMethod: $deployMethod"
      ;;
    esac

    # The remote command line is expanded locally; arguments are positional,
    # so their order must match what remote-node.sh expects.
    ssh "${sshOptions[@]}" -n "$ipAddress" \
      "./solana/net/remote/remote-node.sh \
         $deployMethod \
         bootstrap-validator \
         $entrypointIp \
         $((${#validatorIpList[@]} + ${#blockstreamerIpList[@]})) \
         \"$RUST_LOG\" \
         $skipSetup \
         $failOnValidatorBootupFailure \
         \"$remoteExternalPrimordialAccountsFile\" \
         \"$maybeDisableAirdrops\" \
         \"$internalNodesStakeLamports\" \
         \"$internalNodesLamports\" \
         $nodeIndex \
         ${#clientIpList[@]} \"$benchTpsExtraArgs\" \
         ${#clientIpList[@]} \"$benchExchangeExtraArgs\" \
         \"$genesisOptions\" \
         \"$maybeNoSnapshot $maybeSkipLedgerVerify $maybeLimitLedgerSize\" \
         \"$gpuMode\" \
         \"$GEOLOCATION_API_KEY\" \
      "
  ) >> "$logFile" 2>&1 || {
    cat "$logFile"
    echo "^^^ +++"
    exit 1
  }
}

#######################################
# Starts a non-bootstrap node in the background.
# Output goes to $netLogDir/validator-<ip>.log; the background PID is appended
# to the global `pids` array and a validator-<pid>.log symlink to the log is
# created so the log can later be found by PID.
# Globals:   pids (appended), netLogDir (read)
# Arguments: $1 - address of the node
#            $2 - node type (required; e.g. validator or blockstreamer)
#            $3 - node index (required)
#######################################
startNode() {
  declare ipAddress=$1
  declare nodeType=$2
  declare nodeIndex="$3"
  initLogDir
  declare logFile="$netLogDir/validator-$ipAddress.log"

  if [[ -z $nodeType ]]; then
    echo nodeType not specified
    exit 1
  fi

  if [[ -z $nodeIndex ]]; then
    echo nodeIndex not specified
    exit 1
  fi

  echo "--- Starting $nodeType: $ipAddress"
  echo "start log: $logFile"
  (
    set -x
    startCommon "$ipAddress"

    if [[ $nodeType = blockstreamer ]] && [[ -n $letsEncryptDomainName ]]; then
      #
      # Create/renew TLS certificate
      #
      # A previously saved archive (if readable) is uploaded first, then the
      # remote restore script runs, and the resulting archive is fetched back
      # and saved locally for the next deployment.
      declare localArchive=~/letsencrypt-"$letsEncryptDomainName".tgz
      if [[ -r "$localArchive" ]]; then
        timeout 30s scp "${sshOptions[@]}" "$localArchive" "$ipAddress:letsencrypt.tgz"
      fi
      ssh "${sshOptions[@]}" -n "$ipAddress" \
        "sudo -H /certbot-restore.sh $letsEncryptDomainName maintainers@solana.com"
      rm -f letsencrypt.tgz
      timeout 30s scp "${sshOptions[@]}" "$ipAddress:/letsencrypt.tgz" letsencrypt.tgz
      test -s letsencrypt.tgz # Ensure non-empty before overwriting $localArchive
      cp letsencrypt.tgz "$localArchive"
    fi

    # Same positional argument layout as in startBootstrapLeader, with the
    # node type taking the place of the fixed "bootstrap-validator" role.
    ssh "${sshOptions[@]}" -n "$ipAddress" \
      "./solana/net/remote/remote-node.sh \
         $deployMethod \
         $nodeType \
         $entrypointIp \
         $((${#validatorIpList[@]} + ${#blockstreamerIpList[@]})) \
         \"$RUST_LOG\" \
         $skipSetup \
         $failOnValidatorBootupFailure \
         \"$remoteExternalPrimordialAccountsFile\" \
         \"$maybeDisableAirdrops\" \
         \"$internalNodesStakeLamports\" \
         \"$internalNodesLamports\" \
         $nodeIndex \
         ${#clientIpList[@]} \"$benchTpsExtraArgs\" \
         ${#clientIpList[@]} \"$benchExchangeExtraArgs\" \
         \"$genesisOptions\" \
         \"$maybeNoSnapshot $maybeSkipLedgerVerify $maybeLimitLedgerSize\" \
         \"$gpuMode\" \
         \"$GEOLOCATION_API_KEY\" \
      "
  ) >> "$logFile" 2>&1 &
  declare pid=$!
  # Relative symlink inside $netLogDir: validator-<pid>.log -> validator-<ip>.log
  ln -sf "validator-$ipAddress.log" "$netLogDir/validator-$pid.log"
  pids+=("$pid")
}

#######################################
# Starts one client program on a client machine, synchronously.
# Output is appended to $netLogDir/client-<client>-<ip>.log; on failure the
# log is dumped to stdout and the script exits with status 1.
# Arguments: $1 - address of the client machine
#            $2 - client program to run
#            $3 - client index passed through to remote-client.sh (may be empty)
#######################################
startClient() {
  declare ipAddress=$1
  declare clientToRun="$2"
  declare clientIndex="$3"
  initLogDir
  declare logFile="$netLogDir/client-$clientToRun-$ipAddress.log"
  echo "--- Starting client: $ipAddress - $clientToRun"
  echo "start log: $logFile"
  (
    set -x
    startCommon "$ipAddress"
    # ssh -f puts ssh itself into the background once the remote command has
    # been started.
    ssh "${sshOptions[@]}" -f "$ipAddress" \
      "./solana/net/remote/remote-client.sh $deployMethod $entrypointIp \
      $clientToRun \"$RUST_LOG\" \"$benchTpsExtraArgs\" \"$benchExchangeExtraArgs\" $clientIndex"
  ) >> "$logFile" 2>&1 || {
    cat "$logFile"
    echo "^^^ +++"
    exit 1
  }
}

#######################################
# Starts a client on each client machine, up to the smaller of numClients and
# numClientsRequested.  The first numBenchTpsClients machines run
# solana-bench-tps, the next numBenchExchangeClients run solana-bench-exchange
# (indexed from 0 within that group), and any remaining machines run "idle"
# with no client index.
# Globals:   i (written; the loop counter is not local), clientIpList,
#            numClients, numClientsRequested, numBenchTpsClients,
#            numBenchExchangeClients (read)
#######################################
startClients() {
  for ((i=0; i < "$numClients" && i < "$numClientsRequested"; i++)) do
    if [[ $i -lt "$numBenchTpsClients" ]]; then
      startClient "${clientIpList[$i]}" "solana-bench-tps" "$i"
    elif [[ $i -lt $((numBenchTpsClients + numBenchExchangeClients)) ]]; then
      startClient "${clientIpList[$i]}" "solana-bench-exchange" $((i-numBenchTpsClients))
    else
      startClient "${clientIpList[$i]}" "idle"
    fi
  done
}

#######################################
# Runs the remote sanity script on the first validator and, unless skipped,
# on the first blockstreamer.  Exits with status 1 if either check fails.
# Arguments: $1 - any non-empty value skips the blockstreamer check
#######################################
sanity() {
  declare skipBlockstreamerSanity=$1

  # $metricsWriteDatapoint holds a command name; it is invoked with one
  # datapoint string as its argument.
  $metricsWriteDatapoint "testnet-deploy net-sanity-begin=1"

  declare ok=true
  declare bootstrapLeader=${validatorIpList[0]}
  declare blockstreamer=${blockstreamerIpList[0]}

  annotateBlockexplorerUrl

  echo "--- Sanity: $bootstrapLeader"
  (
    set -x
    # shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
    ssh "${sshOptions[@]}" "$bootstrapLeader" \
      "./solana/net/remote/remote-sanity.sh $bootstrapLeader $sanityExtraArgs \"$RUST_LOG\""
  ) || ok=false
  # `ok` holds the name of a builtin (true/false) and is executed as a command.
  $ok || exit 1

  if [[ -z $skipBlockstreamerSanity && -n $blockstreamer ]]; then
    # If there's a blockstreamer node run a reduced sanity check on it as well
    echo "--- Sanity: $blockstreamer"
    (
      set -x
      # shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
      ssh "${sshOptions[@]}" "$blockstreamer" \
        "./solana/net/remote/remote-sanity.sh $blockstreamer $sanityExtraArgs \"$RUST_LOG\""
    ) || ok=false
    $ok || exit 1
  fi

  $metricsWriteDatapoint "testnet-deploy net-sanity-complete=1"
}

#######################################
# Deploys a solana-install update for every platform in $updatePlatforms via
# the first validator.  Returns immediately when no platforms are configured;
# exits with status 1 when no release channel is set or a deployment fails.
# Globals:   updatePlatforms, releaseChannel, validatorIpList (read),
#            updatePlatform (written; the loop variable is not local)
#######################################
deployUpdate() {
  if [[ -z $updatePlatforms ]]; then
    echo "No update platforms"
    return
  fi
  if [[ -z $releaseChannel ]]; then
    echo "Release channel not specified (use -t option)"
    exit 1
  fi

  declare ok=true
  declare bootstrapLeader=${validatorIpList[0]}

  # Unquoted on purpose: $updatePlatforms is a whitespace-separated list.
  for updatePlatform in $updatePlatforms; do
    echo "--- Deploying solana-install update: $updatePlatform"
    (
      set -x

      # The path is relative, so this relies on the current working directory.
      scripts/solana-install-update-manifest-keypair.sh "$updatePlatform"

      timeout 30s scp "${sshOptions[@]}" \
        update_manifest_keypair.json "$bootstrapLeader:solana/update_manifest_keypair.json"

      # shellcheck disable=SC2029 # remote-deploy-update.sh args are expanded on client side intentionally
      ssh "${sshOptions[@]}" "$bootstrapLeader" \
        "./solana/net/remote/remote-deploy-update.sh $releaseChannel $updatePlatform"
    ) || ok=false
    $ok || exit 1
  done
}

#######################################
# Looks up the global $nodeAddress in the validator and blockstreamer lists.
# On success the globals nodeType, nodeIndex and ipAddress describe the match
# (ipAddress equals nodeAddress); nodeIndex counts across both lists, with
# blockstreamers following the validators.  Exits with status 1 when
# nodeAddress is empty or not found.
# Globals:   nodeAddress (read), nodeType, nodeIndex, ipAddress (written)
#######################################
getNodeType() {
  echo "getNodeType: $nodeAddress"
  [[ -n $nodeAddress ]] || {
    echo "Error: nodeAddress not set"
    exit 1
  }
  nodeIndex=0 # <-- global
  nodeType=validator # <-- global

  # The literal "b" is a sentinel marking the boundary between the two lists;
  # it switches the node type and does not advance nodeIndex.
  for ipAddress in "${validatorIpList[@]}" b "${blockstreamerIpList[@]}"; do
    if [[ $ipAddress = b ]]; then
      nodeType=blockstreamer
      continue
    fi

    if [[ $ipAddress = "$nodeAddress" ]]; then
      echo "getNodeType: $nodeType ($nodeIndex)"
      return
    fi
    ((nodeIndex = nodeIndex + 1))
  done

  echo "Error: Unknown node: $nodeAddress"
  exit 1
}

#######################################
# Prepares the software to deploy according to $deployMethod:
#   tar   - optionally downloads the release tarball for $releaseChannel, then
#           unpacks $tarballFilename under $SOLANA_ROOT and prints version.yml
#   local - runs `build` unless $doBuild is false
#   skip  - nothing
# With $deployIfNewer set (tar deployments only), compares the first
# validator's version.yml with the release's and exits 0 when they match.
# Globals:   tarballFilename (written when a release channel is given)
#######################################
prepareDeploy() {
  case $deployMethod in
  tar)
    if [[ -n $releaseChannel ]]; then
      rm -f "$SOLANA_ROOT"/solana-release.tar.bz2
      declare updateDownloadUrl=http://release.solana.com/"$releaseChannel"/solana-release-x86_64-unknown-linux-gnu.tar.bz2
      (
        set -x
        curl --retry 5 --retry-delay 2 --retry-connrefused \
          -o "$SOLANA_ROOT"/solana-release.tar.bz2 "$updateDownloadUrl"
      )
      tarballFilename="$SOLANA_ROOT"/solana-release.tar.bz2
    fi
    (
      set -x
      # Remove any previous extraction before unpacking the tarball from stdin.
      rm -rf "$SOLANA_ROOT"/solana-release
      (cd "$SOLANA_ROOT"; tar jxv) < "$tarballFilename"
      cat "$SOLANA_ROOT"/solana-release/version.yml
    )
    ;;
  local)
    if $doBuild; then
      build
    else
      echo "Build skipped due to --no-build"
    fi
    ;;
  skip)
    ;;
  *)
    usage "Internal error: invalid deployMethod: $deployMethod"
    ;;
  esac

  if [[ -n $deployIfNewer ]]; then
    if [[ $deployMethod != tar ]]; then
      echo "Error: --deploy-if-newer only supported for tar deployments"
      exit 1
    fi

    echo "Fetching current software version"
    (
      set -x
      rsync -vPrc -e "ssh ${sshOptions[*]}" "${validatorIpList[0]}":~/version.yml current-version.yml
    )
    cat current-version.yml
    # diff -q exits non-zero when the files differ, i.e. an update is needed.
    if ! diff -q current-version.yml "$SOLANA_ROOT"/solana-release/version.yml; then
      echo "Cluster software version is old.  Update required"
    else
      echo "Cluster software version is current.  No update required"
      exit 0
    fi
  fi
}

#######################################
# Brings up the whole network: the bootstrap validator first (synchronously),
# then every other validator/blockstreamer in the background, then a sanity
# check, then the clients.  Emits metrics datapoints along the way and prints
# a timing summary at the end.
# Globals:   pids, nodeAddress, nodeType, nodeIndex, additionalNodeDeployTime,
#            clientDeployTime, SECONDS (written)
#######################################
deploy() {
  initLogDir

  echo "Deployment started at $(date)"
  $metricsWriteDatapoint "testnet-deploy net-start-begin=1"

  # True until the first node of the list has been started as the bootstrap
  # validator.
  declare bootstrapLeader=true
  for nodeAddress in "${validatorIpList[@]}" "${blockstreamerIpList[@]}"; do
    # getNodeType fills in nodeType, nodeIndex and ipAddress for nodeAddress.
    nodeType=
    nodeIndex=
    getNodeType
    if $bootstrapLeader; then
      # SECONDS is bash's auto-incrementing timer; resetting it to 0 starts a
      # new measurement.
      SECONDS=0
      declare bootstrapNodeDeployTime=
      startBootstrapLeader "$nodeAddress" $nodeIndex "$netLogDir/bootstrap-validator-$ipAddress.log"
      bootstrapNodeDeployTime=$SECONDS
      $metricsWriteDatapoint "testnet-deploy net-bootnode-leader-started=1"

      bootstrapLeader=false
      SECONDS=0
      pids=()
    else
      startNode "$ipAddress" $nodeType $nodeIndex

      # Stagger additional node start time. If too many nodes start simultaneously
      # the bootstrap node gets more rsync requests from the additional nodes than
      # it can handle.
      sleep 2
    fi
  done

  # Collect the background startNode jobs; a failure is fatal only when
  # $failOnValidatorBootupFailure is true.
  for pid in "${pids[@]}"; do
    declare ok=true
    wait "$pid" || ok=false
    if ! $ok; then
      echo "+++ validator failed to start"
      cat "$netLogDir/validator-$pid.log"
      if $failOnValidatorBootupFailure; then
        exit 1
      else
        echo "Failure is non-fatal"
      fi
    fi
  done

  $metricsWriteDatapoint "testnet-deploy net-validators-started=1"
  additionalNodeDeployTime=$SECONDS

  annotateBlockexplorerUrl

  # The literal word passed here is just a non-empty first argument.
  sanity skipBlockstreamerSanity # skip sanity on blockstreamer node, it may not
                                 # have caught up to the bootstrap validator yet

  echo "--- Sleeping $clientDelayStart seconds after validators are started before starting clients"
  sleep "$clientDelayStart"

  SECONDS=0
  startClients
  clientDeployTime=$SECONDS

  $metricsWriteDatapoint "testnet-deploy net-start-complete=1"

  # Determine the deployed version for reporting: the first "commit: " entry
  # of the release's version.yml for tar deployments, the current git HEAD
  # for local ones, "unknown" otherwise.
  declare networkVersion=unknown
  case $deployMethod in
  tar)
    networkVersion="$(
      (
        set -o pipefail
        grep "^commit: " "$SOLANA_ROOT"/solana-release/version.yml | head -n1 | cut -d\  -f2
      ) || echo "tar-unknown"
    )"
    ;;
  local)
    networkVersion="$(git rev-parse HEAD || echo local-unknown)"
    ;;
  skip)
    ;;
  *)
    usage "Internal error: invalid deployMethod: $deployMethod"
    ;;
  esac
  # Only the first 9 characters of the version are reported.
  $metricsWriteDatapoint "testnet-deploy version=\"${networkVersion:0:9}\""

  echo
  echo "--- Deployment Successful"
  echo "Bootstrap validator deployment took $bootstrapNodeDeployTime seconds"
  echo "Additional validator deployment (${#validatorIpList[@]} validators, ${#blockstreamerIpList[@]} blockstreamer nodes) took $additionalNodeDeployTime seconds"
  echo "Client deployment (${#clientIpList[@]} instances) took $clientDeployTime seconds"
  echo "Network start logs in $netLogDir"
}

#######################################
# Runs the remote cleanup script on one machine, in the background.
# Output goes to $netLogDir/stop-validator-<ip>.log, with a
# stop-validator-<pid>.log symlink pointing at it.
# Globals:   pids (appended when not blocking)
# Arguments: $1 - address of the machine
#            $2 - "true" to wait for the cleanup to finish, "false" to record
#                 the PID in `pids` and return immediately (the value is
#                 executed as a command)
#######################################
stopNode() {
  local ipAddress=$1
  local block=$2

  initLogDir
  declare logFile="$netLogDir/stop-validator-$ipAddress.log"

  echo "--- Stopping node: $ipAddress"
  echo "stop log: $logFile"
  syncScripts "$ipAddress"
  (
    # Since cleanup.sh does a pkill, we cannot pass the command directly,
    # otherwise the process which is doing the killing will be killed because
    # the script itself will match the pkill pattern
    set -x
    # shellcheck disable=SC2029 # It's desired that PS4 be expanded on the client side
    ssh "${sshOptions[@]}" "$ipAddress" "PS4=\"$PS4\" ./solana/net/remote/cleanup.sh"
  ) >> "$logFile" 2>&1 &

  declare pid=$!
  ln -sf "stop-validator-$ipAddress.log" "$netLogDir/stop-validator-$pid.log"
  if $block; then
    wait $pid
  else
    pids+=("$pid")
  fi
}

#######################################
# Stops every validator, blockstreamer and client machine in parallel, waits
# for all of the cleanup jobs (ignoring their exit status) and reports the
# elapsed time.
# Globals:   pids, SECONDS, ipAddress, pid (written)
#######################################
stop() {
  SECONDS=0
  $metricsWriteDatapoint "testnet-deploy net-stop-begin=1"

  declare loopCount=0
  pids=()
  for ipAddress in "${validatorIpList[@]}" "${blockstreamerIpList[@]}" "${clientIpList[@]}"; do
    stopNode "$ipAddress" false

    # Stagger additional node stop time to avoid too many concurrent ssh
    # sessions
    # The test uses the pre-increment value, so the sleep happens after the
    # 1st, 5th, 9th, ... machine.
    ((loopCount++ % 4 == 0)) && sleep 2
  done

  echo --- Waiting for nodes to finish stopping
  for pid in "${pids[@]}"; do
    echo -n "$pid "
    wait "$pid" || true
  done
  echo

  $metricsWriteDatapoint "testnet-deploy net-stop-complete=1"
  echo "Stopping nodes took $SECONDS seconds"
}

#######################################
# Pings every validator once (5 second timeout) to detect machines that have
# disappeared.
#
# NOTE(review): the failure handler of this function is truncated in this
# view.  The text after "cat <" looks like the remainder of a later
# here-document/redirect belonging to a netem command, followed by the tail
# of the top-level command dispatcher (`*) ... esac`) whose head is not
# visible.  The remaining tokens are kept verbatim below; restore the missing
# span from the upstream file before relying on this part.
#######################################
checkPremptibleInstances() {
  # The validatorIpList nodes may be preemptible instances that can disappear at
  # any time.  Try to detect when a validator has been preempted to help the user
  # out.
  #
  # Of course this isn't airtight as an instance could always disappear
  # immediately after it's successfully pinged.
  for ipAddress in "${validatorIpList[@]}"; do
    (
      set -x
      timeout 5s ping -c 1 "$ipAddress" | tr - _
    ) || {
      cat < solana/netem.cfg; solana/scripts/netem.sh add \"$netemConfig\""
    done
  fi
  ;;
*)
  echo "Internal error: Unknown command: $command"
  usage
  exit 1
esac