solana/net/net.sh

#!/usr/bin/env bash
set -e

here=$(dirname "$0")
SOLANA_ROOT="$(cd "$here"/..; pwd)"

# shellcheck source=net/common.sh
source "$here"/common.sh

usage() {
  exitcode=0
  if [[ -n "$1" ]]; then
    exitcode=1
    echo "Error: $*"
  fi
  cat <<EOF
usage: $0 [start|stop|restart|sanity] [command-specific options]

Operate a configured testnet

 start    - Start the network
 sanity   - Sanity check the network
 stop     - Stop the network
 restart  - Shortcut for stop then start
 logs     - Fetch remote logs from each network node
 startnode- Start an individual node (previously stopped with stopNode)
 stopnode - Stop an individual node
 update   - Deploy a new software update to the cluster

 start-specific options:
   -T [tarFilename]                   - Deploy the specified release tarball
   -t edge|beta|stable|vX.Y.Z         - Deploy the latest tarball release for the
                                        specified release channel (edge|beta|stable) or release tag
                                        (vX.Y.Z)
   -r / --skip-setup                  - Reuse existing node/ledger configuration from a
                                        previous |start| (ie, don't run ./multinode-demo/setup.sh).
   -d / --debug                       - Build/deploy the testnet with debug binaries
   -c clientType=numClients=extraArgs - Number of clientTypes to start.  This options can be specified
                                        more than once.  Defaults to bench-tps for all clients if not
                                        specified.
                                        Valid client types are:
                                            idle
                                            bench-tps
                                            bench-exchange
                                        User can optionally provide extraArgs that are transparently
                                        supplied to the client program as command line parameters.
                                        For example,
                                            -c bench-tps=2="--tx_count 25000"
                                        This will start 2 bench-tps clients, and supply "--tx_count 25000"
                                        to the bench-tps client.
   -n NUM_VALIDATORS                  - Number of validators to apply command to.
   --gpu-mode GPU_MODE                - Specify GPU mode to launch validators with (default: $gpuMode).
                                        MODE must be one of
                                          on - GPU *required*, any vendor *
                                          off - No GPU, CPU-only
                                          auto - Use GPU if available, any vendor *
                                          cuda - GPU *required*, Nvidia CUDA only
                                          *  Currently, Nvidia CUDA is the only supported GPU vendor
   --hashes-per-tick NUM_HASHES|sleep|auto
                                      - Override the default --hashes-per-tick for the cluster
   --no-airdrop
                                      - If set, disables airdrops.  Nodes must be funded in genesis config when airdrops are disabled.
   --faucet-lamports NUM_LAMPORTS_TO_MINT
                                      - Override the default 500000000000000000 lamports minted in genesis
   --internal-nodes-stake-lamports NUM_LAMPORTS_PER_NODE
                                      - Amount to stake internal nodes.
   --internal-nodes-lamports NUM_LAMPORTS_PER_NODE
                                      - Amount to fund internal nodes in genesis config.
   --external-accounts-file FILE_PATH
                                      - A YML file with a list of account pubkeys and corresponding lamport balances
                                        in genesis config for external nodes
   --no-snapshot-fetch
                                      - If set, disables booting validators from a snapshot
   --skip-poh-verify
                                      - If set, validators will skip verifying
                                        the ledger they already have saved to disk at
                                        boot (results in a much faster boot)
   --no-deploy
                                      - Don't deploy new software, use the
                                        existing deployment
   --no-build
                                      - Don't build new software, deploy the
                                        existing binaries

   --deploy-if-newer                  - Only deploy if newer software is
                                        available (requires -t or -T)

   --use-move                         - Build the move-loader-program and add it to the cluster

   --operating-mode development|softlaunch
                                      - Specify whether or not to launch the cluster in "development" mode with all features enabled at epoch 0,
                                        or "softlaunch" mode with some features disabled at epoch 0 (default: development)

 sanity/start-specific options:
   -F                   - Discard validator nodes that didn't bootup successfully
   -o noInstallCheck    - Skip solana-install sanity
   -o rejectExtraNodes  - Require the exact number of nodes

 stop-specific options:
   none

 logs-specific options:
   none

 netem-specific options:
   --config            - Netem configuration (as a double quoted string)
   --parition          - Percentage of network that should be configured with netem
   --config-file       - Configuration file for partition and netem configuration
   --netem-cmd         - Optional command argument to netem. Default is "add". Use "cleanup" to remove rules.

 update-specific options:
   --platform linux|osx|windows       - Deploy the tarball using 'solana-install deploy ...' for the
                                        given platform (multiple platforms may be specified)
                                        (-t option must be supplied as well)

 startnode/stopnode-specific options:
   -i [ip address]                    - IP Address of the node to start or stop

Note: if RUST_LOG is set in the environment it will be propogated into the
      network nodes.
EOF
  exit $exitcode
}

releaseChannel=
deployMethod=local
deployIfNewer=
sanityExtraArgs=
skipSetup=false
updatePlatforms=
nodeAddress=
numIdleClients=0
numBenchTpsClients=0
numBenchExchangeClients=0
benchTpsExtraArgs=
benchExchangeExtraArgs=
failOnValidatorBootupFailure=true
genesisOptions=
numValidatorsRequested=
externalPrimordialAccountsFile=
remoteExternalPrimordialAccountsFile=
internalNodesStakeLamports=
internalNodesLamports=
maybeNoSnapshot=""
maybeLimitLedgerSize=""
maybeSkipLedgerVerify=""
maybeDisableAirdrops=""
debugBuild=false
doBuild=true
gpuMode=auto
maybeUseMove=""
netemPartition=""
netemConfig=""
netemConfigFile=""
netemCommand="add"

command=$1
[[ -n $command ]] || usage
shift

shortArgs=()
while [[ -n $1 ]]; do
  if [[ ${1:0:2} = -- ]]; then
    if [[ $1 = --hashes-per-tick ]]; then
      genesisOptions="$genesisOptions $1 $2"
      shift 2
    elif [[ $1 = --slots-per-epoch ]]; then
      genesisOptions="$genesisOptions $1 $2"
      shift 2
    elif [[ $1 = --target-lamports-per-signature ]]; then
      genesisOptions="$genesisOptions $1 $2"
      shift 2
    elif [[ $1 = --faucet-lamports ]]; then
      genesisOptions="$genesisOptions $1 $2"
      shift 2
    elif [[ $1 = --operating-mode ]]; then
      case "$2" in
        development|softlaunch)
          ;;
        *)
          echo "Unexpected operating mode: \"$2\""
          exit 1
          ;;
      esac
      genesisOptions="$genesisOptions $1 $2"
      shift 2
    elif [[ $1 = --no-snapshot-fetch ]]; then
      maybeNoSnapshot="$1"
      shift 1
    elif [[ $1 = --deploy-if-newer ]]; then
      deployIfNewer=1
      shift 1
    elif [[ $1 = --no-deploy ]]; then
      deployMethod=skip
      shift 1
    elif [[ $1 = --no-build ]]; then
      doBuild=false
      shift 1
    elif [[ $1 = --limit-ledger-size ]]; then
      maybeLimitLedgerSize="$1"
      shift 1
    elif [[ $1 = --skip-poh-verify ]]; then
      maybeSkipLedgerVerify="$1"
      shift 1
    elif [[ $1 = --skip-setup ]]; then
      skipSetup=true
      shift 1
    elif [[ $1 = --platform ]]; then
      updatePlatforms="$updatePlatforms $2"
      shift 2
    elif [[ $1 = --internal-nodes-stake-lamports ]]; then
      internalNodesStakeLamports="$2"
      shift 2
    elif [[ $1 = --internal-nodes-lamports ]]; then
      internalNodesLamports="$2"
      shift 2
    elif [[ $1 = --external-accounts-file ]]; then
      externalPrimordialAccountsFile="$2"
      remoteExternalPrimordialAccountsFile=/tmp/external-primordial-accounts.yml
      shift 2
    elif [[ $1 = --no-airdrop ]]; then
      maybeDisableAirdrops="$1"
      shift 1
    elif [[ $1 = --debug ]]; then
      debugBuild=true
      shift 1
    elif [[ $1 = --use-move ]]; then
      maybeUseMove=$1
      shift 1
    elif [[ $1 = --partition ]]; then
      netemPartition=$2
      shift 2
    elif [[ $1 = --config ]]; then
      netemConfig=$2
      shift 2
    elif [[ $1 == --config-file ]]; then
      netemConfigFile=$2
      shift 2
    elif [[ $1 == --netem-cmd ]]; then
      netemCommand=$2
      shift 2
    elif [[ $1 = --gpu-mode ]]; then
      gpuMode=$2
      case "$gpuMode" in
        on|off|auto|cuda)
          ;;
        *)
          echo "Unexpected GPU mode: \"$gpuMode\""
          exit 1
          ;;
      esac
      shift 2
    else
      usage "Unknown long option: $1"
    fi
  else
    shortArgs+=("$1")
    shift
  fi
done

while getopts "h?T:t:o:f:rc:Fn:i:d" opt "${shortArgs[@]}"; do
  case $opt in
  h | \?)
    usage
    ;;
  T)
    tarballFilename=$OPTARG
    [[ -r $tarballFilename ]] || usage "File not readable: $tarballFilename"
    deployMethod=tar
    ;;
  t)
    case $OPTARG in
    edge|beta|stable|v*)
      releaseChannel=$OPTARG
      deployMethod=tar
      ;;
    *)
      usage "Invalid release channel: $OPTARG"
      ;;
    esac
    ;;
  n)
    numValidatorsRequested=$OPTARG
    ;;
  r)
    skipSetup=true
    ;;
  o)
    case $OPTARG in
    rejectExtraNodes|noInstallCheck)
      sanityExtraArgs="$sanityExtraArgs -o $OPTARG"
      ;;
    *)
      usage "Unknown option: $OPTARG"
      ;;
    esac
    ;;
  c)
    getClientTypeAndNum() {
      if ! [[ $OPTARG == *'='* ]]; then
        echo "Error: Expecting tuple \"clientType=numClientType=extraArgs\" but got \"$OPTARG\""
        exit 1
      fi
      local keyValue
      IFS='=' read -ra keyValue <<< "$OPTARG"
      local clientType=${keyValue[0]}
      local numClients=${keyValue[1]}
      local extraArgs=${keyValue[2]}
      re='^[0-9]+$'
      if ! [[ $numClients =~ $re ]] ; then
        echo "error: numClientType must be a number but got \"$numClients\""
        exit 1
      fi
      case $clientType in
        idle)
          numIdleClients=$numClients
          # $extraArgs ignored for 'idle'
        ;;
        bench-tps)
          numBenchTpsClients=$numClients
          benchTpsExtraArgs=$extraArgs
        ;;
        bench-exchange)
          numBenchExchangeClients=$numClients
          benchExchangeExtraArgs=$extraArgs
        ;;
        *)
          echo "Unknown client type: $clientType"
          exit 1
          ;;
      esac
    }
    getClientTypeAndNum
    ;;
  F)
    failOnValidatorBootupFailure=false
    ;;
  i)
    nodeAddress=$OPTARG
    ;;
  d)
    debugBuild=true
    ;;
  *)
    usage "Error: unhandled option: $opt"
    ;;
  esac
done

loadConfigFile

netLogDir=
initLogDir() { # Initializes the netLogDir global variable.  Idempotent
  [[ -z $netLogDir ]] || return 0

  netLogDir="$netDir"/log
  declare netLogDateDir
  netLogDateDir="$netDir"/log-$(date +"%Y-%m-%d_%H_%M_%S")
  if [[ -d $netLogDir && ! -L $netLogDir ]]; then
    echo "Warning: moving $netLogDir to make way for symlink."
    mv "$netLogDir" "$netDir"/log.old
  elif [[ -L $netLogDir ]]; then
    rm "$netLogDir"
  fi
  mkdir -p "$netConfigDir" "$netLogDateDir"
  ln -sf "$netLogDateDir" "$netLogDir"
  echo "Log directory: $netLogDateDir"
}

if [[ -n $numValidatorsRequested ]]; then
  truncatedNodeList=( "${validatorIpList[@]:0:$numValidatorsRequested}" )
  unset validatorIpList
  validatorIpList=( "${truncatedNodeList[@]}" )
fi

numClients=${#clientIpList[@]}
numClientsRequested=$((numBenchTpsClients + numBenchExchangeClients + numIdleClients))
if [[ "$numClientsRequested" -eq 0 ]]; then
  numBenchTpsClients=$numClients
  numClientsRequested=$numClients
else
  if [[ "$numClientsRequested" -gt "$numClients" ]]; then
    echo "Error: More clients requested ($numClientsRequested) then available ($numClients)"
    exit 1
  fi
fi

annotate() {
  [[ -z $BUILDKITE ]] || {
    buildkite-agent annotate "$@"
  }
}

annotateBlockexplorerUrl() {
  declare blockstreamer=${blockstreamerIpList[0]}

  if [[ -n $blockstreamer ]]; then
    annotate --style info --context blockexplorer-url "Block explorer: http://$blockstreamer/"
  fi
}

build() {
  supported=("18.04")
  declare MAYBE_DOCKER=
  if [[ $(uname) != Linux || ! " ${supported[*]} " =~ $(lsb_release -sr) ]]; then
    # shellcheck source=ci/rust-version.sh
    source "$SOLANA_ROOT"/ci/rust-version.sh
    MAYBE_DOCKER="ci/docker-run.sh $rust_stable_docker_image"
  fi
  SECONDS=0
  (
    cd "$SOLANA_ROOT"
    echo "--- Build started at $(date)"

    set -x
    rm -rf farf

    buildVariant=
    if $debugBuild; then
      buildVariant=debug
    fi

    $MAYBE_DOCKER bash -c "
      set -ex
      scripts/cargo-install-all.sh farf \"$buildVariant\" \"$maybeUseMove\"
    "
  )
  echo "Build took $SECONDS seconds"
}

startCommon() {
  declare ipAddress=$1
  test -d "$SOLANA_ROOT"
  if $skipSetup; then
    ssh "${sshOptions[@]}" "$ipAddress" "
      set -x;
      mkdir -p ~/solana/config;
      rm -rf ~/config;
      mv ~/solana/config ~;
      rm -rf ~/solana;
      mkdir -p ~/solana ~/.cargo/bin;
      mv ~/config ~/solana/
    "
  else
    ssh "${sshOptions[@]}" "$ipAddress" "
      set -x;
      rm -rf ~/solana;
      mkdir -p ~/.cargo/bin
    "
  fi
  [[ -z "$externalNodeSshKey" ]] || ssh-copy-id -f -i "$externalNodeSshKey" "${sshOptions[@]}" "solana@$ipAddress"
  rsync -vPrc -e "ssh ${sshOptions[*]}" \
    --exclude 'net/log*' \
    "$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
    "$ipAddress":~/solana/
}

startBootstrapLeader() {
  declare ipAddress=$1
  declare nodeIndex="$2"
  declare logFile="$3"
  echo "--- Starting bootstrap leader: $ipAddress"
  echo "start log: $logFile"

  # Deploy local binaries to bootstrap validator.  Other validators and clients later fetch the
  # binaries from it
  (
    set -x
    startCommon "$ipAddress" || exit 1
    [[ -z "$externalPrimordialAccountsFile" ]] || rsync -vPrc -e "ssh ${sshOptions[*]}" "$externalPrimordialAccountsFile" \
      "$ipAddress:$remoteExternalPrimordialAccountsFile"
    case $deployMethod in
    tar)
      rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/solana-release/bin/* "$ipAddress:~/.cargo/bin/"
      rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/solana-release/version.yml "$ipAddress:~/"
      ;;
    local)
      rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/"
      ssh "${sshOptions[@]}" -n "$ipAddress" "rm -f ~/version.yml; touch ~/version.yml"
      ;;
    skip)
      ;;
    *)
      usage "Internal error: invalid deployMethod: $deployMethod"
      ;;
    esac

    ssh "${sshOptions[@]}" -n "$ipAddress" \
      "./solana/net/remote/remote-node.sh \
         $deployMethod \
         bootstrap-leader \
         $entrypointIp \
         $((${#validatorIpList[@]} + ${#blockstreamerIpList[@]} + ${#archiverIpList[@]})) \
         \"$RUST_LOG\" \
         $skipSetup \
         $failOnValidatorBootupFailure \
         \"$remoteExternalPrimordialAccountsFile\" \
         \"$maybeDisableAirdrops\" \
         \"$internalNodesStakeLamports\" \
         \"$internalNodesLamports\" \
         $nodeIndex \
         $numBenchTpsClients \"$benchTpsExtraArgs\" \
         $numBenchExchangeClients \"$benchExchangeExtraArgs\" \
         \"$genesisOptions\" \
         \"$maybeNoSnapshot $maybeSkipLedgerVerify $maybeLimitLedgerSize\" \
         \"$gpuMode\" \
         \"$GEOLOCATION_API_KEY\" \
      "

  ) >> "$logFile" 2>&1 || {
    cat "$logFile"
    echo "^^^ +++"
    exit 1
  }
}

startNode() {
  declare ipAddress=$1
  declare nodeType=$2
  declare nodeIndex="$3"

  initLogDir
  declare logFile="$netLogDir/validator-$ipAddress.log"

  if [[ -z $nodeType ]]; then
    echo nodeType not specified
    exit 1
  fi

  if [[ -z $nodeIndex ]]; then
    echo nodeIndex not specified
    exit 1
  fi

  echo "--- Starting $nodeType: $ipAddress"
  echo "start log: $logFile"
  (
    set -x
    startCommon "$ipAddress"

    if [[ $nodeType = blockstreamer ]] && [[ -n $letsEncryptDomainName ]]; then
      #
      # Create/renew TLS certificate
      #
      declare localArchive=~/letsencrypt-"$letsEncryptDomainName".tgz
      if [[ -r "$localArchive" ]]; then
        timeout 30s scp "${sshOptions[@]}" "$localArchive" "$ipAddress:letsencrypt.tgz"
      fi
      ssh "${sshOptions[@]}" -n "$ipAddress" \
        "sudo -H /certbot-restore.sh $letsEncryptDomainName maintainers@solana.com"
      rm -f letsencrypt.tgz
      timeout 30s scp "${sshOptions[@]}" "$ipAddress:/letsencrypt.tgz" letsencrypt.tgz
      test -s letsencrypt.tgz # Ensure non-empty before overwriting $localArchive
      cp letsencrypt.tgz "$localArchive"
    fi

    ssh "${sshOptions[@]}" -n "$ipAddress" \
      "./solana/net/remote/remote-node.sh \
         $deployMethod \
         $nodeType \
         $entrypointIp \
         $((${#validatorIpList[@]} + ${#blockstreamerIpList[@]} + ${#archiverIpList[@]})) \
         \"$RUST_LOG\" \
         $skipSetup \
         $failOnValidatorBootupFailure \
         \"$remoteExternalPrimordialAccountsFile\" \
         \"$maybeDisableAirdrops\" \
         \"$internalNodesStakeLamports\" \
         \"$internalNodesLamports\" \
         $nodeIndex \
         $numBenchTpsClients \"$benchTpsExtraArgs\" \
         $numBenchExchangeClients \"$benchExchangeExtraArgs\" \
         \"$genesisOptions\" \
         \"$maybeNoSnapshot $maybeSkipLedgerVerify $maybeLimitLedgerSize\" \
         \"$gpuMode\" \
         \"$GEOLOCATION_API_KEY\" \
      "
  ) >> "$logFile" 2>&1 &
  declare pid=$!
  ln -sf "validator-$ipAddress.log" "$netLogDir/validator-$pid.log"
  pids+=("$pid")
}

startClient() {
  declare ipAddress=$1
  declare clientToRun="$2"
  declare clientIndex="$3"

  initLogDir
  declare logFile="$netLogDir/client-$clientToRun-$ipAddress.log"

  echo "--- Starting client: $ipAddress - $clientToRun"
  echo "start log: $logFile"
  (
    set -x
    startCommon "$ipAddress"
    ssh "${sshOptions[@]}" -f "$ipAddress" \
      "./solana/net/remote/remote-client.sh $deployMethod $entrypointIp \
      $clientToRun \"$RUST_LOG\" \"$benchTpsExtraArgs\" \"$benchExchangeExtraArgs\" $clientIndex"
  ) >> "$logFile" 2>&1 || {
    cat "$logFile"
    echo "^^^ +++"
    exit 1
  }
}

sanity() {
  declare skipBlockstreamerSanity=$1

  $metricsWriteDatapoint "testnet-deploy net-sanity-begin=1"

  declare ok=true
  declare bootstrapLeader=${validatorIpList[0]}
  declare blockstreamer=${blockstreamerIpList[0]}

  annotateBlockexplorerUrl

  echo "--- Sanity: $bootstrapLeader"
  (
    set -x
    # shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
    ssh "${sshOptions[@]}" "$bootstrapLeader" \
      "./solana/net/remote/remote-sanity.sh $bootstrapLeader $sanityExtraArgs \"$RUST_LOG\""
  ) || ok=false
  $ok || exit 1

  if [[ -z $skipBlockstreamerSanity && -n $blockstreamer ]]; then
    # If there's a blockstreamer node run a reduced sanity check on it as well
    echo "--- Sanity: $blockstreamer"
    (
      set -x
      # shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
      ssh "${sshOptions[@]}" "$blockstreamer" \
        "./solana/net/remote/remote-sanity.sh $blockstreamer $sanityExtraArgs \"$RUST_LOG\""
    ) || ok=false
    $ok || exit 1
  fi

  $metricsWriteDatapoint "testnet-deploy net-sanity-complete=1"
}

deployUpdate() {
  if [[ -z $updatePlatforms ]]; then
    echo "No update platforms"
    return
  fi
  if [[ -z $releaseChannel ]]; then
    echo "Release channel not specified (use -t option)"
    exit 1
  fi

  declare ok=true
  declare bootstrapLeader=${validatorIpList[0]}

  for updatePlatform in $updatePlatforms; do
    echo "--- Deploying solana-install update: $updatePlatform"
    (
      set -x

      scripts/solana-install-update-manifest-keypair.sh "$updatePlatform"

      timeout 30s scp "${sshOptions[@]}" \
        update_manifest_keypair.json "$bootstrapLeader:solana/update_manifest_keypair.json"

      # shellcheck disable=SC2029 # remote-deploy-update.sh args are expanded on client side intentionally
      ssh "${sshOptions[@]}" "$bootstrapLeader" \
        "./solana/net/remote/remote-deploy-update.sh $releaseChannel $updatePlatform"
    ) || ok=false
    $ok || exit 1
  done
}

getNodeType() {
  echo "getNodeType: $nodeAddress"
  [[ -n $nodeAddress ]] || {
    echo "Error: nodeAddress not set"
    exit 1
  }
  nodeIndex=0 # <-- global
  nodeType=validator # <-- global

  for ipAddress in "${validatorIpList[@]}" b "${blockstreamerIpList[@]}" r "${archiverIpList[@]}"; do
    if [[ $ipAddress = b ]]; then
      nodeType=blockstreamer
      continue
    elif [[ $ipAddress = r ]]; then
      nodeType=archiver
      continue
    fi

    if [[ $ipAddress = "$nodeAddress" ]]; then
      echo "getNodeType: $nodeType ($nodeIndex)"
      return
    fi
    ((nodeIndex = nodeIndex + 1))
  done

  echo "Error: Unknown node: $nodeAddress"
  exit 1
}

prepare_deploy() {
  case $deployMethod in
  tar)
    if [[ -n $releaseChannel ]]; then
      rm -f "$SOLANA_ROOT"/solana-release.tar.bz2
      declare updateDownloadUrl=http://release.solana.com/"$releaseChannel"/solana-release-x86_64-unknown-linux-gnu.tar.bz2
      (
        set -x
        curl --retry 5 --retry-delay 2 --retry-connrefused \
          -o "$SOLANA_ROOT"/solana-release.tar.bz2 "$updateDownloadUrl"
      )
      tarballFilename="$SOLANA_ROOT"/solana-release.tar.bz2
    fi
    (
      set -x
      rm -rf "$SOLANA_ROOT"/solana-release
      (cd "$SOLANA_ROOT"; tar jxv) < "$tarballFilename"
      cat "$SOLANA_ROOT"/solana-release/version.yml
    )
    ;;
  local)
    if $doBuild; then
      build
    else
      echo "Build skipped due to --no-build"
    fi
    ;;
  skip)
    ;;
  *)
    usage "Internal error: invalid deployMethod: $deployMethod"
    ;;
  esac

  if [[ -n $deployIfNewer ]]; then
    if [[ $deployMethod != tar ]]; then
      echo "Error: --deploy-if-newer only supported for tar deployments"
      exit 1
    fi

    echo "Fetching current software version"
    (
      set -x
      rsync -vPrc -e "ssh ${sshOptions[*]}" "${validatorIpList[0]}":~/version.yml current-version.yml
    )
    cat current-version.yml
    if ! diff -q current-version.yml "$SOLANA_ROOT"/solana-release/version.yml; then
      echo "Cluster software version is old.  Update required"
    else
      echo "Cluster software version is current.  No update required"
      exit 0
    fi
  fi
}

deploy() {
  initLogDir

  echo "Deployment started at $(date)"
  $metricsWriteDatapoint "testnet-deploy net-start-begin=1"

  declare bootstrapLeader=true
  for nodeAddress in "${validatorIpList[@]}" "${blockstreamerIpList[@]}" "${archiverIpList[@]}"; do
    nodeType=
    nodeIndex=
    getNodeType
    if $bootstrapLeader; then
      SECONDS=0
      declare bootstrapNodeDeployTime=
      startBootstrapLeader "$nodeAddress" $nodeIndex "$netLogDir/bootstrap-leader-$ipAddress.log"
      bootstrapNodeDeployTime=$SECONDS
      $metricsWriteDatapoint "testnet-deploy net-bootnode-leader-started=1"

      bootstrapLeader=false
      SECONDS=0
      pids=()
    else
      startNode "$ipAddress" $nodeType $nodeIndex

      # Stagger additional node start time. If too many nodes start simultaneously
      # the bootstrap node gets more rsync requests from the additional nodes than
      # it can handle.
      sleep 2
    fi
  done


  for pid in "${pids[@]}"; do
    declare ok=true
    wait "$pid" || ok=false
    if ! $ok; then
      echo "+++ validator failed to start"
      cat "$netLogDir/validator-$pid.log"
      if $failOnValidatorBootupFailure; then
        exit 1
      else
        echo "Failure is non-fatal"
      fi
    fi
  done

  $metricsWriteDatapoint "testnet-deploy net-validators-started=1"
  additionalNodeDeployTime=$SECONDS

  annotateBlockexplorerUrl

  sanity skipBlockstreamerSanity # skip sanity on blockstreamer node, it may not
                                 # have caught up to the bootstrap leader yet

  SECONDS=0
  for ((i=0; i < "$numClients" && i < "$numClientsRequested"; i++)) do
    if [[ $i -lt "$numBenchTpsClients" ]]; then
      startClient "${clientIpList[$i]}" "solana-bench-tps" "$i"
    elif [[ $i -lt $((numBenchTpsClients + numBenchExchangeClients)) ]]; then
      startClient "${clientIpList[$i]}" "solana-bench-exchange" $((i-numBenchTpsClients))
    else
      startClient "${clientIpList[$i]}" "idle"
    fi
  done
  clientDeployTime=$SECONDS

  $metricsWriteDatapoint "testnet-deploy net-start-complete=1"

  declare networkVersion=unknown
  case $deployMethod in
  tar)
    networkVersion="$(
      (
        set -o pipefail
        grep "^commit: " "$SOLANA_ROOT"/solana-release/version.yml | head -n1 | cut -d\  -f2
      ) || echo "tar-unknown"
    )"
    ;;
  local)
    networkVersion="$(git rev-parse HEAD || echo local-unknown)"
    ;;
  skip)
    ;;
  *)
    usage "Internal error: invalid deployMethod: $deployMethod"
    ;;
  esac
  $metricsWriteDatapoint "testnet-deploy version=\"${networkVersion:0:9}\""

  echo
  echo "+++ Deployment Successful"
  echo "Bootstrap leader deployment took $bootstrapNodeDeployTime seconds"
  echo "Additional validator deployment (${#validatorIpList[@]} validators, ${#blockstreamerIpList[@]} blockstreamer nodes, ${#archiverIpList[@]} archivers) took $additionalNodeDeployTime seconds"
  echo "Client deployment (${#clientIpList[@]} instances) took $clientDeployTime seconds"
  echo "Network start logs in $netLogDir"
}

stopNode() {
  local ipAddress=$1
  local block=$2

  initLogDir
  declare logFile="$netLogDir/stop-validator-$ipAddress.log"

  echo "--- Stopping node: $ipAddress"
  echo "stop log: $logFile"
  (
    set -x
    # shellcheck disable=SC2029 # It's desired that PS4 be expanded on the client side
    ssh "${sshOptions[@]}" "$ipAddress" "
      PS4=\"$PS4\"
      set -x
      ! tmux list-sessions || tmux kill-session
      declare sudo=
      if sudo true; then
        sudo=\"sudo -n\"
      fi

      for pid in solana/*.pid; do
        pgid=\$(ps opgid= \$(cat \$pid) | tr -d '[:space:]')
        if [[ -n \$pgid ]]; then
          \$sudo kill -- -\$pgid
        fi
      done
      if [[ -f solana/netem.cfg ]]; then
        solana/scripts/netem.sh delete < solana/netem.cfg
        rm -f solana/netem.cfg
      fi
      solana/scripts/net-shaper.sh force_cleanup
      for pattern in node solana- remote-; do
        pkill -9 \$pattern
      done
    "
  ) >> "$logFile" 2>&1 &

  declare pid=$!
  ln -sf "stop-validator-$ipAddress.log" "$netLogDir/stop-validator-$pid.log"
  if $block; then
    wait $pid
  else
    pids+=("$pid")
  fi
}

stop() {
  SECONDS=0
  $metricsWriteDatapoint "testnet-deploy net-stop-begin=1"

  declare loopCount=0
  pids=()
  for ipAddress in "${validatorIpList[@]}" "${blockstreamerIpList[@]}" "${archiverIpList[@]}" "${clientIpList[@]}"; do
    stopNode "$ipAddress" false

    # Stagger additional node stop time to avoid too many concurrent ssh
    # sessions
    ((loopCount++ % 4 == 0)) && sleep 2
  done

  echo --- Waiting for nodes to finish stopping
  for pid in "${pids[@]}"; do
    echo -n "$pid "
    wait "$pid" || true
  done
  echo

  $metricsWriteDatapoint "testnet-deploy net-stop-complete=1"
  echo "Stopping nodes took $SECONDS seconds"
}

checkPremptibleInstances() {
  # The validatorIpList nodes may be preemptible instances that can disappear at
  # any time.  Try to detect when a validator has been preempted to help the user
  # out.
  #
  # Of course this isn't airtight as an instance could always disappear
  # immediately after its successfully pinged.
  for ipAddress in "${validatorIpList[@]}"; do
    (
      set -x
      timeout 5s ping -c 1 "$ipAddress" | tr - _
    ) || {
      cat <<EOF

Warning: $ipAddress may have been preempted.

Run |./gce.sh config| to restart it
EOF
      exit 1
    }
  done
}

checkPremptibleInstances

case $command in
restart)
  prepare_deploy
  stop
  deploy
  ;;
start)
  prepare_deploy
  deploy
  ;;
sanity)
  sanity
  ;;
stop)
  stop
  ;;
update)
  deployUpdate
  ;;
stopnode)
  if [[ -z $nodeAddress ]]; then
    usage "node address (-i) not specified"
    exit 1
  fi
  stopNode "$nodeAddress" true
  ;;
startnode)
  if [[ -z $nodeAddress ]]; then
    usage "node address (-i) not specified"
    exit 1
  fi
  nodeType=
  nodeIndex=
  getNodeType
  startNode "$nodeAddress" $nodeType $nodeIndex
  ;;
logs)
  initLogDir
  fetchRemoteLog() {
    declare ipAddress=$1
    declare log=$2
    echo "--- fetching $log from $ipAddress"
    (
      set -x
      timeout 30s scp "${sshOptions[@]}" \
        "$ipAddress":solana/"$log".log "$netLogDir"/remote-"$log"-"$ipAddress".log
    ) || echo "failed to fetch log"
  }
  fetchRemoteLog "${validatorIpList[0]}" drone
  for ipAddress in "${validatorIpList[@]}"; do
    fetchRemoteLog "$ipAddress" validator
  done
  for ipAddress in "${clientIpList[@]}"; do
    fetchRemoteLog "$ipAddress" client
  done
  for ipAddress in "${blockstreamerIpList[@]}"; do
    fetchRemoteLog "$ipAddress" validator
  done
  for ipAddress in "${archiverIpList[@]}"; do
    fetchRemoteLog "$ipAddress" validator
  done
  ;;
netem)
  if [[ -n $netemConfigFile ]]; then
    if [[ $netemCommand = "add" ]]; then
      for ipAddress in "${validatorIpList[@]}"; do
        "$here"/scp.sh "$netemConfigFile" solana@"$ipAddress":~/solana
      done
    fi
    for i in "${!validatorIpList[@]}"; do
      "$here"/ssh.sh solana@"${validatorIpList[$i]}" 'solana/scripts/net-shaper.sh' \
      "$netemCommand" ~solana/solana/"$netemConfigFile" "${#validatorIpList[@]}" "$i"
    done
  else
    num_nodes=$((${#validatorIpList[@]}*netemPartition/100))
    if [[ $((${#validatorIpList[@]}*netemPartition%100)) -gt 0 ]]; then
      num_nodes=$((num_nodes+1))
    fi
    if [[ "$num_nodes" -gt "${#validatorIpList[@]}" ]]; then
      num_nodes=${#validatorIpList[@]}
    fi

    # Stop netem on all nodes
    for ipAddress in "${validatorIpList[@]}"; do
      "$here"/ssh.sh solana@"$ipAddress" 'solana/scripts/netem.sh delete < solana/netem.cfg || true'
    done

    # Start netem on required nodes
    for ((i=0; i<num_nodes; i++ )); do :
      "$here"/ssh.sh solana@"${validatorIpList[$i]}" "echo $netemConfig > solana/netem.cfg; solana/scripts/netem.sh add \"$netemConfig\""
    done
  fi
  ;;
*)
  echo "Internal error: Unknown command: $command"
  usage
  exit 1
esac