#!/usr/bin/env bash
#
# Operates a configured testnet (start/stop/restart/sanity/logs/...).
# Run with -h for the full list of commands and options.
#
set -e

# Directory containing this script.
here=$(dirname "$0")

# Parent directory of $here.  `&&` (rather than `;`) makes a failed `cd` abort
# the script under `set -e` instead of silently recording the caller's cwd.
SOLANA_ROOT="$(cd "$here"/.. && pwd)"

# shellcheck source=net/common.sh
source "$here"/common.sh

# Prints the help text to stdout and terminates the script.
#
# Arguments: $* - optional error message.  When non-empty it is printed as
#                 "Error: <message>" before the help text.
# Exits:     1 when an error message was supplied, 0 otherwise.
usage() {
  local exitcode=0
  if [[ -n "$1" ]]; then
    exitcode=1
    echo "Error: $*"
  fi
  # Unquoted delimiter: $0 and $gpuMode are expanded when the text is printed.
  cat <<EOF
usage: $0 [start|stop|restart|sanity] [command-specific options]

Operate a configured testnet

start - Start the network
sanity - Sanity check the network
stop - Stop the network
restart - Shortcut for stop then start
logs - Fetch remote logs from each network node
startnode- Start an individual node (previously stopped with stopNode)
stopnode - Stop an individual node
update - Deploy a new software update to the cluster

start-specific options:
-T [tarFilename] - Deploy the specified release tarball
-t edge|beta|stable|vX.Y.Z - Deploy the latest tarball release for the
specified release channel (edge|beta|stable) or release tag
(vX.Y.Z)
-r / --skip-setup - Reuse existing node/ledger configuration from a
previous |start| (ie, don't run ./multinode-demo/setup.sh).
-d / --debug - Build/deploy the testnet with debug binaries
-c clientType=numClients=extraArgs - Number of clientTypes to start. This option can be specified
more than once. Defaults to bench-tps for all clients if not
specified.
Valid client types are:
idle
bench-tps
bench-exchange
User can optionally provide extraArgs that are transparently
supplied to the client program as command line parameters.
For example,
-c bench-tps=2="--tx_count 25000"
This will start 2 bench-tps clients, and supply "--tx_count 25000"
to the bench-tps client.
-n NUM_VALIDATORS - Number of validators to apply command to.
--gpu-mode GPU_MODE - Specify GPU mode to launch validators with (default: $gpuMode).
MODE must be one of
on - GPU *required*, any vendor *
off - No GPU, CPU-only
auto - Use GPU if available, any vendor *
cuda - GPU *required*, Nvidia CUDA only
* Currently, Nvidia CUDA is the only supported GPU vendor
--hashes-per-tick NUM_HASHES|sleep|auto
- Override the default --hashes-per-tick for the cluster
--no-airdrop
- If set, disables airdrops. Nodes must be funded in genesis config when airdrops are disabled.
--faucet-lamports NUM_LAMPORTS_TO_MINT
- Override the default 500000000000000000 lamports minted in genesis
--internal-nodes-stake-lamports NUM_LAMPORTS_PER_NODE
- Amount to stake internal nodes.
--internal-nodes-lamports NUM_LAMPORTS_PER_NODE
- Amount to fund internal nodes in genesis config.
--external-accounts-file FILE_PATH
- A YML file with a list of account pubkeys and corresponding lamport balances
in genesis config for external nodes
--no-snapshot-fetch
- If set, disables booting validators from a snapshot
--skip-poh-verify
- If set, validators will skip verifying
the ledger they already have saved to disk at
boot (results in a much faster boot)
--no-deploy
- Don't deploy new software, use the
existing deployment
--no-build
- Don't build new software, deploy the
existing binaries

--deploy-if-newer - Only deploy if newer software is
available (requires -t or -T)

--use-move - Build the move-loader-program and add it to the cluster

--operating-mode development|softlaunch
- Specify whether or not to launch the cluster in "development" mode with all features enabled at epoch 0,
or "softlaunch" mode with some features disabled at epoch 0 (default: development)

sanity/start-specific options:
-F - Discard validator nodes that didn't bootup successfully
-o noInstallCheck - Skip solana-install sanity
-o rejectExtraNodes - Require the exact number of nodes

stop-specific options:
none

logs-specific options:
none

netem-specific options:
--config - Netem configuration (as a double quoted string)
--partition - Percentage of network that should be configured with netem
--config-file - Configuration file for partition and netem configuration
--netem-cmd - Optional command argument to netem. Default is "add". Use "cleanup" to remove rules.

update-specific options:
--platform linux|osx|windows - Deploy the tarball using 'solana-install deploy ...' for the
given platform (multiple platforms may be specified)
(-t option must be supplied as well)

startnode/stopnode-specific options:
-i [ip address] - IP Address of the node to start or stop

Note: if RUST_LOG is set in the environment it will be propagated into the
network nodes.
EOF
  exit $exitcode
}

# Default values for everything the command line can override.

# --- deployment source ---
releaseChannel=""
deployMethod="local"
deployIfNewer=""
updatePlatforms=""
debugBuild="false"
doBuild="true"
maybeUseMove=""

# --- node selection / setup ---
skipSetup="false"
nodeAddress=""
numValidatorsRequested=""
failOnValidatorBootupFailure="true"
sanityExtraArgs=""
gpuMode="auto"

# --- clients ---
numIdleClients=0
numBenchTpsClients=0
numBenchExchangeClients=0
benchTpsExtraArgs=""
benchExchangeExtraArgs=""

# --- genesis configuration ---
genesisOptions=""
externalPrimordialAccountsFile=""
remoteExternalPrimordialAccountsFile=""
internalNodesStakeLamports=""
internalNodesLamports=""

# --- optional flags passed through verbatim when set ---
maybeNoSnapshot=""
maybeLimitLedgerSize=""
maybeSkipLedgerVerify=""
maybeDisableAirdrops=""

# --- netem ---
netemPartition=""
netemConfig=""
netemConfigFile=""
netemCommand="add"

# The first positional argument selects the command; with none, print the help.
command=$1
[[ -n $command ]] || usage
shift

# Long (--name) options are consumed here.  Every other argument is collected,
# in order, into shortArgs.
#
# Iterating on $# instead of `-n $1` means an empty argument no longer causes
# the remaining arguments to be dropped.
shortArgs=()
while (($# > 0)); do
  # Long options that take a value: fail with a message when the value is
  # missing.  Otherwise `shift 2` fails and `set -e` ends the script silently.
  case $1 in
  --hashes-per-tick | --slots-per-epoch | --target-lamports-per-signature | \
    --faucet-lamports | --operating-mode | --platform | \
    --internal-nodes-stake-lamports | --internal-nodes-lamports | \
    --external-accounts-file | --partition | --config | --config-file | \
    --netem-cmd | --gpu-mode)
    (($# >= 2)) || usage "Option $1 requires an argument"
    ;;
  esac

  case $1 in
  --hashes-per-tick | --slots-per-epoch | --target-lamports-per-signature | --faucet-lamports)
    # Accumulated verbatim as "<option> <value>" pairs.
    genesisOptions="$genesisOptions $1 $2"
    shift 2
    ;;
  --operating-mode)
    case "$2" in
    development | softlaunch) ;;
    *)
      echo "Unexpected operating mode: \"$2\""
      exit 1
      ;;
    esac
    genesisOptions="$genesisOptions $1 $2"
    shift 2
    ;;
  --no-snapshot-fetch)
    maybeNoSnapshot="$1"
    shift 1
    ;;
  --deploy-if-newer)
    deployIfNewer=1
    shift 1
    ;;
  --no-deploy)
    deployMethod=skip
    shift 1
    ;;
  --no-build)
    doBuild=false
    shift 1
    ;;
  --limit-ledger-size)
    maybeLimitLedgerSize="$1"
    shift 1
    ;;
  --skip-poh-verify)
    maybeSkipLedgerVerify="$1"
    shift 1
    ;;
  --skip-setup)
    skipSetup=true
    shift 1
    ;;
  --platform)
    # May be given more than once; values are space-joined.
    updatePlatforms="$updatePlatforms $2"
    shift 2
    ;;
  --internal-nodes-stake-lamports)
    internalNodesStakeLamports="$2"
    shift 2
    ;;
  --internal-nodes-lamports)
    internalNodesLamports="$2"
    shift 2
    ;;
  --external-accounts-file)
    externalPrimordialAccountsFile="$2"
    remoteExternalPrimordialAccountsFile=/tmp/external-primordial-accounts.yml
    shift 2
    ;;
  --no-airdrop)
    maybeDisableAirdrops="$1"
    shift 1
    ;;
  --debug)
    debugBuild=true
    shift 1
    ;;
  --use-move)
    maybeUseMove=$1
    shift 1
    ;;
  --partition)
    netemPartition=$2
    shift 2
    ;;
  --config)
    netemConfig=$2
    shift 2
    ;;
  --config-file)
    netemConfigFile=$2
    shift 2
    ;;
  --netem-cmd)
    netemCommand=$2
    shift 2
    ;;
  --gpu-mode)
    gpuMode=$2
    case "$gpuMode" in
    on | off | auto | cuda) ;;
    *)
      echo "Unexpected GPU mode: \"$gpuMode\""
      exit 1
      ;;
    esac
    shift 2
    ;;
  --*)
    usage "Unknown long option: $1"
    ;;
  *)
    shortArgs+=("$1")
    shift
    ;;
  esac
done

# Parses one `-c clientType=numClients[=extraArgs]` tuple.
#
# Globals:  OPTARG (read); numIdleClients, numBenchTpsClients,
#           benchTpsExtraArgs, numBenchExchangeClients,
#           benchExchangeExtraArgs (written, depending on clientType).
# Exits:    1 on a malformed tuple, a non-numeric count, or an unknown type.
#
# Only the first two '=' act as separators, so extraArgs may itself contain
# '=' (e.g. "--tx_count=25000").  Splitting the whole string on '=' would cut
# such arguments off at their first '='.
getClientTypeAndNum() {
  if [[ $OPTARG != *'='* ]]; then
    echo "Error: Expecting tuple \"clientType=numClientType=extraArgs\" but got \"$OPTARG\""
    exit 1
  fi

  local clientType=${OPTARG%%=*}
  local remainder=${OPTARG#*=}
  local numClients=${remainder%%=*}
  local extraArgs=
  if [[ $remainder == *'='* ]]; then
    extraArgs=${remainder#*=}
  fi

  local re='^[0-9]+$'
  if ! [[ $numClients =~ $re ]]; then
    echo "error: numClientType must be a number but got \"$numClients\""
    exit 1
  fi

  case $clientType in
  idle)
    numIdleClients=$numClients
    # $extraArgs ignored for 'idle'
    ;;
  bench-tps)
    numBenchTpsClients=$numClients
    benchTpsExtraArgs=$extraArgs
    ;;
  bench-exchange)
    numBenchExchangeClients=$numClients
    benchExchangeExtraArgs=$extraArgs
    ;;
  *)
    echo "Unknown client type: $clientType"
    exit 1
    ;;
  esac
}

# Short options, parsed from the arguments collected in shortArgs.
# Note: the option string accepts `-f <arg>` but there is no arm for it, so it
# ends up in the catch-all "unhandled option" arm.
while getopts "h?T:t:o:f:rc:Fn:i:d" opt "${shortArgs[@]}"; do
  case $opt in
  h | \?)
    usage
    ;;
  T)
    tarballFilename=$OPTARG
    [[ -r $tarballFilename ]] || usage "File not readable: $tarballFilename"
    deployMethod=tar
    ;;
  t)
    case $OPTARG in
    edge | beta | stable | v*)
      releaseChannel=$OPTARG
      deployMethod=tar
      ;;
    *)
      usage "Invalid release channel: $OPTARG"
      ;;
    esac
    ;;
  n)
    numValidatorsRequested=$OPTARG
    ;;
  r)
    skipSetup=true
    ;;
  o)
    case $OPTARG in
    rejectExtraNodes | noInstallCheck)
      sanityExtraArgs="$sanityExtraArgs -o $OPTARG"
      ;;
    *)
      usage "Unknown option: $OPTARG"
      ;;
    esac
    ;;
  c)
    getClientTypeAndNum
    ;;
  F)
    failOnValidatorBootupFailure=false
    ;;
  i)
    nodeAddress=$OPTARG
    ;;
  d)
    debugBuild=true
    ;;
  *)
    usage "Error: unhandled option: $opt"
    ;;
  esac
done

loadConfigFile

netLogDir=
# Sets the netLogDir global to "$netDir/log", a symlink that points at a freshly
# created, timestamped "$netDir/log-<date>" directory.  Only the first call does
# any work; later calls return immediately because netLogDir is already set.
initLogDir() {
  if [[ -n $netLogDir ]]; then
    return 0
  fi

  declare netLogDateDir
  netLogDir="$netDir"/log
  netLogDateDir="$netDir"/log-$(date +"%Y-%m-%d_%H_%M_%S")

  if [[ -L $netLogDir ]]; then
    # Stale symlink from an earlier run.
    rm "$netLogDir"
  elif [[ -d $netLogDir ]]; then
    # A real directory is in the way of the symlink; keep its contents.
    echo "Warning: moving $netLogDir to make way for symlink."
    mv "$netLogDir" "$netDir"/log.old
  fi

  mkdir -p "$netConfigDir" "$netLogDateDir"
  ln -sf "$netLogDateDir" "$netLogDir"
  echo "Log directory: $netLogDateDir"
}

# -n NUM: restrict validatorIpList to its first NUM entries.
if [[ -n $numValidatorsRequested ]]; then
  # A non-numeric value would be evaluated as an arithmetic expression in the
  # slice below (typically 0), silently emptying the validator list.
  if ! [[ $numValidatorsRequested =~ ^[0-9]+$ ]]; then
    echo "Error: -n expects a non-negative integer but got \"$numValidatorsRequested\""
    exit 1
  fi
  truncatedNodeList=("${validatorIpList[@]:0:$numValidatorsRequested}")
  unset validatorIpList
  validatorIpList=("${truncatedNodeList[@]}")
fi

# With no -c option every available client machine runs bench-tps; otherwise
# the requested total must fit on the available client machines.
numClients=${#clientIpList[@]}
numClientsRequested=$((numBenchTpsClients + numBenchExchangeClients + numIdleClients))
if [[ "$numClientsRequested" -eq 0 ]]; then
  numBenchTpsClients=$numClients
  numClientsRequested=$numClients
elif [[ "$numClientsRequested" -gt "$numClients" ]]; then
  echo "Error: More clients requested ($numClientsRequested) than available ($numClients)"
  exit 1
fi

# Forwards its arguments to `buildkite-agent annotate`, but only when the
# BUILDKITE environment variable is non-empty; otherwise it does nothing.
annotate() {
  if [[ -n $BUILDKITE ]]; then
    buildkite-agent annotate "$@"
  fi
}

# Publishes an annotation with the block explorer URL of the first
# blockstreamer node.  Does nothing when there is no blockstreamer.
annotateBlockexplorerUrl() {
  declare firstBlockstreamer=${blockstreamerIpList[0]}

  [[ -n $firstBlockstreamer ]] || return 0
  annotate --style info --context blockexplorer-url "Block explorer: http://$firstBlockstreamer/"
}

# Builds the software into $SOLANA_ROOT/farf with scripts/cargo-install-all.sh.
#
# The build runs directly on the host only when `uname` reports Linux and the
# `lsb_release -sr` output matches an entry of the `supported` list; in every
# other case it is wrapped in ci/docker-run.sh using the image named by
# ci/rust-version.sh.
# Globals: SOLANA_ROOT, debugBuild, maybeUseMove (read); supported, SECONDS (written).
build() {
  supported=("18.04")
  declare MAYBE_DOCKER=
  declare buildOnHost=false

  if [[ $(uname) == Linux && " ${supported[*]} " =~ $(lsb_release -sr) ]]; then
    buildOnHost=true
  fi
  if ! $buildOnHost; then
    # shellcheck source=ci/rust-version.sh
    source "$SOLANA_ROOT"/ci/rust-version.sh
    MAYBE_DOCKER="ci/docker-run.sh $rust_stable_docker_image"
  fi

  SECONDS=0
  (
    cd "$SOLANA_ROOT"
    echo "--- Build started at $(date)"

    set -x
    rm -rf farf

    if $debugBuild; then
      buildVariant=debug
    else
      buildVariant=
    fi

    # $MAYBE_DOCKER is deliberately unquoted: it is either empty or a command
    # prefix that has to be split into words.
    $MAYBE_DOCKER bash -c "
      set -ex
      scripts/cargo-install-all.sh farf \"$buildVariant\" \"$maybeUseMove\"
    "
  )
  echo "Build took $SECONDS seconds"
}

# Prepares ~/solana on a remote machine and copies the helper scripts to it.
#
# Arguments: $1 - address of the remote machine.
# Globals:   SOLANA_ROOT, skipSetup, sshOptions, externalNodeSshKey (read).
#
# With skipSetup the remote ~/solana/config directory is preserved across the
# wipe of ~/solana; without it ~/solana is simply removed.
startCommon() {
  declare ipAddress=$1
  declare remotePrepareScript

  [[ -d "$SOLANA_ROOT" ]]

  if $skipSetup; then
    remotePrepareScript="
      set -x;
      mkdir -p ~/solana/config;
      rm -rf ~/config;
      mv ~/solana/config ~;
      rm -rf ~/solana;
      mkdir -p ~/solana ~/.cargo/bin;
      mv ~/config ~/solana/
    "
  else
    remotePrepareScript="
      set -x;
      rm -rf ~/solana;
      mkdir -p ~/.cargo/bin
    "
  fi
  ssh "${sshOptions[@]}" "$ipAddress" "$remotePrepareScript"

  if [[ -n "$externalNodeSshKey" ]]; then
    ssh-copy-id -f -i "$externalNodeSshKey" "${sshOptions[@]}" "solana@$ipAddress"
  fi

  rsync -vPrc -e "ssh ${sshOptions[*]}" \
    --exclude 'net/log*' \
    "$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
    "$ipAddress":~/solana/
}

# Deploys to and starts the bootstrap leader, blocking until the remote start
# script returns.
#
# Arguments: $1 - address of the node
#            $2 - node index passed to remote-node.sh
#            $3 - local log file that receives all output of the deployment
# Exits:     1 (after dumping the log file) when any step fails.
startBootstrapLeader() {
  declare ipAddress=$1
  declare nodeIndex="$2"
  declare logFile="$3"
  echo "--- Starting bootstrap leader: $ipAddress"
  echo "start log: $logFile"

  # Deploy local binaries to bootstrap validator.  Other validators and clients later fetch the
  # binaries from it
  #
  # The subshell below is the left operand of `||`, so `set -e` is ignored
  # inside it; each step that must not fail carries its own `|| exit 1`.
  (
    set -x
    startCommon "$ipAddress" || exit 1

    if [[ -n "$externalPrimordialAccountsFile" ]]; then
      rsync -vPrc -e "ssh ${sshOptions[*]}" "$externalPrimordialAccountsFile" \
        "$ipAddress:$remoteExternalPrimordialAccountsFile" || exit 1
    fi

    case $deployMethod in
    tar)
      rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/solana-release/bin/* "$ipAddress:~/.cargo/bin/" || exit 1
      rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/solana-release/version.yml "$ipAddress:~/" || exit 1
      ;;
    local)
      rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/" || exit 1
      ssh "${sshOptions[@]}" -n "$ipAddress" "rm -f ~/version.yml; touch ~/version.yml" || exit 1
      ;;
    skip) ;;
    *)
      usage "Internal error: invalid deployMethod: $deployMethod"
      ;;
    esac

    # The status of this last command is the status of the subshell.
    ssh "${sshOptions[@]}" -n "$ipAddress" \
      "./solana/net/remote/remote-node.sh \
         $deployMethod \
         bootstrap-leader \
         $entrypointIp \
         $((${#validatorIpList[@]} + ${#blockstreamerIpList[@]} + ${#archiverIpList[@]})) \
         \"$RUST_LOG\" \
         $skipSetup \
         $failOnValidatorBootupFailure \
         \"$remoteExternalPrimordialAccountsFile\" \
         \"$maybeDisableAirdrops\" \
         \"$internalNodesStakeLamports\" \
         \"$internalNodesLamports\" \
         $nodeIndex \
         $numBenchTpsClients \"$benchTpsExtraArgs\" \
         $numBenchExchangeClients \"$benchExchangeExtraArgs\" \
         \"$genesisOptions\" \
         \"$maybeNoSnapshot $maybeSkipLedgerVerify $maybeLimitLedgerSize\" \
         \"$gpuMode\" \
         \"$GEOLOCATION_API_KEY\" \
      "
  ) >> "$logFile" 2>&1 || {
    cat "$logFile"
    echo "^^^ +++"
    exit 1
  }
}

# Starts a non-bootstrap node in the background.
#
# Arguments: $1 - address of the node
#            $2 - node type passed to remote-node.sh (must be non-empty)
#            $3 - node index passed to remote-node.sh (must be non-empty)
# Globals:   pids (the background job's pid is appended); netLogDir (read).
# Output of the background job goes to $netLogDir/validator-<address>.log; a
# validator-<pid>.log symlink to it is created so the log can be found by pid.
startNode() {
  declare ipAddress=$1
  declare nodeType=$2
  declare nodeIndex="$3"

  initLogDir
  declare logFile="$netLogDir/validator-$ipAddress.log"

  [[ -n $nodeType ]] || {
    echo nodeType not specified
    exit 1
  }
  [[ -n $nodeIndex ]] || {
    echo nodeIndex not specified
    exit 1
  }

  echo "--- Starting $nodeType: $ipAddress"
  echo "start log: $logFile"
  (
    set -x
    startCommon "$ipAddress"

    if [[ $nodeType = blockstreamer && -n $letsEncryptDomainName ]]; then
      #
      # Create/renew TLS certificate
      #
      declare localArchive=~/letsencrypt-"$letsEncryptDomainName".tgz
      # Upload the locally cached archive, when there is one.
      if [[ -r "$localArchive" ]]; then
        timeout 30s scp "${sshOptions[@]}" "$localArchive" "$ipAddress:letsencrypt.tgz"
      fi
      ssh "${sshOptions[@]}" -n "$ipAddress" \
        "sudo -H /certbot-restore.sh $letsEncryptDomainName maintainers@solana.com"
      # Fetch the archive back from the node and refresh the local cache.
      rm -f letsencrypt.tgz
      timeout 30s scp "${sshOptions[@]}" "$ipAddress:/letsencrypt.tgz" letsencrypt.tgz
      test -s letsencrypt.tgz # Ensure non-empty before overwriting $localArchive
      cp letsencrypt.tgz "$localArchive"
    fi

    ssh "${sshOptions[@]}" -n "$ipAddress" \
      "./solana/net/remote/remote-node.sh \
         $deployMethod \
         $nodeType \
         $entrypointIp \
         $((${#validatorIpList[@]} + ${#blockstreamerIpList[@]} + ${#archiverIpList[@]})) \
         \"$RUST_LOG\" \
         $skipSetup \
         $failOnValidatorBootupFailure \
         \"$remoteExternalPrimordialAccountsFile\" \
         \"$maybeDisableAirdrops\" \
         \"$internalNodesStakeLamports\" \
         \"$internalNodesLamports\" \
         $nodeIndex \
         $numBenchTpsClients \"$benchTpsExtraArgs\" \
         $numBenchExchangeClients \"$benchExchangeExtraArgs\" \
         \"$genesisOptions\" \
         \"$maybeNoSnapshot $maybeSkipLedgerVerify $maybeLimitLedgerSize\" \
         \"$gpuMode\" \
         \"$GEOLOCATION_API_KEY\" \
      "
  ) >> "$logFile" 2>&1 &

  declare pid=$!
  ln -sf "validator-$ipAddress.log" "$netLogDir/validator-$pid.log"
  pids+=("$pid")
}

# Prepares a client machine and launches remote-client.sh on it.
#
# Arguments: $1 - address of the client machine
#            $2 - client program to run
#            $3 - client index (may be empty)
# Exits:     1 (after dumping the log file) when preparation or launch fails.
startClient() {
  declare ipAddress=$1
  declare clientToRun="$2"
  declare clientIndex="$3"

  initLogDir
  declare logFile="$netLogDir/client-$clientToRun-$ipAddress.log"

  echo "--- Starting client: $ipAddress - $clientToRun"
  echo "start log: $logFile"
  (
    set -x
    # `set -e` is ignored inside a subshell that is the left operand of `||`,
    # so a failed preparation has to be turned into a failure explicitly.
    startCommon "$ipAddress" || exit 1
    ssh "${sshOptions[@]}" -f "$ipAddress" \
      "./solana/net/remote/remote-client.sh $deployMethod $entrypointIp \
      $clientToRun \"$RUST_LOG\" \"$benchTpsExtraArgs\" \"$benchExchangeExtraArgs\" $clientIndex"
  ) >> "$logFile" 2>&1 || {
    cat "$logFile"
    echo "^^^ +++"
    exit 1
  }
}

# Runs remote-sanity.sh on the first validator and, unless told otherwise, on
# the first blockstreamer node.
#
# Arguments: $1 - when non-empty, the blockstreamer check is skipped.
# Exits:     1 as soon as one of the remote checks fails.
sanity() {
  declare skipBlockstreamerSanity=$1
  declare bootstrapLeader=${validatorIpList[0]}
  declare blockstreamer=${blockstreamerIpList[0]}

  $metricsWriteDatapoint "testnet-deploy net-sanity-begin=1"
  annotateBlockexplorerUrl

  echo "--- Sanity: $bootstrapLeader"
  if ! (
    set -x
    # shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
    ssh "${sshOptions[@]}" "$bootstrapLeader" \
      "./solana/net/remote/remote-sanity.sh $bootstrapLeader $sanityExtraArgs \"$RUST_LOG\""
  ); then
    exit 1
  fi

  if [[ -z $skipBlockstreamerSanity && -n $blockstreamer ]]; then
    # If there's a blockstreamer node run a reduced sanity check on it as well
    echo "--- Sanity: $blockstreamer"
    if ! (
      set -x
      # shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
      ssh "${sshOptions[@]}" "$blockstreamer" \
        "./solana/net/remote/remote-sanity.sh $blockstreamer $sanityExtraArgs \"$RUST_LOG\""
    ); then
      exit 1
    fi
  fi

  $metricsWriteDatapoint "testnet-deploy net-sanity-complete=1"
}

# Runs remote-deploy-update.sh on the first validator once for every platform
# listed in $updatePlatforms.
#
# Returns early when no platform was requested.
# Exits 1 when no release channel was given or when a platform's steps fail.
deployUpdate() {
  [[ -n $updatePlatforms ]] || {
    echo "No update platforms"
    return
  }
  [[ -n $releaseChannel ]] || {
    echo "Release channel not specified (use -t option)"
    exit 1
  }

  declare bootstrapLeader=${validatorIpList[0]}

  # $updatePlatforms is a space-separated list, hence the unquoted expansion.
  for updatePlatform in $updatePlatforms; do
    echo "--- Deploying solana-install update: $updatePlatform"
    if ! (
      set -x

      scripts/solana-install-update-manifest-keypair.sh "$updatePlatform"

      timeout 30s scp "${sshOptions[@]}" \
        update_manifest_keypair.json "$bootstrapLeader:solana/update_manifest_keypair.json"

      # shellcheck disable=SC2029 # remote-deploy-update.sh args are expanded on client side intentionally
      ssh "${sshOptions[@]}" "$bootstrapLeader" \
        "./solana/net/remote/remote-deploy-update.sh $releaseChannel $updatePlatform"
    ); then
      exit 1
    fi
  done
}

# Looks up $nodeAddress in the validator, blockstreamer and archiver lists (in
# that order) and publishes the result through globals.
#
# Globals written: nodeType  - validator, blockstreamer or archiver
#                  nodeIndex - zero-based position counted across all three lists
#                  ipAddress - loop variable; equals $nodeAddress on success
# Exits 1 when nodeAddress is empty or is not in any list.
getNodeType() {
  echo "getNodeType: $nodeAddress"
  if [[ -z $nodeAddress ]]; then
    echo "Error: nodeAddress not set"
    exit 1
  fi

  nodeIndex=0        # <-- global
  nodeType=validator # <-- global

  # The literal entries "b" and "r" mark where the blockstreamer and archiver
  # addresses begin; they switch nodeType and are not counted.
  for ipAddress in "${validatorIpList[@]}" b "${blockstreamerIpList[@]}" r "${archiverIpList[@]}"; do
    case $ipAddress in
    b)
      nodeType=blockstreamer
      continue
      ;;
    r)
      nodeType=archiver
      continue
      ;;
    "$nodeAddress")
      echo "getNodeType: $nodeType ($nodeIndex)"
      return
      ;;
    esac
    ((nodeIndex = nodeIndex + 1))
  done

  echo "Error: Unknown node: $nodeAddress"
  exit 1
}

# Gets the software ready for deployment according to $deployMethod:
#   tar   - download the release tarball (when a release channel was given)
#           and unpack it into $SOLANA_ROOT/solana-release
#   local - build from source unless --no-build was given
#   skip  - nothing to prepare
# With --deploy-if-newer the unpacked version.yml is then compared with the
# one on the first validator; the script exits 0 when they are identical.
#
# Exits 1 when --deploy-if-newer is combined with a non-tar deployment.
prepare_deploy() {
  # Checked up front so an invalid combination fails before any download,
  # extraction or build has been done.
  if [[ -n $deployIfNewer && $deployMethod != tar ]]; then
    echo "Error: --deploy-if-newer only supported for tar deployments"
    exit 1
  fi

  case $deployMethod in
  tar)
    if [[ -n $releaseChannel ]]; then
      rm -f "$SOLANA_ROOT"/solana-release.tar.bz2
      # NOTE(review): the download uses plain http -- confirm whether https is available.
      declare updateDownloadUrl=http://release.solana.com/"$releaseChannel"/solana-release-x86_64-unknown-linux-gnu.tar.bz2
      (
        set -x
        # --fail: an HTTP error must fail the download instead of saving the
        # server's error page as the tarball.
        curl --fail --retry 5 --retry-delay 2 --retry-connrefused \
          -o "$SOLANA_ROOT"/solana-release.tar.bz2 "$updateDownloadUrl"
      )
      tarballFilename="$SOLANA_ROOT"/solana-release.tar.bz2
    fi
    (
      set -x
      rm -rf "$SOLANA_ROOT"/solana-release
      (cd "$SOLANA_ROOT" && tar jxv) < "$tarballFilename"
      cat "$SOLANA_ROOT"/solana-release/version.yml
    )
    ;;
  local)
    if $doBuild; then
      build
    else
      echo "Build skipped due to --no-build"
    fi
    ;;
  skip) ;;
  *)
    usage "Internal error: invalid deployMethod: $deployMethod"
    ;;
  esac

  if [[ -n $deployIfNewer ]]; then
    echo "Fetching current software version"
    (
      set -x
      rsync -vPrc -e "ssh ${sshOptions[*]}" "${validatorIpList[0]}":~/version.yml current-version.yml
    )
    cat current-version.yml
    if ! diff -q current-version.yml "$SOLANA_ROOT"/solana-release/version.yml; then
      echo "Cluster software version is old.  Update required"
    else
      echo "Cluster software version is current.  No update required"
      exit 0
    fi
  fi
}

# Starts the whole network: the bootstrap leader first (blocking), then every
# other validator/blockstreamer/archiver in the background, then a sanity
# check, then the clients.  Finishes by printing a timing summary.
#
# Exits 1 when a node fails to start and failOnValidatorBootupFailure is true.
deploy() {
  initLogDir

  echo "Deployment started at $(date)"
  $metricsWriteDatapoint "testnet-deploy net-start-begin=1"

  declare bootstrapLeader=true
  declare bootstrapNodeDeployTime=
  # nodeAddress is deliberately not local: getNodeType reads it as a global.
  for nodeAddress in "${validatorIpList[@]}" "${blockstreamerIpList[@]}" "${archiverIpList[@]}"; do
    nodeType=
    nodeIndex=
    getNodeType
    # $nodeAddress is used below rather than the loop variable that happens to
    # be left behind by getNodeType.
    if $bootstrapLeader; then
      SECONDS=0
      startBootstrapLeader "$nodeAddress" "$nodeIndex" "$netLogDir/bootstrap-leader-$nodeAddress.log"
      bootstrapNodeDeployTime=$SECONDS
      $metricsWriteDatapoint "testnet-deploy net-bootnode-leader-started=1"

      bootstrapLeader=false
      SECONDS=0
      pids=()
    else
      startNode "$nodeAddress" "$nodeType" "$nodeIndex"

      # Stagger additional node start time. If too many nodes start simultaneously
      # the bootstrap node gets more rsync requests from the additional nodes than
      # it can handle.
      sleep 2
    fi
  done

  # Collect the background jobs started by startNode.
  for pid in "${pids[@]}"; do
    declare ok=true
    wait "$pid" || ok=false
    if ! $ok; then
      echo "+++ validator failed to start"
      cat "$netLogDir/validator-$pid.log"
      if $failOnValidatorBootupFailure; then
        exit 1
      else
        echo "Failure is non-fatal"
      fi
    fi
  done

  $metricsWriteDatapoint "testnet-deploy net-validators-started=1"
  additionalNodeDeployTime=$SECONDS

  annotateBlockexplorerUrl

  sanity skipBlockstreamerSanity # skip sanity on blockstreamer node, it may not
                                 # have caught up to the bootstrap leader yet

  # Client machines are assigned in order: bench-tps, bench-exchange, idle.
  SECONDS=0
  for ((i = 0; i < numClients && i < numClientsRequested; i++)); do
    if ((i < numBenchTpsClients)); then
      startClient "${clientIpList[$i]}" "solana-bench-tps" "$i"
    elif ((i < numBenchTpsClients + numBenchExchangeClients)); then
      startClient "${clientIpList[$i]}" "solana-bench-exchange" $((i - numBenchTpsClients))
    else
      startClient "${clientIpList[$i]}" "idle"
    fi
  done
  clientDeployTime=$SECONDS

  $metricsWriteDatapoint "testnet-deploy net-start-complete=1"

  declare networkVersion=unknown
  case $deployMethod in
  tar)
    networkVersion="$(
      (
        set -o pipefail
        grep "^commit: " "$SOLANA_ROOT"/solana-release/version.yml | head -n1 | cut -d' ' -f2
      ) || echo "tar-unknown"
    )"
    ;;
  local)
    networkVersion="$(git rev-parse HEAD || echo local-unknown)"
    ;;
  skip) ;;
  *)
    usage "Internal error: invalid deployMethod: $deployMethod"
    ;;
  esac
  $metricsWriteDatapoint "testnet-deploy version=\"${networkVersion:0:9}\""

  echo
  echo "+++ Deployment Successful"
  echo "Bootstrap leader deployment took $bootstrapNodeDeployTime seconds"
  echo "Additional validator deployment (${#validatorIpList[@]} validators, ${#blockstreamerIpList[@]} blockstreamer nodes, ${#archiverIpList[@]} archivers) took $additionalNodeDeployTime seconds"
  echo "Client deployment (${#clientIpList[@]} instances) took $clientDeployTime seconds"
  echo "Network start logs in $netLogDir"
}

# Stops everything running on one node by executing a cleanup script on it over
# ssh as a background job.
#
# Arguments: $1 - address of the node
#            $2 - "true" to wait for the job here; anything else appends the
#                 job's pid to the global pids array for the caller to wait on.
# Output goes to $netLogDir/stop-validator-<address>.log; a
# stop-validator-<pid>.log symlink to it is created as well.
stopNode() {
  local ipAddress=$1
  local block=$2

  initLogDir
  declare logFile="$netLogDir/stop-validator-$ipAddress.log"

  echo "--- Stopping node: $ipAddress"
  echo "stop log: $logFile"
  (
    set -x
    # The escaped \$ expansions are evaluated on the remote side; $PS4 is
    # expanded locally.
    # shellcheck disable=SC2029 # It's desired that PS4 be expanded on the client side
    ssh "${sshOptions[@]}" "$ipAddress" "
      PS4=\"$PS4\"
      set -x
      ! tmux list-sessions || tmux kill-session
      declare sudo=
      if sudo true; then
        sudo=\"sudo -n\"
      fi

      for pid in solana/*.pid; do
        pgid=\$(ps opgid= \$(cat \$pid) | tr -d '[:space:]')
        if [[ -n \$pgid ]]; then
          \$sudo kill -- -\$pgid
        fi
      done
      if [[ -f solana/netem.cfg ]]; then
        solana/scripts/netem.sh delete < solana/netem.cfg
        rm -f solana/netem.cfg
      fi
      solana/scripts/net-shaper.sh force_cleanup
      for pattern in node solana- remote-; do
        pkill -9 \$pattern
      done
    "
  ) >> "$logFile" 2>&1 &

  declare pid=$!
  ln -sf "stop-validator-$ipAddress.log" "$netLogDir/stop-validator-$pid.log"
  if ! $block; then
    pids+=("$pid")
  else
    wait $pid
  fi
}

# Stops every validator, blockstreamer, archiver and client node in parallel
# and waits for all of the stop jobs to finish.
# Globals: pids (reset, then filled by stopNode); SECONDS (reset).
stop() {
  SECONDS=0
  $metricsWriteDatapoint "testnet-deploy net-stop-begin=1"

  declare loopCount=0
  pids=()
  for ipAddress in "${validatorIpList[@]}" "${blockstreamerIpList[@]}" "${archiverIpList[@]}" "${clientIpList[@]}"; do
    stopNode "$ipAddress" false

    # Stagger additional node stop time to avoid too many concurrent ssh
    # sessions
    if ((loopCount++ % 4 == 0)); then
      sleep 2
    fi
  done

  echo --- Waiting for nodes to finish stopping
  for pid in "${pids[@]}"; do
    echo -n "$pid "
    # A failed stop job is not treated as an error.
    wait "$pid" || true
  done
  echo

  $metricsWriteDatapoint "testnet-deploy net-stop-complete=1"
  echo "Stopping nodes took $SECONDS seconds"
}

# Pings every validator once and aborts the script with a warning when one of
# them does not answer.
#
# The validatorIpList nodes may be preemptible instances that can disappear at
# any time.  Try to detect when a validator has been preempted to help the user
# out.
#
# Of course this isn't airtight as an instance could always disappear
# immediately after its successfully pinged.
checkPremptibleInstances() {
  for ipAddress in "${validatorIpList[@]}"; do
    (
      set -x
      # Without pipefail the pipeline's status is that of `tr`, which succeeds
      # even when the ping fails or times out, so the warning could never fire.
      set -o pipefail
      timeout 5s ping -c 1 "$ipAddress" | tr - _
    ) || {
      cat <<EOF

Warning: $ipAddress may have been preempted.

Run |./gce.sh config| to restart it
EOF
      exit 1
    }
  done
}

# Every command starts with a reachability check of the validators.
checkPremptibleInstances

# Dispatch on the command given as the first command-line argument.
case $command in
restart)
  prepare_deploy
  stop
  deploy
  ;;
start)
  prepare_deploy
  deploy
  ;;
sanity)
  sanity
  ;;
stop)
  stop
  ;;
update)
  deployUpdate
  ;;
stopnode)
  # usage exits with status 1 when it is given a message.
  [[ -n $nodeAddress ]] || usage "node address (-i) not specified"
  stopNode "$nodeAddress" true
  ;;
startnode)
  [[ -n $nodeAddress ]] || usage "node address (-i) not specified"
  nodeType=
  nodeIndex=
  getNodeType
  startNode "$nodeAddress" $nodeType $nodeIndex
  ;;
logs)
  initLogDir
  # Copies solana/<log>.log from a node into the local log directory.
  # A failed copy is reported but is not fatal.
  #   $1 - address of the node, $2 - log name without the .log suffix
  fetchRemoteLog() {
    declare ipAddress=$1
    declare log=$2
    echo "--- fetching $log from $ipAddress"
    (
      set -x
      timeout 30s scp "${sshOptions[@]}" \
        "$ipAddress":solana/"$log".log "$netLogDir"/remote-"$log"-"$ipAddress".log
    ) || echo "failed to fetch log"
  }
  fetchRemoteLog "${validatorIpList[0]}" drone
  for ipAddress in "${validatorIpList[@]}"; do
    fetchRemoteLog "$ipAddress" validator
  done
  for ipAddress in "${clientIpList[@]}"; do
    fetchRemoteLog "$ipAddress" client
  done
  for ipAddress in "${blockstreamerIpList[@]}"; do
    fetchRemoteLog "$ipAddress" validator
  done
  for ipAddress in "${archiverIpList[@]}"; do
    fetchRemoteLog "$ipAddress" validator
  done
  ;;
netem)
  if [[ -n $netemConfigFile ]]; then
    # Config-file mode: copy the file to every validator (for "add" only) and
    # run net-shaper.sh on each with the validator count and its own index.
    if [[ $netemCommand = "add" ]]; then
      for ipAddress in "${validatorIpList[@]}"; do
        "$here"/scp.sh "$netemConfigFile" solana@"$ipAddress":~/solana
      done
    fi
    for i in "${!validatorIpList[@]}"; do
      "$here"/ssh.sh solana@"${validatorIpList[$i]}" 'solana/scripts/net-shaper.sh' \
        "$netemCommand" ~solana/solana/"$netemConfigFile" "${#validatorIpList[@]}" "$i"
    done
  else
    # Percentage mode: netemPartition percent of the validators, rounded up
    # and capped at the number of validators.
    num_nodes=$((${#validatorIpList[@]} * netemPartition / 100))
    if [[ $((${#validatorIpList[@]} * netemPartition % 100)) -gt 0 ]]; then
      num_nodes=$((num_nodes + 1))
    fi
    if [[ "$num_nodes" -gt "${#validatorIpList[@]}" ]]; then
      num_nodes=${#validatorIpList[@]}
    fi

    # Stop netem on all nodes
    for ipAddress in "${validatorIpList[@]}"; do
      "$here"/ssh.sh solana@"$ipAddress" 'solana/scripts/netem.sh delete < solana/netem.cfg || true'
    done

    # Start netem on required nodes
    for ((i = 0; i < num_nodes; i++)); do
      "$here"/ssh.sh solana@"${validatorIpList[$i]}" "echo $netemConfig > solana/netem.cfg; solana/scripts/netem.sh add \"$netemConfig\""
    done
  fi
  ;;
*)
  # Passing a message makes usage exit with status 1.  Calling it without one
  # would exit 0 and report success for an unknown command.
  usage "Unknown command: $command"
  ;;
esac