GCE-based nodes now reboot on maintenance events instead of terminating (#5861)
This commit is contained in:
parent
0d7efe5176
commit
fc4aa71193
|
@ -598,6 +598,11 @@ EOF
|
|||
# autogenerated at $(date)
|
||||
set -ex
|
||||
|
||||
if [[ -f /solana-scratch/.instance-startup-complete ]]; then
|
||||
# Skip on instance reboot
|
||||
exit 0
|
||||
fi
|
||||
|
||||
cat > /etc/motd <<EOM
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
|
|
|
@ -3,8 +3,6 @@ set -e
|
|||
|
||||
cd "$(dirname "$0")"/../..
|
||||
|
||||
echo "$(date) | $0 $*" > client.log
|
||||
|
||||
deployMethod="$1"
|
||||
entrypointIp="$2"
|
||||
clientToRun="$3"
|
||||
|
@ -49,17 +47,6 @@ skip)
|
|||
exit 1
|
||||
esac
|
||||
|
||||
(
|
||||
sudo SOLANA_METRICS_CONFIG="$SOLANA_METRICS_CONFIG" scripts/oom-monitor.sh
|
||||
) > oom-monitor.log 2>&1 &
|
||||
echo $! > oom-monitor.pid
|
||||
scripts/fd-monitor.sh > fd-monitor.log 2>&1 &
|
||||
echo $! > fd-monitor.pid
|
||||
scripts/net-stats.sh > net-stats.log 2>&1 &
|
||||
echo $! > net-stats.pid
|
||||
|
||||
! tmux list-sessions || tmux kill-session
|
||||
|
||||
case $clientToRun in
|
||||
solana-bench-tps)
|
||||
net/scripts/rsync-retry.sh -vPrc \
|
||||
|
@ -97,6 +84,26 @@ solana-bench-exchange)
|
|||
exit 1
|
||||
esac
|
||||
|
||||
|
||||
cat > ~/solana/on-reboot <<EOF
|
||||
#!/usr/bin/env bash
|
||||
cd ~/solana
|
||||
|
||||
PATH="$HOME"/.cargo/bin:"$PATH"
|
||||
export USE_INSTALL=1
|
||||
|
||||
echo "$(date) | $0 $*" >> client.log
|
||||
|
||||
# Record each monitor's PID when on-reboot runs (\$! must stay escaped inside this heredoc)
(
|
||||
sudo SOLANA_METRICS_CONFIG="$SOLANA_METRICS_CONFIG" scripts/oom-monitor.sh
|
||||
) > oom-monitor.log 2>&1 &
|
||||
echo \$! > oom-monitor.pid
|
||||
scripts/fd-monitor.sh > fd-monitor.log 2>&1 &
|
||||
echo \$! > fd-monitor.pid
|
||||
scripts/net-stats.sh > net-stats.log 2>&1 &
|
||||
echo \$! > net-stats.pid
|
||||
! tmux list-sessions || tmux kill-session
|
||||
|
||||
tmux new -s "$clientToRun" -d "
|
||||
while true; do
|
||||
echo === Client start: \$(date) | tee -a client.log
|
||||
|
@ -106,5 +113,11 @@ tmux new -s "$clientToRun" -d "
|
|||
$metricsWriteDatapoint 'testnet-deploy client-complete=1'
|
||||
done
|
||||
"
|
||||
EOF
|
||||
chmod +x ~/solana/on-reboot
|
||||
echo "@reboot ~/solana/on-reboot" | crontab -
|
||||
|
||||
~/solana/on-reboot
|
||||
|
||||
sleep 1
|
||||
tmux capture-pane -t "$clientToRun" -p -S -100
|
||||
|
|
|
@ -59,11 +59,19 @@ genesisOptions="$genesisOptions"
|
|||
airdropsEnabled=$airdropsEnabled
|
||||
EOF
|
||||
|
||||
source scripts/oom-score-adj.sh
|
||||
source net/common.sh
|
||||
loadConfigFile
|
||||
|
||||
initCompleteFile=init-complete-node.log
|
||||
|
||||
cat > ~/solana/on-reboot <<EOF
|
||||
#!/usr/bin/env bash
|
||||
cd ~/solana
|
||||
source scripts/oom-score-adj.sh
|
||||
EOF
|
||||
chmod +x ~/solana/on-reboot
|
||||
echo "@reboot ~/solana/on-reboot" | crontab -
|
||||
|
||||
waitForNodeToInit() {
|
||||
echo "--- waiting for node to boot up"
|
||||
SECONDS=
|
||||
|
@ -87,6 +95,13 @@ local|tar|skip)
|
|||
./fetch-perf-libs.sh
|
||||
# shellcheck source=/dev/null
|
||||
source ./target/perf-libs/env.sh
|
||||
|
||||
cat >> ~/solana/on-reboot <<EOF
|
||||
PATH="$HOME"/.cargo/bin:"$PATH"
|
||||
export USE_INSTALL=1
|
||||
|
||||
# shellcheck source=/dev/null
|
||||
source ./target/perf-libs/env.sh
|
||||
SUDO_OK=1 source scripts/tune-system.sh
|
||||
|
||||
(
|
||||
|
@ -98,12 +113,14 @@ local|tar|skip)
|
|||
scripts/net-stats.sh > net-stats.log 2>&1 &
|
||||
echo $! > net-stats.pid
|
||||
|
||||
if [[ -e /dev/nvidia0 && -x ~/.cargo/bin/solana-validator-cuda ]]; then
|
||||
echo Selecting solana-validator-cuda
|
||||
export SOLANA_CUDA=1
|
||||
fi
|
||||
EOF
|
||||
|
||||
case $nodeType in
|
||||
bootstrap-leader)
|
||||
if [[ -e /dev/nvidia0 && -x ~/.cargo/bin/solana-validator-cuda ]]; then
|
||||
echo Selecting solana-validator-cuda
|
||||
export SOLANA_CUDA=1
|
||||
fi
|
||||
set -x
|
||||
if [[ $skipSetup != true ]]; then
|
||||
rm -rf ./solana-node-keys
|
||||
|
@ -175,16 +192,24 @@ EOF
|
|||
)
|
||||
|
||||
if [[ $airdropsEnabled = true ]]; then
|
||||
cat >> ~/solana/on-reboot <<EOF
|
||||
./multinode-demo/drone.sh > drone.log 2>&1 &
|
||||
EOF
|
||||
fi
|
||||
# shellcheck disable=SC2206 # Don't want to double quote $extraNodeArgs
|
||||
args+=($extraNodeArgs)
|
||||
nohup ./multinode-demo/bootstrap-leader.sh "${args[@]}" > fullnode.log 2>&1 &
|
||||
pid=$!
|
||||
oom_score_adj "$pid" 1000
|
||||
|
||||
cat >> ~/solana/on-reboot <<EOF
|
||||
nohup ./multinode-demo/bootstrap-leader.sh ${args[@]} > fullnode.log 2>&1 &
|
||||
pid=\$!
|
||||
oom_score_adj "\$pid" 1000
|
||||
disown
|
||||
EOF
|
||||
~/solana/on-reboot
|
||||
waitForNodeToInit
|
||||
|
||||
solana --url http://"$entrypointIp":8899 -k ~/solana/fullnode-identity.json \
|
||||
solana --url http://"$entrypointIp":8899 \
|
||||
--keypair ~/solana/config/bootstrap-leader/identity-keypair.json \
|
||||
validator-info publish "$(hostname)" -n team/solana --force || true
|
||||
;;
|
||||
validator|blockstreamer)
|
||||
|
@ -197,11 +222,6 @@ EOF
|
|||
"$entrypointIp":~/solana/solana-node-keys/"$nodeIndex" ~/solana/fullnode-identity.json
|
||||
fi
|
||||
|
||||
if [[ -e /dev/nvidia0 && -x ~/.cargo/bin/solana-validator-cuda ]]; then
|
||||
echo Selecting solana-validator-cuda
|
||||
export SOLANA_CUDA=1
|
||||
fi
|
||||
|
||||
args=(
|
||||
--entrypoint "$entrypointIp:8001"
|
||||
--gossip-port 8001
|
||||
|
@ -240,7 +260,9 @@ EOF
|
|||
# a location that somebody would expect to be able to airdrop from
|
||||
scp "$entrypointIp":~/solana/config/mint-keypair.json config/
|
||||
if [[ $airdropsEnabled = true ]]; then
|
||||
cat >> ~/solana/on-reboot <<EOF
|
||||
./multinode-demo/drone.sh > drone.log 2>&1 &
|
||||
EOF
|
||||
fi
|
||||
|
||||
# Grab the TLS cert generated by /certbot-restore.sh
|
||||
|
@ -249,30 +271,39 @@ EOF
|
|||
ls -l .cert.pem .key.pem
|
||||
fi
|
||||
|
||||
export BLOCKEXPLORER_GEOIP_WHITELIST=$PWD/net/config/geoip.yml
|
||||
npm install @solana/blockexplorer@1
|
||||
npx solana-blockexplorer > blockexplorer.log 2>&1 &
|
||||
|
||||
# Confirm the blockexplorer is accessible
|
||||
curl --head --retry 3 --retry-connrefused http://localhost:5000/
|
||||
cat >> ~/solana/on-reboot <<EOF
|
||||
export BLOCKEXPLORER_GEOIP_WHITELIST=$PWD/net/config/geoip.yml
|
||||
npx solana-blockexplorer > blockexplorer.log 2>&1 &
|
||||
|
||||
# Redirect port 80 to port 5000
|
||||
sudo iptables -A INPUT -p tcp --dport 80 -j ACCEPT
|
||||
sudo iptables -A INPUT -p tcp --dport 5000 -j ACCEPT
|
||||
sudo iptables -A PREROUTING -t nat -p tcp --dport 80 -j REDIRECT --to-port 5000
|
||||
|
||||
# Confirm the blockexplorer is now globally accessible
|
||||
curl --head "$(curl ifconfig.io)"
|
||||
EOF
|
||||
fi
|
||||
|
||||
args+=(--init-complete-file "$initCompleteFile")
|
||||
# shellcheck disable=SC2206 # Don't want to double quote $extraNodeArgs
|
||||
args+=($extraNodeArgs)
|
||||
nohup ./multinode-demo/validator.sh "${args[@]}" > fullnode.log 2>&1 &
|
||||
pid=$!
|
||||
oom_score_adj "$pid" 1000
|
||||
cat >> ~/solana/on-reboot <<EOF
|
||||
nohup ./multinode-demo/validator.sh ${args[@]} > fullnode.log 2>&1 &
|
||||
pid=\$!
|
||||
oom_score_adj "\$pid" 1000
|
||||
disown
|
||||
EOF
|
||||
~/solana/on-reboot
|
||||
waitForNodeToInit
|
||||
|
||||
if [[ $nodeType = blockstreamer ]]; then
|
||||
# Confirm the blockexplorer is accessible
|
||||
curl --head --retry 3 --retry-connrefused http://localhost:5000/
|
||||
|
||||
# Confirm the blockexplorer is now globally accessible
|
||||
curl --head "$(curl ifconfig.io)"
|
||||
fi
|
||||
|
||||
if [[ $skipSetup != true && $nodeType != blockstreamer ]]; then
|
||||
args=(
|
||||
--url http://"$entrypointIp":8899
|
||||
|
@ -289,7 +320,8 @@ EOF
|
|||
./multinode-demo/delegate-stake.sh "${args[@]}"
|
||||
fi
|
||||
|
||||
solana --url http://"$entrypointIp":8899 -k ~/solana/fullnode-identity.json \
|
||||
solana --url http://"$entrypointIp":8899 \
|
||||
--keypair ~/solana/fullnode-identity.json \
|
||||
validator-info publish "$(hostname)" -n team/solana --force || true
|
||||
;;
|
||||
replicator)
|
||||
|
@ -308,9 +340,13 @@ EOF
|
|||
exit 1
|
||||
fi
|
||||
|
||||
nohup ./multinode-demo/replicator.sh "${args[@]}" > fullnode.log 2>&1 &
|
||||
pid=$!
|
||||
oom_score_adj "$pid" 1000
|
||||
cat >> ~/solana/on-reboot <<EOF
|
||||
nohup ./multinode-demo/replicator.sh ${args[@]} > fullnode.log 2>&1 &
|
||||
pid=\$!
|
||||
oom_score_adj "\$pid" 1000
|
||||
disown
|
||||
EOF
|
||||
~/solana/on-reboot
|
||||
sleep 1
|
||||
;;
|
||||
*)
|
||||
|
@ -318,9 +354,9 @@ EOF
|
|||
exit 1
|
||||
;;
|
||||
esac
|
||||
disown
|
||||
;;
|
||||
*)
|
||||
echo "Unknown deployment method: $deployMethod"
|
||||
exit 1
|
||||
esac
|
||||
|
||||
|
|
|
@ -167,8 +167,6 @@ cloud_CreateInstances() {
|
|||
--tags testnet
|
||||
--metadata "testnet=$networkName"
|
||||
--image "$imageName"
|
||||
--maintenance-policy TERMINATE
|
||||
--no-restart-on-failure
|
||||
)
|
||||
|
||||
# shellcheck disable=SC2206 # Do not want to quote $imageName as it may contain extra args
|
||||
|
|
Loading…
Reference in New Issue