GCE-based nodes now reboot on maintenance events instead of terminating (#5861)

This commit is contained in:
Michael Vines 2019-09-10 12:30:06 -07:00 committed by GitHub
parent 0d7efe5176
commit fc4aa71193
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 96 additions and 44 deletions

View File

@ -598,6 +598,11 @@ EOF
# autogenerated at $(date)
set -ex
if [[ -f /solana-scratch/.instance-startup-complete ]]; then
# Skip on instance reboot: the marker file already exists, so exit successfully
# without repeating the rest of this startup script.
# NOTE(review): presumably the marker is written at the end of an earlier,
# completed run of this script -- confirm where it is created.
exit 0
fi
cat > /etc/motd <<EOM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

View File

@ -3,8 +3,6 @@ set -e
cd "$(dirname "$0")"/../..
echo "$(date) | $0 $*" > client.log
deployMethod="$1"
entrypointIp="$2"
clientToRun="$3"
@ -49,17 +47,6 @@ skip)
exit 1
esac
# Launch the three monitoring scripts in the background. Each one logs to its
# own .log file and the PID of the background job is stored in a .pid file.

# OOM monitor: run through sudo, with the metrics configuration passed on the
# sudo command line. The PID written below is that of the backgrounded
# subshell wrapping the sudo invocation.
(
sudo SOLANA_METRICS_CONFIG="$SOLANA_METRICS_CONFIG" scripts/oom-monitor.sh
) > oom-monitor.log 2>&1 &
echo $! > oom-monitor.pid
# File-descriptor monitor.
scripts/fd-monitor.sh > fd-monitor.log 2>&1 &
echo $! > fd-monitor.pid
# Network statistics monitor.
scripts/net-stats.sh > net-stats.log 2>&1 &
echo $! > net-stats.pid
# If "tmux list-sessions" succeeds, kill a session before starting a new
# client. When it fails, the negation makes the left side true, so the
# kill is skipped and the whole line still returns success.
! tmux list-sessions || tmux kill-session
case $clientToRun in
solana-bench-tps)
net/scripts/rsync-retry.sh -vPrc \
@ -97,6 +84,26 @@ solana-bench-exchange)
exit 1
esac
cat > ~/solana/on-reboot <<EOF
#!/usr/bin/env bash
cd ~/solana
PATH="$HOME"/.cargo/bin:"$PATH"
export USE_INSTALL=1
echo "$(date) | $0 $*" >> client.log
# Background monitors. Each PID expansion below is escaped in the generator so
# that it is evaluated when on-reboot runs, not while this file is written.
(
sudo SOLANA_METRICS_CONFIG="$SOLANA_METRICS_CONFIG" scripts/oom-monitor.sh
) > oom-monitor.log 2>&1 &
echo \$! > oom-monitor.pid
scripts/fd-monitor.sh > fd-monitor.log 2>&1 &
echo \$! > fd-monitor.pid
scripts/net-stats.sh > net-stats.log 2>&1 &
echo \$! > net-stats.pid
! tmux list-sessions || tmux kill-session
tmux new -s "$clientToRun" -d "
while true; do
echo === Client start: \$(date) | tee -a client.log
@ -106,5 +113,11 @@ tmux new -s "$clientToRun" -d "
$metricsWriteDatapoint 'testnet-deploy client-complete=1'
done
"
EOF
# Make the generated script executable.
chmod +x ~/solana/on-reboot
# Register it as an @reboot cron job. "crontab -" installs stdin as the user's
# entire crontab, so any entries that were there before are replaced.
echo "@reboot ~/solana/on-reboot" | crontab -
# Run it once now, so the first start and every later reboot use the same
# script.
~/solana/on-reboot
# Give the tmux session a moment, then print the pane contents to stdout,
# starting up to 100 lines back in its history.
sleep 1
tmux capture-pane -t "$clientToRun" -p -S -100

View File

@ -59,11 +59,19 @@ genesisOptions="$genesisOptions"
airdropsEnabled=$airdropsEnabled
EOF
# Source the helper scripts, then call loadConfigFile.
# NOTE(review): oom_score_adj and loadConfigFile are presumably defined by the
# two sourced files -- confirm.
source scripts/oom-score-adj.sh
source net/common.sh
loadConfigFile
# File name later handed to the validator through --init-complete-file.
initCompleteFile=init-complete-node.log
# Start a fresh ~/solana/on-reboot (">" truncates any earlier copy) holding only
# the common preamble; later sections of this script append to it with ">>".
cat > ~/solana/on-reboot <<EOF
#!/usr/bin/env bash
cd ~/solana
source scripts/oom-score-adj.sh
EOF
chmod +x ~/solana/on-reboot
# Register on-reboot as the @reboot cron job. "crontab -" installs stdin as the
# user's entire crontab, so any existing entries are replaced.
echo "@reboot ~/solana/on-reboot" | crontab -
waitForNodeToInit() {
echo "--- waiting for node to boot up"
SECONDS=
@ -87,6 +95,13 @@ local|tar|skip)
./fetch-perf-libs.sh
# shellcheck source=/dev/null
source ./target/perf-libs/env.sh
cat >> ~/solana/on-reboot <<EOF
PATH="$HOME"/.cargo/bin:"$PATH"
export USE_INSTALL=1
# shellcheck source=/dev/null
source ./target/perf-libs/env.sh
SUDO_OK=1 source scripts/tune-system.sh
(
@ -98,12 +113,14 @@ local|tar|skip)
# The PID expansion is escaped in the generator so that it is evaluated when
# on-reboot runs, not while this file is written.
scripts/net-stats.sh > net-stats.log 2>&1 &
echo \$! > net-stats.pid
# Select the CUDA validator only when both the NVIDIA device node exists and
# the CUDA validator binary is installed and executable.
if [[ -e /dev/nvidia0 && -x ~/.cargo/bin/solana-validator-cuda ]]; then
echo Selecting solana-validator-cuda
export SOLANA_CUDA=1
fi
EOF
case $nodeType in
bootstrap-leader)
if [[ -e /dev/nvidia0 && -x ~/.cargo/bin/solana-validator-cuda ]]; then
echo Selecting solana-validator-cuda
export SOLANA_CUDA=1
fi
set -x
if [[ $skipSetup != true ]]; then
rm -rf ./solana-node-keys
@ -175,16 +192,24 @@ EOF
)
# When airdrops are enabled, append a line to on-reboot that starts the drone
# in the background with its output captured in drone.log.
if [[ $airdropsEnabled = true ]]; then
cat >> ~/solana/on-reboot <<EOF
./multinode-demo/drone.sh > drone.log 2>&1 &
EOF
fi
# shellcheck disable=SC2206 # Don't want to double quote $extraNodeArgs
args+=($extraNodeArgs)
nohup ./multinode-demo/bootstrap-leader.sh "${args[@]}" > fullnode.log 2>&1 &
pid=$!
oom_score_adj "$pid" 1000
# Append the bootstrap-leader launch to on-reboot.
# Every element of args is shell-quoted (printf %q) before it is embedded, so
# an argument containing whitespace, quotes, globs or other shell
# metacharacters is passed after a reboot as the same single word it is in the
# args array. The length check avoids emitting a stray empty argument when
# args has no elements.
quotedArgs=
if ((${#args[@]})); then
  printf -v quotedArgs '%q ' "${args[@]}"
fi
cat >> ~/solana/on-reboot <<EOF
nohup ./multinode-demo/bootstrap-leader.sh $quotedArgs > fullnode.log 2>&1 &
pid=\$!
oom_score_adj "\$pid" 1000
disown
EOF
~/solana/on-reboot
waitForNodeToInit
solana --url http://"$entrypointIp":8899 -k ~/solana/fullnode-identity.json \
solana --url http://"$entrypointIp":8899 \
--keypair ~/solana/config/bootstrap-leader/identity-keypair.json \
validator-info publish "$(hostname)" -n team/solana --force || true
;;
validator|blockstreamer)
@ -197,11 +222,6 @@ EOF
"$entrypointIp":~/solana/solana-node-keys/"$nodeIndex" ~/solana/fullnode-identity.json
fi
if [[ -e /dev/nvidia0 && -x ~/.cargo/bin/solana-validator-cuda ]]; then
echo Selecting solana-validator-cuda
export SOLANA_CUDA=1
fi
args=(
--entrypoint "$entrypointIp:8001"
--gossip-port 8001
@ -240,7 +260,9 @@ EOF
# a location that somebody would expect to be able to airdrop from
scp "$entrypointIp":~/solana/config/mint-keypair.json config/
# When airdrops are enabled, append a line to on-reboot that starts the drone
# in the background with its output captured in drone.log.
if [[ $airdropsEnabled = true ]]; then
cat >> ~/solana/on-reboot <<EOF
./multinode-demo/drone.sh > drone.log 2>&1 &
EOF
fi
# Grab the TLS cert generated by /certbot-restore.sh
@ -249,30 +271,39 @@ EOF
ls -l .cert.pem .key.pem
fi
export BLOCKEXPLORER_GEOIP_WHITELIST=$PWD/net/config/geoip.yml
npm install @solana/blockexplorer@1
npx solana-blockexplorer > blockexplorer.log 2>&1 &
# Confirm the blockexplorer is accessible
curl --head --retry 3 --retry-connrefused http://localhost:5000/
cat >> ~/solana/on-reboot <<EOF
export BLOCKEXPLORER_GEOIP_WHITELIST=$PWD/net/config/geoip.yml
npx solana-blockexplorer > blockexplorer.log 2>&1 &
# Redirect port 80 to port 5000
sudo iptables -A INPUT -p tcp --dport 80 -j ACCEPT
sudo iptables -A INPUT -p tcp --dport 5000 -j ACCEPT
sudo iptables -A PREROUTING -t nat -p tcp --dport 80 -j REDIRECT --to-port 5000
# Confirm the blockexplorer is now globally accessible
curl --head "$(curl ifconfig.io)"
EOF
fi
args+=(--init-complete-file "$initCompleteFile")
# shellcheck disable=SC2206 # Don't want to double quote $extraNodeArgs
args+=($extraNodeArgs)
nohup ./multinode-demo/validator.sh "${args[@]}" > fullnode.log 2>&1 &
pid=$!
oom_score_adj "$pid" 1000
# Append the validator launch to on-reboot.
# Every element of args is shell-quoted (printf %q) before it is embedded, so
# an argument containing whitespace, quotes, globs or other shell
# metacharacters is passed after a reboot as the same single word it is in the
# args array. The length check avoids emitting a stray empty argument when
# args has no elements.
quotedArgs=
if ((${#args[@]})); then
  printf -v quotedArgs '%q ' "${args[@]}"
fi
cat >> ~/solana/on-reboot <<EOF
nohup ./multinode-demo/validator.sh $quotedArgs > fullnode.log 2>&1 &
pid=\$!
oom_score_adj "\$pid" 1000
disown
EOF
~/solana/on-reboot
waitForNodeToInit
if [[ $nodeType = blockstreamer ]]; then
# Confirm the blockexplorer is accessible locally on port 5000; curl retries
# up to 3 times and also retries when the connection is refused.
curl --head --retry 3 --retry-connrefused http://localhost:5000/
# Confirm the blockexplorer is now globally accessible: the inner curl asks
# ifconfig.io for an address (presumably this host's public IP -- confirm) and
# the outer curl requests only the response headers from that address.
curl --head "$(curl ifconfig.io)"
fi
if [[ $skipSetup != true && $nodeType != blockstreamer ]]; then
args=(
--url http://"$entrypointIp":8899
@ -289,7 +320,8 @@ EOF
./multinode-demo/delegate-stake.sh "${args[@]}"
fi
solana --url http://"$entrypointIp":8899 -k ~/solana/fullnode-identity.json \
solana --url http://"$entrypointIp":8899 \
--keypair ~/solana/fullnode-identity.json \
validator-info publish "$(hostname)" -n team/solana --force || true
;;
replicator)
@ -308,9 +340,13 @@ EOF
exit 1
fi
nohup ./multinode-demo/replicator.sh "${args[@]}" > fullnode.log 2>&1 &
pid=$!
oom_score_adj "$pid" 1000
# Append the replicator launch to on-reboot.
# Every element of args is shell-quoted (printf %q) before it is embedded, so
# an argument containing whitespace, quotes, globs or other shell
# metacharacters is passed after a reboot as the same single word it is in the
# args array. The length check avoids emitting a stray empty argument when
# args has no elements.
quotedArgs=
if ((${#args[@]})); then
  printf -v quotedArgs '%q ' "${args[@]}"
fi
cat >> ~/solana/on-reboot <<EOF
nohup ./multinode-demo/replicator.sh $quotedArgs > fullnode.log 2>&1 &
pid=\$!
oom_score_adj "\$pid" 1000
disown
EOF
~/solana/on-reboot
sleep 1
;;
*)
@ -318,9 +354,9 @@ EOF
exit 1
;;
esac
disown
;;
*)
echo "Unknown deployment method: $deployMethod"
exit 1
esac

View File

@ -167,8 +167,6 @@ cloud_CreateInstances() {
--tags testnet
--metadata "testnet=$networkName"
--image "$imageName"
--maintenance-policy TERMINATE
--no-restart-on-failure
)
# shellcheck disable=SC2206 # Do not want to quote $imageName as it may contain extra args