GCE-based nodes now reboot on maintenance events instead of terminating (#5861)
This commit is contained in:
parent
0d7efe5176
commit
fc4aa71193
|
@ -598,6 +598,11 @@ EOF
|
||||||
# autogenerated at $(date)
|
# autogenerated at $(date)
|
||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
|
if [[ -f /solana-scratch/.instance-startup-complete ]]; then
|
||||||
|
# Skip on instance reboot
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
cat > /etc/motd <<EOM
|
cat > /etc/motd <<EOM
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
|
||||||
|
|
|
@ -3,8 +3,6 @@ set -e
|
||||||
|
|
||||||
cd "$(dirname "$0")"/../..
|
cd "$(dirname "$0")"/../..
|
||||||
|
|
||||||
echo "$(date) | $0 $*" > client.log
|
|
||||||
|
|
||||||
deployMethod="$1"
|
deployMethod="$1"
|
||||||
entrypointIp="$2"
|
entrypointIp="$2"
|
||||||
clientToRun="$3"
|
clientToRun="$3"
|
||||||
|
@ -49,17 +47,6 @@ skip)
|
||||||
exit 1
|
exit 1
|
||||||
esac
|
esac
|
||||||
|
|
||||||
(
|
|
||||||
sudo SOLANA_METRICS_CONFIG="$SOLANA_METRICS_CONFIG" scripts/oom-monitor.sh
|
|
||||||
) > oom-monitor.log 2>&1 &
|
|
||||||
echo $! > oom-monitor.pid
|
|
||||||
scripts/fd-monitor.sh > fd-monitor.log 2>&1 &
|
|
||||||
echo $! > fd-monitor.pid
|
|
||||||
scripts/net-stats.sh > net-stats.log 2>&1 &
|
|
||||||
echo $! > net-stats.pid
|
|
||||||
|
|
||||||
! tmux list-sessions || tmux kill-session
|
|
||||||
|
|
||||||
case $clientToRun in
|
case $clientToRun in
|
||||||
solana-bench-tps)
|
solana-bench-tps)
|
||||||
net/scripts/rsync-retry.sh -vPrc \
|
net/scripts/rsync-retry.sh -vPrc \
|
||||||
|
@ -97,6 +84,26 @@ solana-bench-exchange)
|
||||||
exit 1
|
exit 1
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
|
||||||
|
cat > ~/solana/on-reboot <<EOF
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
cd ~/solana
|
||||||
|
|
||||||
|
PATH="$HOME"/.cargo/bin:"$PATH"
|
||||||
|
export USE_INSTALL=1
|
||||||
|
|
||||||
|
echo "$(date) | $0 $*" >> client.log
|
||||||
|
|
||||||
|
(
|
||||||
|
sudo SOLANA_METRICS_CONFIG="$SOLANA_METRICS_CONFIG" scripts/oom-monitor.sh
|
||||||
|
) > oom-monitor.log 2>&1 &
|
||||||
|
echo $! > oom-monitor.pid
|
||||||
|
scripts/fd-monitor.sh > fd-monitor.log 2>&1 &
|
||||||
|
echo $! > fd-monitor.pid
|
||||||
|
scripts/net-stats.sh > net-stats.log 2>&1 &
|
||||||
|
echo $! > net-stats.pid
|
||||||
|
! tmux list-sessions || tmux kill-session
|
||||||
|
|
||||||
tmux new -s "$clientToRun" -d "
|
tmux new -s "$clientToRun" -d "
|
||||||
while true; do
|
while true; do
|
||||||
echo === Client start: \$(date) | tee -a client.log
|
echo === Client start: \$(date) | tee -a client.log
|
||||||
|
@ -106,5 +113,11 @@ tmux new -s "$clientToRun" -d "
|
||||||
$metricsWriteDatapoint 'testnet-deploy client-complete=1'
|
$metricsWriteDatapoint 'testnet-deploy client-complete=1'
|
||||||
done
|
done
|
||||||
"
|
"
|
||||||
|
EOF
|
||||||
|
chmod +x ~/solana/on-reboot
|
||||||
|
echo "@reboot ~/solana/on-reboot" | crontab -
|
||||||
|
|
||||||
|
~/solana/on-reboot
|
||||||
|
|
||||||
sleep 1
|
sleep 1
|
||||||
tmux capture-pane -t "$clientToRun" -p -S -100
|
tmux capture-pane -t "$clientToRun" -p -S -100
|
||||||
|
|
|
@ -59,11 +59,19 @@ genesisOptions="$genesisOptions"
|
||||||
airdropsEnabled=$airdropsEnabled
|
airdropsEnabled=$airdropsEnabled
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
source scripts/oom-score-adj.sh
|
|
||||||
source net/common.sh
|
source net/common.sh
|
||||||
loadConfigFile
|
loadConfigFile
|
||||||
|
|
||||||
initCompleteFile=init-complete-node.log
|
initCompleteFile=init-complete-node.log
|
||||||
|
|
||||||
|
cat > ~/solana/on-reboot <<EOF
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
cd ~/solana
|
||||||
|
source scripts/oom-score-adj.sh
|
||||||
|
EOF
|
||||||
|
chmod +x ~/solana/on-reboot
|
||||||
|
echo "@reboot ~/solana/on-reboot" | crontab -
|
||||||
|
|
||||||
waitForNodeToInit() {
|
waitForNodeToInit() {
|
||||||
echo "--- waiting for node to boot up"
|
echo "--- waiting for node to boot up"
|
||||||
SECONDS=
|
SECONDS=
|
||||||
|
@ -87,6 +95,13 @@ local|tar|skip)
|
||||||
./fetch-perf-libs.sh
|
./fetch-perf-libs.sh
|
||||||
# shellcheck source=/dev/null
|
# shellcheck source=/dev/null
|
||||||
source ./target/perf-libs/env.sh
|
source ./target/perf-libs/env.sh
|
||||||
|
|
||||||
|
cat >> ~/solana/on-reboot <<EOF
|
||||||
|
PATH="$HOME"/.cargo/bin:"$PATH"
|
||||||
|
export USE_INSTALL=1
|
||||||
|
|
||||||
|
# shellcheck source=/dev/null
|
||||||
|
source ./target/perf-libs/env.sh
|
||||||
SUDO_OK=1 source scripts/tune-system.sh
|
SUDO_OK=1 source scripts/tune-system.sh
|
||||||
|
|
||||||
(
|
(
|
||||||
|
@ -98,12 +113,14 @@ local|tar|skip)
|
||||||
scripts/net-stats.sh > net-stats.log 2>&1 &
|
scripts/net-stats.sh > net-stats.log 2>&1 &
|
||||||
echo $! > net-stats.pid
|
echo $! > net-stats.pid
|
||||||
|
|
||||||
|
if [[ -e /dev/nvidia0 && -x ~/.cargo/bin/solana-validator-cuda ]]; then
|
||||||
|
echo Selecting solana-validator-cuda
|
||||||
|
export SOLANA_CUDA=1
|
||||||
|
fi
|
||||||
|
EOF
|
||||||
|
|
||||||
case $nodeType in
|
case $nodeType in
|
||||||
bootstrap-leader)
|
bootstrap-leader)
|
||||||
if [[ -e /dev/nvidia0 && -x ~/.cargo/bin/solana-validator-cuda ]]; then
|
|
||||||
echo Selecting solana-validator-cuda
|
|
||||||
export SOLANA_CUDA=1
|
|
||||||
fi
|
|
||||||
set -x
|
set -x
|
||||||
if [[ $skipSetup != true ]]; then
|
if [[ $skipSetup != true ]]; then
|
||||||
rm -rf ./solana-node-keys
|
rm -rf ./solana-node-keys
|
||||||
|
@ -175,16 +192,24 @@ EOF
|
||||||
)
|
)
|
||||||
|
|
||||||
if [[ $airdropsEnabled = true ]]; then
|
if [[ $airdropsEnabled = true ]]; then
|
||||||
|
cat >> ~/solana/on-reboot <<EOF
|
||||||
./multinode-demo/drone.sh > drone.log 2>&1 &
|
./multinode-demo/drone.sh > drone.log 2>&1 &
|
||||||
|
EOF
|
||||||
fi
|
fi
|
||||||
# shellcheck disable=SC2206 # Don't want to double quote $extraNodeArgs
|
# shellcheck disable=SC2206 # Don't want to double quote $extraNodeArgs
|
||||||
args+=($extraNodeArgs)
|
args+=($extraNodeArgs)
|
||||||
nohup ./multinode-demo/bootstrap-leader.sh "${args[@]}" > fullnode.log 2>&1 &
|
|
||||||
pid=$!
|
cat >> ~/solana/on-reboot <<EOF
|
||||||
oom_score_adj "$pid" 1000
|
nohup ./multinode-demo/bootstrap-leader.sh ${args[@]} > fullnode.log 2>&1 &
|
||||||
|
pid=\$!
|
||||||
|
oom_score_adj "\$pid" 1000
|
||||||
|
disown
|
||||||
|
EOF
|
||||||
|
~/solana/on-reboot
|
||||||
waitForNodeToInit
|
waitForNodeToInit
|
||||||
|
|
||||||
solana --url http://"$entrypointIp":8899 -k ~/solana/fullnode-identity.json \
|
solana --url http://"$entrypointIp":8899 \
|
||||||
|
--keypair ~/solana/config/bootstrap-leader/identity-keypair.json \
|
||||||
validator-info publish "$(hostname)" -n team/solana --force || true
|
validator-info publish "$(hostname)" -n team/solana --force || true
|
||||||
;;
|
;;
|
||||||
validator|blockstreamer)
|
validator|blockstreamer)
|
||||||
|
@ -197,11 +222,6 @@ EOF
|
||||||
"$entrypointIp":~/solana/solana-node-keys/"$nodeIndex" ~/solana/fullnode-identity.json
|
"$entrypointIp":~/solana/solana-node-keys/"$nodeIndex" ~/solana/fullnode-identity.json
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ -e /dev/nvidia0 && -x ~/.cargo/bin/solana-validator-cuda ]]; then
|
|
||||||
echo Selecting solana-validator-cuda
|
|
||||||
export SOLANA_CUDA=1
|
|
||||||
fi
|
|
||||||
|
|
||||||
args=(
|
args=(
|
||||||
--entrypoint "$entrypointIp:8001"
|
--entrypoint "$entrypointIp:8001"
|
||||||
--gossip-port 8001
|
--gossip-port 8001
|
||||||
|
@ -240,7 +260,9 @@ EOF
|
||||||
# a location that somebody would expect to be able to airdrop from
|
# a location that somebody would expect to be able to airdrop from
|
||||||
scp "$entrypointIp":~/solana/config/mint-keypair.json config/
|
scp "$entrypointIp":~/solana/config/mint-keypair.json config/
|
||||||
if [[ $airdropsEnabled = true ]]; then
|
if [[ $airdropsEnabled = true ]]; then
|
||||||
|
cat >> ~/solana/on-reboot <<EOF
|
||||||
./multinode-demo/drone.sh > drone.log 2>&1 &
|
./multinode-demo/drone.sh > drone.log 2>&1 &
|
||||||
|
EOF
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Grab the TLS cert generated by /certbot-restore.sh
|
# Grab the TLS cert generated by /certbot-restore.sh
|
||||||
|
@ -249,30 +271,39 @@ EOF
|
||||||
ls -l .cert.pem .key.pem
|
ls -l .cert.pem .key.pem
|
||||||
fi
|
fi
|
||||||
|
|
||||||
export BLOCKEXPLORER_GEOIP_WHITELIST=$PWD/net/config/geoip.yml
|
|
||||||
npm install @solana/blockexplorer@1
|
npm install @solana/blockexplorer@1
|
||||||
npx solana-blockexplorer > blockexplorer.log 2>&1 &
|
|
||||||
|
|
||||||
# Confirm the blockexplorer is accessible
|
cat >> ~/solana/on-reboot <<EOF
|
||||||
curl --head --retry 3 --retry-connrefused http://localhost:5000/
|
export BLOCKEXPLORER_GEOIP_WHITELIST=$PWD/net/config/geoip.yml
|
||||||
|
npx solana-blockexplorer > blockexplorer.log 2>&1 &
|
||||||
|
|
||||||
# Redirect port 80 to port 5000
|
# Redirect port 80 to port 5000
|
||||||
sudo iptables -A INPUT -p tcp --dport 80 -j ACCEPT
|
sudo iptables -A INPUT -p tcp --dport 80 -j ACCEPT
|
||||||
sudo iptables -A INPUT -p tcp --dport 5000 -j ACCEPT
|
sudo iptables -A INPUT -p tcp --dport 5000 -j ACCEPT
|
||||||
sudo iptables -A PREROUTING -t nat -p tcp --dport 80 -j REDIRECT --to-port 5000
|
sudo iptables -A PREROUTING -t nat -p tcp --dport 80 -j REDIRECT --to-port 5000
|
||||||
|
EOF
|
||||||
# Confirm the blockexplorer is now globally accessible
|
|
||||||
curl --head "$(curl ifconfig.io)"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
args+=(--init-complete-file "$initCompleteFile")
|
args+=(--init-complete-file "$initCompleteFile")
|
||||||
# shellcheck disable=SC2206 # Don't want to double quote $extraNodeArgs
|
# shellcheck disable=SC2206 # Don't want to double quote $extraNodeArgs
|
||||||
args+=($extraNodeArgs)
|
args+=($extraNodeArgs)
|
||||||
nohup ./multinode-demo/validator.sh "${args[@]}" > fullnode.log 2>&1 &
|
cat >> ~/solana/on-reboot <<EOF
|
||||||
pid=$!
|
nohup ./multinode-demo/validator.sh ${args[@]} > fullnode.log 2>&1 &
|
||||||
oom_score_adj "$pid" 1000
|
pid=\$!
|
||||||
|
oom_score_adj "\$pid" 1000
|
||||||
|
disown
|
||||||
|
EOF
|
||||||
|
~/solana/on-reboot
|
||||||
waitForNodeToInit
|
waitForNodeToInit
|
||||||
|
|
||||||
|
if [[ $nodeType = blockstreamer ]]; then
|
||||||
|
# Confirm the blockexplorer is accessible
|
||||||
|
curl --head --retry 3 --retry-connrefused http://localhost:5000/
|
||||||
|
|
||||||
|
# Confirm the blockexplorer is now globally accessible
|
||||||
|
curl --head "$(curl ifconfig.io)"
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ $skipSetup != true && $nodeType != blockstreamer ]]; then
|
if [[ $skipSetup != true && $nodeType != blockstreamer ]]; then
|
||||||
args=(
|
args=(
|
||||||
--url http://"$entrypointIp":8899
|
--url http://"$entrypointIp":8899
|
||||||
|
@ -289,7 +320,8 @@ EOF
|
||||||
./multinode-demo/delegate-stake.sh "${args[@]}"
|
./multinode-demo/delegate-stake.sh "${args[@]}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
solana --url http://"$entrypointIp":8899 -k ~/solana/fullnode-identity.json \
|
solana --url http://"$entrypointIp":8899 \
|
||||||
|
--keypair ~/solana/fullnode-identity.json \
|
||||||
validator-info publish "$(hostname)" -n team/solana --force || true
|
validator-info publish "$(hostname)" -n team/solana --force || true
|
||||||
;;
|
;;
|
||||||
replicator)
|
replicator)
|
||||||
|
@ -308,9 +340,13 @@ EOF
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
nohup ./multinode-demo/replicator.sh "${args[@]}" > fullnode.log 2>&1 &
|
cat >> ~/solana/on-reboot <<EOF
|
||||||
pid=$!
|
nohup ./multinode-demo/replicator.sh ${args[@]} > fullnode.log 2>&1 &
|
||||||
oom_score_adj "$pid" 1000
|
pid=\$!
|
||||||
|
oom_score_adj "\$pid" 1000
|
||||||
|
disown
|
||||||
|
EOF
|
||||||
|
~/solana/on-reboot
|
||||||
sleep 1
|
sleep 1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
|
@ -318,9 +354,9 @@ EOF
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
disown
|
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Unknown deployment method: $deployMethod"
|
echo "Unknown deployment method: $deployMethod"
|
||||||
exit 1
|
exit 1
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
|
|
@ -167,8 +167,6 @@ cloud_CreateInstances() {
|
||||||
--tags testnet
|
--tags testnet
|
||||||
--metadata "testnet=$networkName"
|
--metadata "testnet=$networkName"
|
||||||
--image "$imageName"
|
--image "$imageName"
|
||||||
--maintenance-policy TERMINATE
|
|
||||||
--no-restart-on-failure
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# shellcheck disable=SC2206 # Do not want to quote $imageName as it may contain extra args
|
# shellcheck disable=SC2206 # Do not want to quote $imageName as it may contain extra args
|
||||||
|
|
Loading…
Reference in New Issue