Add support for more than 1 client node

This commit is contained in:
Michael Vines 2018-07-23 14:49:51 -07:00
parent 7d68b6edc8
commit 9de9379925
1 changed file with 176 additions and 112 deletions

View File

@ -51,120 +51,144 @@ echo "Network entrypoint URL: $publicUrl ($publicIp)"
echo "Snap channel: $SOLANA_SNAP_CHANNEL"

# Derive a GCP-legal resource name from the entrypoint URL (dots -> dashes)
leaderName=${publicUrl//./-}

# Discovered VMs; each array element is the triple "class:vmName:vmZone"
# (was initialized twice — stale pre-image line removed)
vmlist=()
#
# vm_foreach_in_class [class] [cmd]
# where
#   class - the desired VM class to operate on
#   cmd   - the command to execute on each VM in the desired class.
#           The command will receive three arguments:
#             vmName - GCP name of the VM
#             vmZone - The GCP zone the VM is located in
#             count  - Monotonically increasing count for each
#                      invocation of cmd, starting at 1
#
vm_foreach_in_class() {
  declare class=$1
  declare cmd=$2
  declare count=1
  declare info vmClass vmName vmZone
  for info in "${vmlist[@]}"; do
    # Unpack the "class:vmName:vmZone" triple
    IFS=: read -r vmClass vmName vmZone <<<"$info"
    if [[ $class = "$vmClass" ]]; then
      # Invoke |cmd| directly rather than via eval, so its arguments are
      # not re-split or glob-expanded a second time
      "$cmd" "$vmName" "$vmZone" "$count"
      count=$((count + 1))
    fi
  done
}
# findVms [class] [filter]
# Find all RUNNING VMs matching the gcloud |filter| expression and append
# them to the global vmlist array tagged with |class|.
# (stale pre-image lines that rebound $filter and double-appended entries
# have been removed)
findVms() {
  declare class="$1"
  declare filter="$2"
  # Print the matching instances for operator visibility
  gcloud compute instances list --filter="$filter"
  declare vmName vmZone status
  while read -r vmName vmZone status; do
    if [[ $status != RUNNING ]]; then
      echo "Warning: $vmName is not RUNNING, ignoring it."
      continue
    fi
    vmlist+=("$class:$vmName:$vmZone")
  done < <(gcloud compute instances list --filter="$filter" --format 'value(name,zone,status)')
}
# wait_for_node [pid]
# Block until the background deploy identified by |pid| completes, replay
# its captured log, and abort the entire script if the deploy failed.
wait_for_node() {
  declare nodePid=$1
  declare succeeded=true
  wait "$nodePid" || succeeded=false
  cat "log-$nodePid.txt"
  if ! $succeeded; then
    echo ^^^ +++
    exit 1
  fi
}
# Discover all VMs that make up this testnet, tagged by class.
# (stale pre-image lines removed: old one-argument findVms calls, the
# removed single-client $clientVm bookkeeping, and the old inline shutdown
# loop — shutdown is now handled below via vm_foreach_in_class/fullnode_stop)
echo "Leader node:"
findVms leader "name=$leaderName"
[[ ${#vmlist[@]} = 1 ]] || {
  echo "Unable to find $leaderName"
  exit 1
}

echo "Client node(s):"
findVms client "name~^$leaderName-client"

echo "Validator nodes:"
findVms validator "name~^$leaderName-validator-"

# Add "network stopping" datapoint
netName=${SOLANA_NET_URL%testnet.solana.com}
netName=${netName:0:8}
ci/metrics_write_datapoint.sh "testnet-deploy,name=\"$netName\" stop=1"
# gcp_vm_exec [vmName] [vmZone] [message] [cmd]
# Run |cmd| on VM |vmName| in |vmZone| over ssh, first announcing
# "|message| vmName" as a buildkite section header.
# (the truncated pre-image client_run() header that was fused onto this
# definition has been removed)
gcp_vm_exec() {
  declare vmName=$1
  declare vmZone=$2
  declare message=$3
  declare cmd=$4
  echo "--- $message $vmName in zone $vmZone"
  # Disable host key checking: these are freshly (re)created VMs whose keys
  # are not in known_hosts
  gcloud compute ssh "$vmName" --zone "$vmZone" \
    --ssh-flag="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \
    --command="$cmd"
}
# gcp_login_quota_workaround [count]
# Each |ssh| counts as a login against GCP quota limits, so periodically
# pause during bulk deployments: sleep after every 5th invocation.
# (the pre-image client_run "Shutting down" call-site residue that was
# fused onto this definition has been removed)
gcp_login_quota_workaround() {
  declare count=$1
  if [[ $((count % 5)) = 0 ]]; then
    # Slow down deployment to avoid triggering GCP login
    # quota limits (each |ssh| counts as a login)
    sleep 3
  fi
}
# client_start [vmName] [vmZone] [count]
# Install the solana snap on a client VM and launch bench-tps inside a
# detached tmux session, so it keeps running after the ssh session ends.
# If bench-tps ever exits, the session logs an error and drops to a bash
# prompt to keep the session alive for debugging.
client_start() {
declare vmName=$1
declare vmZone=$2
declare count=$3
# NOTE(review): ${#vmlist[@]} below is the count of *all* discovered VMs
# (leader + validators + clients), presumably the node count bench-tps
# should expect -- confirm against bench-tps usage
gcp_vm_exec "$vmName" "$vmZone" \
"Starting client $count:" \
"\
set -x;
sudo snap install solana --$SOLANA_SNAP_CHANNEL --devmode; \
sudo snap set solana metrics-config=$SOLANA_METRICS_CONFIG; \
snap info solana; \
tmux new -s solana -d \" \
/snap/bin/solana.bench-tps $SOLANA_NET_URL ${#vmlist[@]} --loop 2>&1 | tee /tmp/solana.log; \
echo Error: bench-tps should never exit; \
bash \
\"; \
sleep 2; \
tmux capture-pane -t solana -p -S -100; \
tail /tmp/solana.log; \
"
}
echo "--- Refreshing leader"
leader=true
pids=()
count=1
for info in "${vmlist[@]}"; do
nodePosition="($count/${#vmlist[*]})"
# client_stop [vmName] [vmZone] [count]
# Capture the bench-tps tmux session output for debugging, then tear the
# session down and remove the solana snap from the client VM.
# (stale pre-image lines that re-derived vmName/vmZone from an unset $info
# have been removed -- they clobbered the function arguments)
client_stop() {
  declare vmName=$1
  declare vmZone=$2
  declare count=$3
  gcp_vm_exec "$vmName" "$vmZone" \
    "Stopping client $count:" \
    "\
      set -x;
      tmux list-sessions; \
      tmux capture-pane -t solana -p; \
      tmux kill-session -t solana; \
      sudo snap remove solana; \
    "
}
fullnode_start() {
declare class=$1
declare vmName=$2
declare vmZone=$3
declare count=$4
(
SECONDS=0
echo "--- $vmName in zone $vmZone $nodePosition"
commonNodeConfig="\
rust-log=$RUST_LOG \
default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \
metrics-config=$SOLANA_METRICS_CONFIG \
"
if $leader; then
if [[ $class = leader ]]; then
nodeConfig="mode=leader+drone $commonNodeConfig"
if [[ -n $SOLANA_CUDA ]]; then
nodeConfig="$nodeConfig enable-cuda=1"
@ -174,9 +198,8 @@ for info in "${vmlist[@]}"; do
fi
set -x
gcloud compute ssh "$vmName" --zone "$vmZone" \
--ssh-flag="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -t" \
--command="\
gcp_vm_exec "$vmName" "$vmZone" "Starting $class $count:" \
"\
set -ex; \
logmarker='solana deploy $(date)/$RANDOM'; \
sudo snap remove solana; \
@ -190,55 +213,96 @@ for info in "${vmlist[@]}"; do
"
echo "Succeeded in ${SECONDS} seconds"
) > "log-$vmName.txt" 2>&1 &
pid=$!
declare pid=$!
# Rename log file so it can be discovered later by $pid
while [[ ! -f "log-$vmName.txt" ]]; do
sleep 1
done
mv "log-$vmName.txt" "log-$pid.txt"
if $leader; then
echo Waiting for leader...
# Wait for the leader to initialize before starting the validators
# TODO: Remove this limitation eventually.
wait_for_node "$pid"
gcp_login_quota_workaround "$count"
pids+=("$pid")
}
echo "--- Refreshing validators"
else
# Slow down deployment to ~20 machines a minute to avoid triggering GCP login
# quota limits (each |ssh| counts as a login)
sleep 3
# leader_start [vmName] [vmZone] [count]
# fullnode_start specialized for the "leader" class
leader_start() {
fullnode_start leader "$@"
}
pids+=("$pid")
fi
leader=false
count=$((count + 1))
done
# validator_start [vmName] [vmZone] [count]
# fullnode_start specialized for the "validator" class
validator_start() {
fullnode_start validator "$@"
}
echo --- Waiting for validators
for pid in "${pids[@]}"; do
wait_for_node "$pid"
done
# fullnode_stop [vmName] [vmZone] [count]
# Remove the solana snap from a full node, in the background.  All output
# is captured to a per-pid log file so wait_for_pids can replay it later.
fullnode_stop() {
declare vmName=$1
declare vmZone=$2
declare count=$3
(
SECONDS=0
gcp_vm_exec "$vmName" "$vmZone" "Shutting down" "sudo snap remove solana"
echo "Succeeded in ${SECONDS} seconds"
) > "log-$vmName.txt" 2>&1 &
declare pid=$!
# Rename log file so it can be discovered later by $pid
# (spin until the background subshell's redirection has created the file;
# the subshell's open fd follows the rename, so output is not lost)
while [[ ! -f "log-$vmName.txt" ]]; do
sleep 1
done
mv "log-$vmName.txt" "log-$pid.txt"
gcp_login_quota_workaround "$count"
pids+=("$pid")
}
# wait_for_pids [description...]
# Wait on every pid in the global |pids| array, replaying each node's
# captured per-pid log.  Aborts the whole script on the first failed pid.
wait_for_pids() {
  echo "--- Waiting for $*"
  declare p
  for p in "${pids[@]}"; do
    declare failed=false
    wait "$p" || failed=true
    cat "log-$p.txt"
    if $failed; then
      echo ^^^ +++
      exit 1
    fi
  done
}
# Stop the client nodes first so they don't keep hammering the network
# while it is being redeployed underneath them
vm_foreach_in_class client client_stop
# Unless doing a rolling update, take every full node (leader and
# validators) down before any of them are restarted
if ! $ROLLING_UPDATE; then
pids=()
echo "--- Shutting down all full nodes"
vm_foreach_in_class leader fullnode_stop
vm_foreach_in_class validator fullnode_stop
wait_for_pids fullnode shutdown
fi
# Start the leader by itself and wait for it before starting the
# validators (the leader must initialize first)
pids=()
echo --- Starting leader node
vm_foreach_in_class leader leader_start
wait_for_pids leader
pids=()
echo --- Starting validator nodes
vm_foreach_in_class validator validator_start
wait_for_pids validators
# Sanity-check the freshly deployed network, then bring the clients up.
# (stale pre-image lines removed: the old sanity invocation passing
# ${#vmlist[@]} -- which now also counts client VMs -- and the removed
# inline client_run "Starting client" block, superseded by client_start)
echo "--- $publicUrl sanity test"
(
  # Count only the full nodes (leader + validators); client VMs are also
  # in vmlist but are not part of the sanity node count
  fullnode_count=0
  inc_fullnode_count() {
    fullnode_count=$((fullnode_count + 1))
  }
  vm_foreach_in_class leader inc_fullnode_count
  vm_foreach_in_class validator inc_fullnode_count
  set -x
  USE_SNAP=1 ci/testnet-sanity.sh $publicUrl $fullnode_count
)

vm_foreach_in_class client client_start

# Add "network started" datapoint
ci/metrics_write_datapoint.sh "testnet-deploy,name=\"$netName\" start=1"