Add support for more than 1 client node
This commit is contained in:
parent
7d68b6edc8
commit
9de9379925
|
@ -51,120 +51,144 @@ echo "Network entrypoint URL: $publicUrl ($publicIp)"
|
||||||
echo "Snap channel: $SOLANA_SNAP_CHANNEL"
|
echo "Snap channel: $SOLANA_SNAP_CHANNEL"
|
||||||
|
|
||||||
leaderName=${publicUrl//./-}
|
leaderName=${publicUrl//./-}
|
||||||
vmlist=()
|
|
||||||
|
vmlist=() # Each array element is the triple "class:vmName:vmZone"
|
||||||
|
|
||||||
|
#
|
||||||
|
# vm_foreach_in_class [class] [cmd]
|
||||||
|
# where
|
||||||
|
# class - the desired VM class to operate on
|
||||||
|
# cmd - the command to execute on each VM in the desired class.
|
||||||
|
# The command will receive three arguments:
|
||||||
|
# vmName - GCP name of the VM
|
||||||
|
# vmZone - The GCP zone the VM is located in
|
||||||
|
# count - Monotonically increasing count for each
|
||||||
|
# invocation of cmd, starting at 1
|
||||||
|
#
|
||||||
|
#
|
||||||
|
vm_foreach_in_class() {
  # Invoke cmd once for every entry of the global vmlist array whose class
  # field equals the requested class.  Each invocation receives three
  # arguments: vmName, vmZone and a counter that starts at 1 and increases by
  # one per matching VM.
  declare class=$1
  declare cmd=$2

  declare count=1
  # Keep the loop variable and the parsed fields local so that iterating does
  # not overwrite same-named variables at global scope.
  declare info vmClass vmName vmZone
  for info in "${vmlist[@]}"; do
    # Each vmlist element is the triple "class:vmName:vmZone"
    IFS=: read -r vmClass vmName vmZone <<< "$info"

    if [[ $class = "$vmClass" ]]; then
      # Only $cmd itself is handed to eval for parsing.  The arguments are
      # passed as literal variable references, so their values are expanded
      # exactly once and are never word-split, globbed, or interpreted as
      # shell syntax.
      eval "$cmd"' "$vmName" "$vmZone" "$count"'
      count=$((count + 1))
    fi
  done
}
|
||||||
|
|
||||||
|
|
||||||
findVms() {
|
findVms() {
|
||||||
declare filter="$1"
|
declare class="$1"
|
||||||
|
declare filter="$2"
|
||||||
gcloud compute instances list --filter="$filter"
|
gcloud compute instances list --filter="$filter"
|
||||||
while read -r vmName vmZone status; do
|
while read -r vmName vmZone status; do
|
||||||
if [[ $status != RUNNING ]]; then
|
if [[ $status != RUNNING ]]; then
|
||||||
echo "Warning: $vmName is not RUNNING, ignoring it."
|
echo "Warning: $vmName is not RUNNING, ignoring it."
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
vmlist+=("$vmName:$vmZone")
|
vmlist+=("$class:$vmName:$vmZone")
|
||||||
done < <(gcloud compute instances list --filter="$filter" --format 'value(name,zone,status)')
|
done < <(gcloud compute instances list --filter="$filter" --format 'value(name,zone,status)')
|
||||||
}
|
}
|
||||||
|
|
||||||
wait_for_node() {
|
|
||||||
declare pid=$1
|
|
||||||
|
|
||||||
declare ok=true
|
|
||||||
wait "$pid" || ok=false
|
|
||||||
cat "log-$pid.txt"
|
|
||||||
if ! $ok; then
|
|
||||||
echo ^^^ +++
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
echo "Leader node:"
|
echo "Leader node:"
|
||||||
findVms "name=$leaderName"
|
findVms leader "name=$leaderName"
|
||||||
[[ ${#vmlist[@]} = 1 ]] || {
|
[[ ${#vmlist[@]} = 1 ]] || {
|
||||||
echo "Unable to find $leaderName"
|
echo "Unable to find $leaderName"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
echo "Client node:"
|
echo "Client node(s):"
|
||||||
findVms "name=$leaderName-client"
|
findVms client "name~^$leaderName-client"
|
||||||
clientVm=
|
|
||||||
if [[ ${#vmlist[@]} = 2 ]]; then
|
|
||||||
clientVm=${vmlist[1]}
|
|
||||||
unset 'vmlist[1]'
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Validator nodes:"
|
echo "Validator nodes:"
|
||||||
findVms "name~^$leaderName-validator-"
|
findVms validator "name~^$leaderName-validator-"
|
||||||
|
|
||||||
if ! $ROLLING_UPDATE; then
|
|
||||||
count=1
|
|
||||||
for info in "${vmlist[@]}"; do
|
|
||||||
nodePosition="($count/${#vmlist[*]})"
|
|
||||||
vmName=${info%:*}
|
|
||||||
vmZone=${info#*:}
|
|
||||||
echo "--- Shutting down $vmName in zone $vmZone $nodePosition"
|
|
||||||
gcloud compute ssh "$vmName" --zone "$vmZone" \
|
|
||||||
--ssh-flag="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \
|
|
||||||
--command="sudo snap remove solana" &
|
|
||||||
|
|
||||||
if [[ $((count % 5)) = 0 ]]; then
|
|
||||||
# Slow down deployment to avoid triggering GCP login
|
|
||||||
# quota limits (each |ssh| counts as a login)
|
|
||||||
sleep 3
|
|
||||||
fi
|
|
||||||
|
|
||||||
count=$((count + 1))
|
|
||||||
done
|
|
||||||
|
|
||||||
wait
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Add "network stopping" datapoint
|
# Add "network stopping" datapoint
|
||||||
netName=${SOLANA_NET_URL%testnet.solana.com}
|
netName=${SOLANA_NET_URL%testnet.solana.com}
|
||||||
netName=${netName:0:8}
|
netName=${netName:0:8}
|
||||||
ci/metrics_write_datapoint.sh "testnet-deploy,name=\"$netName\" stop=1"
|
ci/metrics_write_datapoint.sh "testnet-deploy,name=\"$netName\" stop=1"
|
||||||
|
|
||||||
client_run() {
|
gcp_vm_exec() {
|
||||||
declare message=$1
|
declare vmName=$1
|
||||||
declare cmd=$2
|
declare vmZone=$2
|
||||||
[[ -n $clientVm ]] || return 0;
|
declare message=$3
|
||||||
vmName=${clientVm%:*}
|
declare cmd=$4
|
||||||
vmZone=${clientVm#*:}
|
|
||||||
echo "--- $message $vmName in zone $vmZone"
|
echo "--- $message $vmName in zone $vmZone"
|
||||||
gcloud compute ssh "$vmName" --zone "$vmZone" \
|
gcloud compute ssh "$vmName" --zone "$vmZone" \
|
||||||
--ssh-flag="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \
|
--ssh-flag="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \
|
||||||
--command="$cmd"
|
--command="$cmd"
|
||||||
}
|
}
|
||||||
|
|
||||||
client_run \
|
gcp_login_quota_workaround() {
|
||||||
"Shutting down" \
|
declare count=$1
|
||||||
"\
|
|
||||||
set -x;
|
if [[ $((count % 5)) = 0 ]]; then
|
||||||
tmux list-sessions; \
|
# Slow down deployment to avoid triggering GCP login
|
||||||
tmux capture-pane -t solana -p; \
|
# quota limits (each |ssh| counts as a login)
|
||||||
tmux kill-session -t solana; \
|
sleep 3
|
||||||
sudo snap remove solana; \
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
client_start() {
|
||||||
|
declare vmName=$1
|
||||||
|
declare vmZone=$2
|
||||||
|
declare count=$3
|
||||||
|
|
||||||
|
gcp_vm_exec "$vmName" "$vmZone" \
|
||||||
|
"Starting client $count:" \
|
||||||
|
"\
|
||||||
|
set -x;
|
||||||
|
sudo snap install solana --$SOLANA_SNAP_CHANNEL --devmode; \
|
||||||
|
sudo snap set solana metrics-config=$SOLANA_METRICS_CONFIG; \
|
||||||
|
snap info solana; \
|
||||||
|
tmux new -s solana -d \" \
|
||||||
|
/snap/bin/solana.bench-tps $SOLANA_NET_URL ${#vmlist[@]} --loop 2>&1 | tee /tmp/solana.log; \
|
||||||
|
echo Error: bench-tps should never exit; \
|
||||||
|
bash \
|
||||||
|
\"; \
|
||||||
|
sleep 2; \
|
||||||
|
tmux capture-pane -t solana -p -S -100; \
|
||||||
|
tail /tmp/solana.log; \
|
||||||
"
|
"
|
||||||
|
}
|
||||||
|
|
||||||
echo "--- Refreshing leader"
|
client_stop() {
|
||||||
leader=true
|
declare vmName=$1
|
||||||
pids=()
|
declare vmZone=$2
|
||||||
count=1
|
declare count=$3
|
||||||
for info in "${vmlist[@]}"; do
|
|
||||||
nodePosition="($count/${#vmlist[*]})"
|
|
||||||
|
|
||||||
vmName=${info%:*}
|
gcp_vm_exec "$vmName" "$vmZone" \
|
||||||
vmZone=${info#*:}
|
"Stopping client $count:" \
|
||||||
echo "Starting refresh for $vmName $nodePosition"
|
"\
|
||||||
|
set -x;
|
||||||
|
tmux list-sessions; \
|
||||||
|
tmux capture-pane -t solana -p; \
|
||||||
|
tmux kill-session -t solana; \
|
||||||
|
sudo snap remove solana; \
|
||||||
|
"
|
||||||
|
}
|
||||||
|
|
||||||
|
fullnode_start() {
|
||||||
|
declare class=$1
|
||||||
|
declare vmName=$2
|
||||||
|
declare vmZone=$3
|
||||||
|
declare count=$4
|
||||||
|
|
||||||
(
|
(
|
||||||
SECONDS=0
|
SECONDS=0
|
||||||
echo "--- $vmName in zone $vmZone $nodePosition"
|
|
||||||
commonNodeConfig="\
|
commonNodeConfig="\
|
||||||
rust-log=$RUST_LOG \
|
rust-log=$RUST_LOG \
|
||||||
default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \
|
default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \
|
||||||
metrics-config=$SOLANA_METRICS_CONFIG \
|
metrics-config=$SOLANA_METRICS_CONFIG \
|
||||||
"
|
"
|
||||||
if $leader; then
|
if [[ $class = leader ]]; then
|
||||||
nodeConfig="mode=leader+drone $commonNodeConfig"
|
nodeConfig="mode=leader+drone $commonNodeConfig"
|
||||||
if [[ -n $SOLANA_CUDA ]]; then
|
if [[ -n $SOLANA_CUDA ]]; then
|
||||||
nodeConfig="$nodeConfig enable-cuda=1"
|
nodeConfig="$nodeConfig enable-cuda=1"
|
||||||
|
@ -174,9 +198,8 @@ for info in "${vmlist[@]}"; do
|
||||||
fi
|
fi
|
||||||
|
|
||||||
set -x
|
set -x
|
||||||
gcloud compute ssh "$vmName" --zone "$vmZone" \
|
gcp_vm_exec "$vmName" "$vmZone" "Starting $class $count:" \
|
||||||
--ssh-flag="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -t" \
|
"\
|
||||||
--command="\
|
|
||||||
set -ex; \
|
set -ex; \
|
||||||
logmarker='solana deploy $(date)/$RANDOM'; \
|
logmarker='solana deploy $(date)/$RANDOM'; \
|
||||||
sudo snap remove solana; \
|
sudo snap remove solana; \
|
||||||
|
@ -190,55 +213,96 @@ for info in "${vmlist[@]}"; do
|
||||||
"
|
"
|
||||||
echo "Succeeded in ${SECONDS} seconds"
|
echo "Succeeded in ${SECONDS} seconds"
|
||||||
) > "log-$vmName.txt" 2>&1 &
|
) > "log-$vmName.txt" 2>&1 &
|
||||||
pid=$!
|
declare pid=$!
|
||||||
|
|
||||||
# Rename log file so it can be discovered later by $pid
|
# Rename log file so it can be discovered later by $pid
|
||||||
|
while [[ ! -f "log-$vmName.txt" ]]; do
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
mv "log-$vmName.txt" "log-$pid.txt"
|
mv "log-$vmName.txt" "log-$pid.txt"
|
||||||
|
|
||||||
if $leader; then
|
gcp_login_quota_workaround "$count"
|
||||||
echo Waiting for leader...
|
pids+=("$pid")
|
||||||
# Wait for the leader to initialize before starting the validators
|
}
|
||||||
# TODO: Remove this limitation eventually.
|
|
||||||
wait_for_node "$pid"
|
|
||||||
|
|
||||||
echo "--- Refreshing validators"
|
leader_start() {
|
||||||
else
|
fullnode_start leader "$@"
|
||||||
# Slow down deployment to ~20 machines a minute to avoid triggering GCP login
|
}
|
||||||
# quota limits (each |ssh| counts as a login)
|
|
||||||
sleep 3
|
|
||||||
|
|
||||||
pids+=("$pid")
|
validator_start() {
|
||||||
fi
|
fullnode_start validator "$@"
|
||||||
leader=false
|
}
|
||||||
count=$((count + 1))
|
|
||||||
done
|
|
||||||
|
|
||||||
echo --- Waiting for validators
|
fullnode_stop() {
|
||||||
for pid in "${pids[@]}"; do
|
declare vmName=$1
|
||||||
wait_for_node "$pid"
|
declare vmZone=$2
|
||||||
done
|
declare count=$3
|
||||||
|
|
||||||
|
(
|
||||||
|
SECONDS=0
|
||||||
|
gcp_vm_exec "$vmName" "$vmZone" "Shutting down" "sudo snap remove solana"
|
||||||
|
echo "Succeeded in ${SECONDS} seconds"
|
||||||
|
) > "log-$vmName.txt" 2>&1 &
|
||||||
|
declare pid=$!
|
||||||
|
|
||||||
|
# Rename log file so it can be discovered later by $pid
|
||||||
|
while [[ ! -f "log-$vmName.txt" ]]; do
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
mv "log-$vmName.txt" "log-$pid.txt"
|
||||||
|
|
||||||
|
gcp_login_quota_workaround "$count"
|
||||||
|
pids+=("$pid")
|
||||||
|
}
|
||||||
|
|
||||||
|
wait_for_pids() {
  # Wait, in order, for every background job whose pid is stored in the global
  # pids array, printing each job's captured log file (log-<pid>.txt in the
  # current directory) once the job has finished.  All arguments are used only
  # as the description in the "--- Waiting for ..." heading.
  #
  # Exits the whole script with status 1 as soon as one job is found to have
  # failed; its log is still printed first.
  echo "--- Waiting for $*"

  # Local loop variable: iterating must not clobber a caller's $pid.
  declare pid
  for pid in "${pids[@]}"; do
    declare ok=true
    wait "$pid" || ok=false

    # Show the log regardless of the outcome so failures can be diagnosed.
    cat "log-$pid.txt"

    if [[ $ok != true ]]; then
      # NOTE(review): presumably a CI log marker that expands the preceding
      # "---" group on failure -- confirm against the CI system in use.
      echo '^^^ +++'
      exit 1
    fi
  done
}
|
||||||
|
|
||||||
|
|
||||||
|
vm_foreach_in_class client client_stop
|
||||||
|
|
||||||
|
if ! $ROLLING_UPDATE; then
|
||||||
|
pids=()
|
||||||
|
echo "--- Shutting down all full nodes"
|
||||||
|
vm_foreach_in_class leader fullnode_stop
|
||||||
|
vm_foreach_in_class validator fullnode_stop
|
||||||
|
wait_for_pids fullnode shutdown
|
||||||
|
fi
|
||||||
|
|
||||||
|
pids=()
|
||||||
|
echo --- Starting leader node
|
||||||
|
vm_foreach_in_class leader leader_start
|
||||||
|
wait_for_pids leader
|
||||||
|
|
||||||
|
pids=()
|
||||||
|
echo --- Starting validator nodes
|
||||||
|
vm_foreach_in_class validator validator_start
|
||||||
|
wait_for_pids validators
|
||||||
|
|
||||||
echo "--- $publicUrl sanity test"
|
echo "--- $publicUrl sanity test"
|
||||||
(
|
(
|
||||||
|
fullnode_count=0
|
||||||
|
inc_fullnode_count() {
  # Callback for vm_foreach_in_class: add one to the fullnode_count tally.
  # Any arguments passed to the callback are ignored.
  : "$((fullnode_count += 1))"
}
|
||||||
|
vm_foreach_in_class leader inc_fullnode_count
|
||||||
|
vm_foreach_in_class validator inc_fullnode_count
|
||||||
|
|
||||||
set -x
|
set -x
|
||||||
USE_SNAP=1 ci/testnet-sanity.sh $publicUrl ${#vmlist[@]}
|
USE_SNAP=1 ci/testnet-sanity.sh $publicUrl $fullnode_count
|
||||||
)
|
)
|
||||||
|
|
||||||
client_run \
|
vm_foreach_in_class client client_start
|
||||||
"Starting client on " \
|
|
||||||
"\
|
|
||||||
set -x;
|
|
||||||
sudo snap install solana --$SOLANA_SNAP_CHANNEL --devmode; \
|
|
||||||
sudo snap set solana metrics-config=$SOLANA_METRICS_CONFIG; \
|
|
||||||
snap info solana; \
|
|
||||||
tmux new -s solana -d \" \
|
|
||||||
/snap/bin/solana.bench-tps $SOLANA_NET_URL ${#vmlist[@]} --loop 2>&1 | tee /tmp/solana.log; \
|
|
||||||
echo Error: bench-tps should never exit; \
|
|
||||||
bash \
|
|
||||||
\"; \
|
|
||||||
sleep 2; \
|
|
||||||
tmux capture-pane -t solana -p -S -100; \
|
|
||||||
tail /tmp/solana.log; \
|
|
||||||
"
|
|
||||||
|
|
||||||
# Add "network started" datapoint
|
# Add "network started" datapoint
|
||||||
ci/metrics_write_datapoint.sh "testnet-deploy,name=\"$netName\" start=1"
|
ci/metrics_write_datapoint.sh "testnet-deploy,name=\"$netName\" start=1"
|
||||||
|
|
Loading…
Reference in New Issue