Plumb pre-emptibility and associated overrides into colo allocation and automated testing (#8754)
automerge
This commit is contained in:
parent
496999beba
commit
595c96b262
64
net/gce.sh
64
net/gce.sh
|
@ -61,7 +61,6 @@ colo)
|
|||
;;
|
||||
esac
|
||||
|
||||
|
||||
prefix=testnet-dev-${USER//[^A-Za-z0-9]/}
|
||||
additionalValidatorCount=2
|
||||
clientNodeCount=0
|
||||
|
@ -154,6 +153,7 @@ Manage testnet instances
|
|||
(by default preemptible instances are used to reduce
|
||||
cost). Note that the bootstrap validator, archiver,
|
||||
blockstreamer and client nodes are always dedicated.
|
||||
Set this flag on colo to prevent your testnet from being pre-empted by nightly test automation.
|
||||
--self-destruct-hours [number]
|
||||
- Specify lifetime of the allocated instances in hours. 0 to
|
||||
disable. Only supported on GCE. (default: $selfDestructHours)
|
||||
|
@ -162,7 +162,11 @@ Manage testnet instances
|
|||
-P - Use public network IP addresses (default: $publicNetwork)
|
||||
|
||||
delete-specific options:
|
||||
none
|
||||
--reclaim-preemptible-reservations
|
||||
- If set, reclaims all reservations on colo nodes that were not created with --dedicated.
|
||||
This behavior does not filter by testnet name or owner. Only implemented on colo.
|
||||
--reclaim-all-reservations
|
||||
- If set, reclaims all reservations on all colo nodes, regardless of owner, pre-emptibility, or creator.
|
||||
|
||||
info-specific options:
|
||||
--eval - Output in a form that can be eval-ed by a shell: eval $(gce.sh info)
|
||||
|
@ -215,6 +219,12 @@ while [[ -n $1 ]]; do
|
|||
usage 1
|
||||
fi
|
||||
shift 2
|
||||
elif [[ $1 == --reclaim-preemptible-reservations ]]; then
|
||||
reclaimOnlyPreemptibleReservations=true
|
||||
shift
|
||||
elif [[ $1 == --reclaim-all-reservations ]]; then
|
||||
reclaimAllReservations=true
|
||||
shift
|
||||
else
|
||||
usage "Unknown long option: $1"
|
||||
fi
|
||||
|
@ -308,6 +318,26 @@ ec2|azure|colo)
|
|||
;;
|
||||
esac
|
||||
|
||||
case $cloudProvider in
|
||||
gce | ec2 | azure)
|
||||
maybePreemptible="never preemptible"
|
||||
;;
|
||||
colo)
|
||||
maybePreemptible=$preemptible
|
||||
;;
|
||||
*)
|
||||
echo "Error: Unknown cloud provider: $cloudProvider"
|
||||
;;
|
||||
esac
|
||||
|
||||
if [[ $reclaimOnlyPreemptibleReservations == "true" && $reclaimAllReservations == "true" ]]; then
|
||||
usage "Cannot set both --reclaim-preemptible-reservations and --reclaim-all-reservations. Set one or none"
|
||||
fi
|
||||
|
||||
if [[ -n $reclaimAllReservations || -n $reclaimOnlyPreemptibleReservations ]]; then
|
||||
forceDelete="true"
|
||||
fi
|
||||
|
||||
# cloud_ForEachInstance [cmd] [extra args to cmd]
|
||||
#
|
||||
# Execute a command for each element in the `instances` array
|
||||
|
@ -594,16 +624,30 @@ EOF
|
|||
delete() {
|
||||
$metricsWriteDatapoint "testnet-deploy net-delete-begin=1"
|
||||
|
||||
# Filter for all nodes
|
||||
filter="$prefix-"
|
||||
case $cloudProvider in
|
||||
gce | ec2 | azure)
|
||||
# Filter for all nodes
|
||||
filter="$prefix-"
|
||||
;;
|
||||
colo)
|
||||
if [[ -n $forceDelete ]]; then
|
||||
filter=".*-"
|
||||
else
|
||||
filter="$prefix-"
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
echo "Error: Unknown cloud provider: $cloudProvider"
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "Searching for instances: $filter"
|
||||
cloud_FindInstances "$filter"
|
||||
cloud_FindInstances "$filter" "$reclaimOnlyPreemptibleReservations"
|
||||
|
||||
if [[ ${#instances[@]} -eq 0 ]]; then
|
||||
echo "No instances found matching '$filter'"
|
||||
else
|
||||
cloud_DeleteInstances true &
|
||||
cloud_DeleteInstances $forceDelete
|
||||
fi
|
||||
|
||||
wait
|
||||
|
@ -817,7 +861,7 @@ EOF
|
|||
cloud_CreateInstances "$prefix" "$prefix-bootstrap-validator" 1 \
|
||||
"$enableGpu" "$bootstrapLeaderMachineType" "${zones[0]}" "$validatorBootDiskSizeInGb" \
|
||||
"$startupScript" "$bootstrapLeaderAddress" "$bootDiskType" "$validatorAdditionalDiskSizeInGb" \
|
||||
"never preemptible" "$sshPrivateKey"
|
||||
"$maybePreemptible" "$sshPrivateKey"
|
||||
fi
|
||||
|
||||
if [[ $additionalValidatorCount -gt 0 ]]; then
|
||||
|
@ -847,19 +891,19 @@ EOF
|
|||
if [[ $clientNodeCount -gt 0 ]]; then
|
||||
cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
|
||||
"$enableGpu" "$clientMachineType" "${zones[0]}" "$clientBootDiskSizeInGb" \
|
||||
"$startupScript" "" "$bootDiskType" "" "never preemptible" "$sshPrivateKey"
|
||||
"$startupScript" "" "$bootDiskType" "" "$maybePreemptible" "$sshPrivateKey"
|
||||
fi
|
||||
|
||||
if $blockstreamer; then
|
||||
cloud_CreateInstances "$prefix" "$prefix-blockstreamer" "1" \
|
||||
"$enableGpu" "$blockstreamerMachineType" "${zones[0]}" "$validatorBootDiskSizeInGb" \
|
||||
"$startupScript" "$blockstreamerAddress" "$bootDiskType" "" "$sshPrivateKey"
|
||||
"$startupScript" "$blockstreamerAddress" "$bootDiskType" "" "$maybePreemptible" "$sshPrivateKey"
|
||||
fi
|
||||
|
||||
if [[ $archiverNodeCount -gt 0 ]]; then
|
||||
cloud_CreateInstances "$prefix" "$prefix-archiver" "$archiverNodeCount" \
|
||||
false "$archiverMachineType" "${zones[0]}" "$archiverBootDiskSizeInGb" \
|
||||
"$startupScript" "" "" "" "never preemptible" "$sshPrivateKey"
|
||||
"$startupScript" "" "" "" "$maybePreemptible" "$sshPrivateKey"
|
||||
fi
|
||||
|
||||
$metricsWriteDatapoint "testnet-deploy net-create-complete=1"
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
# These variables must be set before the main body is called
|
||||
SOLANA_LOCK_FILE="${SOLANA_LOCK_FILE:?}"
|
||||
INSTANCE_NAME="${INSTANCE_NAME:?}"
|
||||
PREEMPTIBLE="${PREEMPTIBLE:?}"
|
||||
SSH_AUTHORIZED_KEYS="${SSH_AUTHORIZED_KEYS:?}"
|
||||
SSH_PRIVATE_KEY_TEXT="${SSH_PRIVATE_KEY_TEXT:?}"
|
||||
SSH_PUBLIC_KEY_TEXT="${SSH_PUBLIC_KEY_TEXT:?}"
|
||||
|
@ -16,6 +17,7 @@ if [[ ! -f "${SOLANA_LOCK_FILE}" ]]; then
|
|||
{
|
||||
echo "export SOLANA_LOCK_USER=${SOLANA_USER}"
|
||||
echo "export SOLANA_LOCK_INSTANCENAME=${INSTANCE_NAME}"
|
||||
echo "export PREEMPTIBLE=${PREEMPTIBLE}"
|
||||
echo "[[ -v SSH_TTY && -f \"${HOME}/.solana-motd\" ]] && cat \"${HOME}/.solana-motd\" 1>&2"
|
||||
} >&9
|
||||
exec 9>&-
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
SOLANA_LOCK_FILE="${SOLANA_LOCK_FILE:?}"
|
||||
SECONDARY_DISK_MOUNT_POINT="${SECONDARY_DISK_MOUNT_POINT:?}"
|
||||
SSH_AUTHORIZED_KEYS="${SSH_AUTHORIZED_KEYS:?}"
|
||||
FORCE_DELETE="${FORCE_DELETE}"
|
||||
|
||||
RC=false
|
||||
if [[ -f "${SOLANA_LOCK_FILE}" ]]; then
|
||||
|
@ -11,7 +12,7 @@ if [[ -f "${SOLANA_LOCK_FILE}" ]]; then
|
|||
flock -x -n 9 || ( echo "Failed to acquire lock!" 1>&2 && exit 1 )
|
||||
# shellcheck disable=SC1090
|
||||
. "${SOLANA_LOCK_FILE}"
|
||||
if [[ "${SOLANA_LOCK_USER}" = "${SOLANA_USER}" ]]; then
|
||||
if [[ "${SOLANA_LOCK_USER}" = "${SOLANA_USER}" || -n "${FORCE_DELETE}" ]]; then
|
||||
# Begin running process cleanup
|
||||
CLEANUP_PID=$$
|
||||
CLEANUP_PIDS=()
|
||||
|
|
|
@ -39,6 +39,7 @@ cloud_RestartPreemptedInstances() {
|
|||
__cloud_FindInstances() {
|
||||
declare HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME INSTANCES_TEXT
|
||||
declare filter=${1}
|
||||
declare onlyPreemptible=${2}
|
||||
instances=()
|
||||
|
||||
if ! ${COLO_PARALLELIZE}; then
|
||||
|
@ -47,10 +48,14 @@ __cloud_FindInstances() {
|
|||
fi
|
||||
INSTANCES_TEXT="$(
|
||||
for AVAIL in "${COLO_RES_AVAILABILITY[@]}"; do
|
||||
IFS=$'\v' read -r HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME <<<"${AVAIL}"
|
||||
IFS=$'\v' read -r HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME PREEMPTIBLE <<<"${AVAIL}"
|
||||
if [[ ${INSTNAME} =~ ${filter} ]]; then
|
||||
printf "%-40s | publicIp=%-16s privateIp=%s zone=%s\n" "${INSTNAME}" "${IP}" "${PRIV_IP}" "${ZONE}" 1>&2
|
||||
echo -e "${INSTNAME}:${IP}:${PRIV_IP}:${ZONE}"
|
||||
if [[ -n $onlyPreemptible && $PREEMPTIBLE == "false" ]]; then
|
||||
continue
|
||||
else
|
||||
printf "%-40s | publicIp=%-16s privateIp=%s zone=%s preemptible=%s\n" "${INSTNAME}" "${IP}" "${PRIV_IP}" "${ZONE}" "${PREEMPTIBLE}" 1>&2
|
||||
echo -e "${INSTNAME}:${IP}:${PRIV_IP}:${ZONE}"
|
||||
fi
|
||||
fi
|
||||
done | sort -t $'\v' -k1
|
||||
)"
|
||||
|
@ -77,7 +82,8 @@ __cloud_FindInstances() {
|
|||
#
|
||||
cloud_FindInstances() {
|
||||
declare filter="^${1}.*"
|
||||
__cloud_FindInstances "${filter}"
|
||||
declare onlyPreemptible="${2}"
|
||||
__cloud_FindInstances "${filter}" "${onlyPreemptible}"
|
||||
}
|
||||
|
||||
#
|
||||
|
@ -96,7 +102,8 @@ cloud_FindInstances() {
|
|||
#
|
||||
cloud_FindInstance() {
|
||||
declare name="^${1}$"
|
||||
__cloud_FindInstances "${name}"
|
||||
declare onlyPreemptible="${2}"
|
||||
__cloud_FindInstances "${name}" "${onlyPreemptible}"
|
||||
}
|
||||
|
||||
#
|
||||
|
@ -155,7 +162,7 @@ cloud_CreateInstances() {
|
|||
#declare optionalAddress="${9}" # unused
|
||||
#declare optionalBootDiskType="${10}" # unused
|
||||
#declare optionalAdditionalDiskSize="${11}" # unused
|
||||
#declare optionalPreemptible="${12}" # unused
|
||||
declare optionalPreemptible="${12}"
|
||||
declare sshPrivateKey="${13}"
|
||||
|
||||
declare -a nodes
|
||||
|
@ -213,7 +220,7 @@ cloud_CreateInstances() {
|
|||
RES_MACH="${COLO_RES_MACHINE[${RI}]}"
|
||||
IP="${COLO_RES_IP[${RI}]}"
|
||||
if colo_machine_types_compatible "${RES_MACH}" "${machineType}"; then
|
||||
if colo_node_requisition "${IP}" "${node}" "${sshPrivateKey}" >/dev/null; then
|
||||
if colo_node_requisition "${IP}" "${node}" "${sshPrivateKey}" "${optionalPreemptible}" >/dev/null; then
|
||||
NI=$((NI+1))
|
||||
fi
|
||||
fi
|
||||
|
@ -228,10 +235,11 @@ cloud_CreateInstances() {
|
|||
# Deletes all the instances listed in the `instances` array
|
||||
#
|
||||
cloud_DeleteInstances() {
|
||||
declare forceDelete="${1}"
|
||||
declare _ IP _ _
|
||||
for instance in "${instances[@]}"; do
|
||||
IFS=':' read -r _ IP _ _ <<< "${instance}"
|
||||
colo_node_free "${IP}" >/dev/null
|
||||
colo_node_free "${IP}" "${forceDelete}" >/dev/null
|
||||
done
|
||||
}
|
||||
|
||||
|
@ -270,13 +278,13 @@ cloud_FetchFile() {
|
|||
}
|
||||
|
||||
cloud_StatusAll() {
|
||||
declare HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME
|
||||
declare HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME PREEMPTIBLE
|
||||
if ! ${COLO_PARALLELIZE}; then
|
||||
colo_load_resources
|
||||
colo_load_availability false
|
||||
fi
|
||||
for AVAIL in "${COLO_RES_AVAILABILITY[@]}"; do
|
||||
IFS=$'\v' read -r HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME <<<"${AVAIL}"
|
||||
printf "%-30s | publicIp=%-16s privateIp=%s status=%s who=%s zone=%s inst=%s\n" "${HOST_NAME}" "${IP}" "${PRIV_IP}" "${STATUS}" "${LOCK_USER}" "${ZONE}" "${INSTNAME}"
|
||||
IFS=$'\v' read -r HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME PREEMPTIBLE <<<"${AVAIL}"
|
||||
printf "%-30s | publicIp=%-16s privateIp=%s status=%s who=%s zone=%s inst=%s preemptible=%s\n" "${HOST_NAME}" "${IP}" "${PRIV_IP}" "${STATUS}" "${LOCK_USER}" "${ZONE}" "${INSTNAME}" "${PREEMPTIBLE}"
|
||||
done
|
||||
}
|
||||
|
|
|
@ -46,17 +46,17 @@ declare COLO_RES_AVAILABILITY_CACHED=false
|
|||
declare -ax COLO_RES_AVAILABILITY
|
||||
colo_load_availability() {
|
||||
declare USE_CACHE=${1:-${COLO_RES_AVAILABILITY_CACHED}}
|
||||
declare LINE PRIV_IP STATUS LOCK_USER I IP HOST_NAME ZONE INSTNAME
|
||||
declare LINE PRIV_IP STATUS LOCK_USER I IP HOST_NAME ZONE INSTNAME PREEMPTIBLE
|
||||
if ! ${USE_CACHE}; then
|
||||
COLO_RES_AVAILABILITY=()
|
||||
COLO_RES_REQUISITIONED=()
|
||||
while read -r LINE; do
|
||||
IFS=$'\v' read -r IP STATUS LOCK_USER INSTNAME <<< "${LINE}"
|
||||
IFS=$'\v' read -r IP STATUS LOCK_USER INSTNAME PREEMPTIBLE <<< "${LINE}"
|
||||
I=$(colo_res_index_from_ip "${IP}")
|
||||
PRIV_IP="${COLO_RES_IP_PRIV[${I}]}"
|
||||
HOST_NAME="${COLO_RES_HOSTNAME[${I}]}"
|
||||
ZONE="${COLO_RES_ZONE[${I}]}"
|
||||
COLO_RES_AVAILABILITY+=( "$(echo -e "${HOST_NAME}\v${IP}\v${PRIV_IP}\v${STATUS}\v${ZONE}\v${LOCK_USER}\v${INSTNAME}")" )
|
||||
COLO_RES_AVAILABILITY+=( "$(echo -e "${HOST_NAME}\v${IP}\v${PRIV_IP}\v${STATUS}\v${ZONE}\v${LOCK_USER}\v${INSTNAME}\v${PREEMPTIBLE}")" )
|
||||
done < <(colo_node_status_all | sort -t $'\v' -k1)
|
||||
COLO_RES_AVAILABILITY_CACHED=true
|
||||
fi
|
||||
|
@ -142,15 +142,15 @@ __colo_node_status_script() {
|
|||
# the time due to ${SOLANA_LOCK_FILE} not existing and is running from a
|
||||
# subshell where normal redirection doesn't work
|
||||
exec 9<"${SOLANA_LOCK_FILE}" && flock -s 9 && . "${SOLANA_LOCK_FILE}" && exec 9>&-
|
||||
echo -e "\${SOLANA_LOCK_USER}\\v\${SOLANA_LOCK_INSTANCENAME}\\vEOL"
|
||||
echo -e "\${SOLANA_LOCK_USER}\\v\${SOLANA_LOCK_INSTANCENAME}\\v\${PREEMPTIBLE}\\vEOL"
|
||||
exec 2>&3 # Restore stderr
|
||||
EOF
|
||||
}
|
||||
|
||||
__colo_node_status_result_normalize() {
|
||||
declare IP RC US BY INSTNAME EOL
|
||||
declare IP RC US BY INSTNAME PREEMPTIBLE EOL
|
||||
declare ST="DOWN"
|
||||
IFS=$'\v' read -r IP RC US INSTNAME EOL <<< "${1}"
|
||||
IFS=$'\v' read -r IP RC US INSTNAME PREEMPTIBLE EOL <<< "${1}"
|
||||
if [ "${RC}" -eq 0 ]; then
|
||||
[[ "${EOL}" = "EOL" ]] || echo "${FUNCNAME[0]}: Unexpected input \"${1}\"" 1>&2
|
||||
if [ -n "${US}" ]; then
|
||||
|
@ -163,7 +163,7 @@ __colo_node_status_result_normalize() {
|
|||
ST="FREE"
|
||||
fi
|
||||
fi
|
||||
echo -e $"${IP}\v${ST}\v${BY}\v${INSTNAME}"
|
||||
echo -e $"${IP}\v${ST}\v${BY}\v${INSTNAME}\v${PREEMPTIBLE}"
|
||||
}
|
||||
|
||||
colo_node_status() {
|
||||
|
@ -188,6 +188,7 @@ colo_node_requisition() {
|
|||
declare INSTANCE_NAME=${2}
|
||||
# shellcheck disable=SC2034
|
||||
declare SSH_PRIVATE_KEY="${3}"
|
||||
declare PREEMPTIBLE="${4}"
|
||||
|
||||
declare INDEX
|
||||
INDEX=$(colo_res_index_from_ip "${IP}")
|
||||
|
@ -196,6 +197,7 @@ colo_node_requisition() {
|
|||
colo_instance_run "${IP}" "$(cat <<EOF
|
||||
SOLANA_LOCK_FILE="${SOLANA_LOCK_FILE}"
|
||||
INSTANCE_NAME="${INSTANCE_NAME}"
|
||||
PREEMPTIBLE="${PREEMPTIBLE}"
|
||||
SSH_AUTHORIZED_KEYS='$("${__colo_here}"/add-datacenter-solana-user-authorized_keys.sh 2> /dev/null)'
|
||||
SSH_PRIVATE_KEY_TEXT="$(<"${SSH_PRIVATE_KEY}")"
|
||||
SSH_PUBLIC_KEY_TEXT="$(<"${SSH_PRIVATE_KEY}.pub")"
|
||||
|
@ -238,10 +240,12 @@ colo_machine_types_compatible() {
|
|||
|
||||
colo_node_free() {
|
||||
declare IP=${1}
|
||||
declare FORCE_DELETE=${2}
|
||||
colo_instance_run "${IP}" "$(cat <<EOF
|
||||
SOLANA_LOCK_FILE="${SOLANA_LOCK_FILE}"
|
||||
SECONDARY_DISK_MOUNT_POINT="${SECONDARY_DISK_MOUNT_POINT}"
|
||||
SSH_AUTHORIZED_KEYS='$("${__colo_here}"/add-datacenter-solana-user-authorized_keys.sh 2> /dev/null)'
|
||||
FORCE_DELETE="${FORCE_DELETE}"
|
||||
$(<"${__colo_here}"/colo-node-onfree.sh)
|
||||
EOF
|
||||
)"
|
||||
|
|
|
@ -160,6 +160,7 @@ function launchTestnet() {
|
|||
${ADDITIONAL_FLAGS[@]/#/" "}
|
||||
;;
|
||||
colo)
|
||||
net/colo.sh delete --reclaim-preemptible-reservations
|
||||
# shellcheck disable=SC2068
|
||||
# shellcheck disable=SC2086
|
||||
net/colo.sh create \
|
||||
|
|
Loading…
Reference in New Issue