From 595c96b26221c735bfb5654973616887755be10b Mon Sep 17 00:00:00 2001 From: Dan Albert Date: Tue, 10 Mar 2020 11:25:44 -0700 Subject: [PATCH] Plumb pre-emptibility and associated overrides into colo allocation and automated testing (#8754) automerge --- net/gce.sh | 64 +++++++++++++++++++++++++----- net/scripts/colo-node-onacquire.sh | 2 + net/scripts/colo-node-onfree.sh | 3 +- net/scripts/colo-provider.sh | 30 +++++++++----- net/scripts/colo-utils.sh | 18 +++++---- system-test/testnet-automation.sh | 1 + 6 files changed, 89 insertions(+), 29 deletions(-) diff --git a/net/gce.sh b/net/gce.sh index a46950fb9..a9dbcafbf 100755 --- a/net/gce.sh +++ b/net/gce.sh @@ -61,7 +61,6 @@ colo) ;; esac - prefix=testnet-dev-${USER//[^A-Za-z0-9]/} additionalValidatorCount=2 clientNodeCount=0 @@ -154,6 +153,7 @@ Manage testnet instances (by default preemptible instances are used to reduce cost). Note that the bootstrap validator, archiver, blockstreamer and client nodes are always dedicated. + Set this flag on colo to prevent your testnet from being pre-empted by nightly test automation. --self-destruct-hours [number] - Specify lifetime of the allocated instances in hours. 0 to disable. Only supported on GCE. (default: $selfDestructHours) @@ -162,7 +162,11 @@ Manage testnet instances -P - Use public network IP addresses (default: $publicNetwork) delete-specific options: - none + --reclaim-preemptible-reservations + - If set, reclaims all reservations on colo nodes that were not created with --dedicated. + This behavior does not filter by testnet name or owner. Only implemented on colo. + --reclaim-all-reservations + - If set, reclaims all reservations on all colo nodes, regardless of owner, pre-emptibility, or creator. info-specific options: --eval - Output in a form that can be eval-ed by a shell: eval $(gce.sh info) @@ -215,6 +219,12 @@ while [[ -n $1 ]]; do usage 1 fi shift 2 + elif [[ $1 == --reclaim-preemptible-reservations ]]; then + reclaimOnlyPreemptibleReservations=true + shift + elif [[ $1 == --reclaim-all-reservations ]]; then + reclaimAllReservations=true + shift else usage "Unknown long option: $1" fi @@ -308,6 +318,26 @@ ec2|azure|colo) ;; esac +case $cloudProvider in + gce | ec2 | azure) + maybePreemptible="never preemptible" + ;; + colo) + maybePreemptible=$preemptible + ;; + *) + echo "Error: Unknown cloud provider: $cloudProvider" + ;; +esac + +if [[ $reclaimOnlyPreemptibleReservations == "true" && $reclaimAllReservations == "true" ]]; then + usage "Cannot set both --reclaim-preemptible-reservations and --reclaim-all-reservations. Set one or none" +fi + +if [[ -n $reclaimAllReservations || -n $reclaimOnlyPreemptibleReservations ]]; then + forceDelete="true" +fi + # cloud_ForEachInstance [cmd] [extra args to cmd] # # Execute a command for each element in the `instances` array @@ -594,16 +624,30 @@ EOF delete() { $metricsWriteDatapoint "testnet-deploy net-delete-begin=1" - # Filter for all nodes - filter="$prefix-" + case $cloudProvider in + gce | ec2 | azure) + # Filter for all nodes + filter="$prefix-" + ;; + colo) + if [[ -n $forceDelete ]]; then + filter=".*-" + else + filter="$prefix-" + fi + ;; + *) + echo "Error: Unknown cloud provider: $cloudProvider" + ;; + esac echo "Searching for instances: $filter" - cloud_FindInstances "$filter" + cloud_FindInstances "$filter" "$reclaimOnlyPreemptibleReservations" if [[ ${#instances[@]} -eq 0 ]]; then echo "No instances found matching '$filter'" else - cloud_DeleteInstances true & + cloud_DeleteInstances $forceDelete fi wait @@ -817,7 +861,7 @@ EOF cloud_CreateInstances "$prefix" "$prefix-bootstrap-validator" 1 \ "$enableGpu" "$bootstrapLeaderMachineType" "${zones[0]}" "$validatorBootDiskSizeInGb" \ "$startupScript" "$bootstrapLeaderAddress" "$bootDiskType" "$validatorAdditionalDiskSizeInGb" \ - "never preemptible" "$sshPrivateKey" + "$maybePreemptible" "$sshPrivateKey" fi if [[ $additionalValidatorCount -gt 0 ]]; then @@ -847,19 +891,19 @@ EOF if [[ $clientNodeCount -gt 0 ]]; then cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \ "$enableGpu" "$clientMachineType" "${zones[0]}" "$clientBootDiskSizeInGb" \ - "$startupScript" "" "$bootDiskType" "" "never preemptible" "$sshPrivateKey" + "$startupScript" "" "$bootDiskType" "" "$maybePreemptible" "$sshPrivateKey" fi if $blockstreamer; then cloud_CreateInstances "$prefix" "$prefix-blockstreamer" "1" \ "$enableGpu" "$blockstreamerMachineType" "${zones[0]}" "$validatorBootDiskSizeInGb" \ - "$startupScript" "$blockstreamerAddress" "$bootDiskType" "" "$sshPrivateKey" + "$startupScript" "$blockstreamerAddress" "$bootDiskType" "" "$maybePreemptible" "$sshPrivateKey" fi if [[ $archiverNodeCount -gt 0 ]]; then cloud_CreateInstances "$prefix" "$prefix-archiver" "$archiverNodeCount" \ false "$archiverMachineType" "${zones[0]}" "$archiverBootDiskSizeInGb" \ - "$startupScript" "" "" "" "never preemptible" "$sshPrivateKey" + "$startupScript" "" "" "" "$maybePreemptible" "$sshPrivateKey" fi $metricsWriteDatapoint "testnet-deploy net-create-complete=1" diff --git a/net/scripts/colo-node-onacquire.sh b/net/scripts/colo-node-onacquire.sh index 25aed22f7..9346996ad 100644 --- a/net/scripts/colo-node-onacquire.sh +++ b/net/scripts/colo-node-onacquire.sh @@ -3,6 +3,7 @@ # These variable must be set before the main body is called SOLANA_LOCK_FILE="${SOLANA_LOCK_FILE:?}" INSTANCE_NAME="${INSTANCE_NAME:?}" +PREEMPTIBLE="${PREEMPTIBLE:?}" SSH_AUTHORIZED_KEYS="${SSH_AUTHORIZED_KEYS:?}" SSH_PRIVATE_KEY_TEXT="${SSH_PRIVATE_KEY_TEXT:?}" SSH_PUBLIC_KEY_TEXT="${SSH_PUBLIC_KEY_TEXT:?}" @@ -16,6 +17,7 @@ if [[ ! -f "${SOLANA_LOCK_FILE}" ]]; then { echo "export SOLANA_LOCK_USER=${SOLANA_USER}" echo "export SOLANA_LOCK_INSTANCENAME=${INSTANCE_NAME}" + echo "export PREEMPTIBLE=${PREEMPTIBLE}" echo "[[ -v SSH_TTY && -f \"${HOME}/.solana-motd\" ]] && cat \"${HOME}/.solana-motd\" 1>&2" } >&9 exec 9>&- diff --git a/net/scripts/colo-node-onfree.sh b/net/scripts/colo-node-onfree.sh index 077034566..c541b210d 100644 --- a/net/scripts/colo-node-onfree.sh +++ b/net/scripts/colo-node-onfree.sh @@ -4,6 +4,7 @@ SOLANA_LOCK_FILE="${SOLANA_LOCK_FILE:?}" SECONDARY_DISK_MOUNT_POINT="${SECONDARY_DISK_MOUNT_POINT:?}" SSH_AUTHORIZED_KEYS="${SSH_AUTHORIZED_KEYS:?}" +FORCE_DELETE="${FORCE_DELETE}" RC=false if [[ -f "${SOLANA_LOCK_FILE}" ]]; then @@ -11,7 +12,7 @@ if [[ -f "${SOLANA_LOCK_FILE}" ]]; then flock -x -n 9 || ( echo "Failed to acquire lock!" 1>&2 && exit 1 ) # shellcheck disable=SC1090 . "${SOLANA_LOCK_FILE}" - if [[ "${SOLANA_LOCK_USER}" = "${SOLANA_USER}" ]]; then + if [[ "${SOLANA_LOCK_USER}" = "${SOLANA_USER}" || -n "${FORCE_DELETE}" ]]; then # Begin running process cleanup CLEANUP_PID=$$ CLEANUP_PIDS=() diff --git a/net/scripts/colo-provider.sh b/net/scripts/colo-provider.sh index 818d78f14..c68e834eb 100755 --- a/net/scripts/colo-provider.sh +++ b/net/scripts/colo-provider.sh @@ -39,6 +39,7 @@ cloud_RestartPreemptedInstances() { __cloud_FindInstances() { declare HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME INSTANCES_TEXT declare filter=${1} + declare onlyPreemptible=${2} instances=() if ! ${COLO_PARALLELIZE}; then @@ -47,10 +48,14 @@ __cloud_FindInstances() { fi INSTANCES_TEXT="$( for AVAIL in "${COLO_RES_AVAILABILITY[@]}"; do - IFS=$'\v' read -r HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME <<<"${AVAIL}" + IFS=$'\v' read -r HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME PREEMPTIBLE <<<"${AVAIL}" if [[ ${INSTNAME} =~ ${filter} ]]; then - printf "%-40s | publicIp=%-16s privateIp=%s zone=%s\n" "${INSTNAME}" "${IP}" "${PRIV_IP}" "${ZONE}" 1>&2 - echo -e "${INSTNAME}:${IP}:${PRIV_IP}:${ZONE}" + if [[ -n $onlyPreemptible && $PREEMPTIBLE == "false" ]]; then + continue + else + printf "%-40s | publicIp=%-16s privateIp=%s zone=%s preemptible=%s\n" "${INSTNAME}" "${IP}" "${PRIV_IP}" "${ZONE}" "${PREEMPTIBLE}" 1>&2 + echo -e "${INSTNAME}:${IP}:${PRIV_IP}:${ZONE}" + fi fi done | sort -t $'\v' -k1 )" @@ -77,7 +82,8 @@ __cloud_FindInstances() { # cloud_FindInstances() { declare filter="^${1}.*" - __cloud_FindInstances "${filter}" + declare onlyPreemptible="${2}" + __cloud_FindInstances "${filter}" "${onlyPreemptible}" } # @@ -96,7 +102,8 @@ cloud_FindInstances() { # cloud_FindInstance() { declare name="^${1}$" - __cloud_FindInstances "${name}" + declare onlyPreemptible="${2}" + __cloud_FindInstances "${name}" "${onlyPreemptible}" } # @@ -155,7 +162,7 @@ cloud_CreateInstances() { #declare optionalAddress="${9}" # unused #declare optionalBootDiskType="${10}" # unused #declare optionalAdditionalDiskSize="${11}" # unused - #declare optionalPreemptible="${12}" # unused + declare optionalPreemptible="${12}" declare sshPrivateKey="${13}" declare -a nodes @@ -213,7 +220,7 @@ cloud_CreateInstances() { RES_MACH="${COLO_RES_MACHINE[${RI}]}" IP="${COLO_RES_IP[${RI}]}" if colo_machine_types_compatible "${RES_MACH}" "${machineType}"; then - if colo_node_requisition "${IP}" "${node}" "${sshPrivateKey}" >/dev/null; then + if colo_node_requisition "${IP}" "${node}" "${sshPrivateKey}" "${optionalPreemptible}" >/dev/null; then NI=$((NI+1)) fi fi @@ -228,10 +235,11 @@ cloud_CreateInstances() { # Deletes all the instances listed in the `instances` array # cloud_DeleteInstances() { + declare forceDelete="${1}" declare _ IP _ _ for instance in "${instances[@]}"; do IFS=':' read -r _ IP _ _ <<< "${instance}" - colo_node_free "${IP}" >/dev/null + colo_node_free "${IP}" "${forceDelete}" >/dev/null done } @@ -270,13 +278,13 @@ cloud_FetchFile() { } cloud_StatusAll() { - declare HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME + declare HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME PREEMPTIBLE if ! ${COLO_PARALLELIZE}; then colo_load_resources colo_load_availability false fi for AVAIL in "${COLO_RES_AVAILABILITY[@]}"; do - IFS=$'\v' read -r HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME <<<"${AVAIL}" - printf "%-30s | publicIp=%-16s privateIp=%s status=%s who=%s zone=%s inst=%s\n" "${HOST_NAME}" "${IP}" "${PRIV_IP}" "${STATUS}" "${LOCK_USER}" "${ZONE}" "${INSTNAME}" + IFS=$'\v' read -r HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME PREEMPTIBLE <<<"${AVAIL}" + printf "%-30s | publicIp=%-16s privateIp=%s status=%s who=%s zone=%s inst=%s preemptible=%s\n" "${HOST_NAME}" "${IP}" "${PRIV_IP}" "${STATUS}" "${LOCK_USER}" "${ZONE}" "${INSTNAME}" "${PREEMPTIBLE}" done } diff --git a/net/scripts/colo-utils.sh b/net/scripts/colo-utils.sh index 3e92b724f..60682126d 100644 --- a/net/scripts/colo-utils.sh +++ b/net/scripts/colo-utils.sh @@ -46,17 +46,17 @@ declare COLO_RES_AVAILABILITY_CACHED=false declare -ax COLO_RES_AVAILABILITY colo_load_availability() { declare USE_CACHE=${1:-${COLO_RES_AVAILABILITY_CACHED}} - declare LINE PRIV_IP STATUS LOCK_USER I IP HOST_NAME ZONE INSTNAME + declare LINE PRIV_IP STATUS LOCK_USER I IP HOST_NAME ZONE INSTNAME PREEMPTIBLE if ! ${USE_CACHE}; then COLO_RES_AVAILABILITY=() COLO_RES_REQUISITIONED=() while read -r LINE; do - IFS=$'\v' read -r IP STATUS LOCK_USER INSTNAME <<< "${LINE}" + IFS=$'\v' read -r IP STATUS LOCK_USER INSTNAME PREEMPTIBLE <<< "${LINE}" I=$(colo_res_index_from_ip "${IP}") PRIV_IP="${COLO_RES_IP_PRIV[${I}]}" HOST_NAME="${COLO_RES_HOSTNAME[${I}]}" ZONE="${COLO_RES_ZONE[${I}]}" - COLO_RES_AVAILABILITY+=( "$(echo -e "${HOST_NAME}\v${IP}\v${PRIV_IP}\v${STATUS}\v${ZONE}\v${LOCK_USER}\v${INSTNAME}")" ) + COLO_RES_AVAILABILITY+=( "$(echo -e "${HOST_NAME}\v${IP}\v${PRIV_IP}\v${STATUS}\v${ZONE}\v${LOCK_USER}\v${INSTNAME}\v${PREEMPTIBLE}")" ) done < <(colo_node_status_all | sort -t $'\v' -k1) COLO_RES_AVAILABILITY_CACHED=true fi @@ -142,15 +142,15 @@ __colo_node_status_script() { # the time due to ${SOLANA_LOCK_FILE} not existing and is running from a # subshell where normal redirection doesn't work exec 9<"${SOLANA_LOCK_FILE}" && flock -s 9 && . "${SOLANA_LOCK_FILE}" && exec 9>&- - echo -e "\${SOLANA_LOCK_USER}\\v\${SOLANA_LOCK_INSTANCENAME}\\vEOL" + echo -e "\${SOLANA_LOCK_USER}\\v\${SOLANA_LOCK_INSTANCENAME}\\v\${PREEMPTIBLE}\\vEOL" exec 2>&3 # Restore stderr EOF } __colo_node_status_result_normalize() { - declare IP RC US BY INSTNAME EOL + declare IP RC US BY INSTNAME PREEMPTIBLE EOL declare ST="DOWN" - IFS=$'\v' read -r IP RC US INSTNAME EOL <<< "${1}" + IFS=$'\v' read -r IP RC US INSTNAME PREEMPTIBLE EOL <<< "${1}" if [ "${RC}" -eq 0 ]; then [[ "${EOL}" = "EOL" ]] || echo "${FUNCNAME[0]}: Unexpected input \"${1}\"" 1>&2 if [ -n "${US}" ]; then @@ -163,7 +163,7 @@ __colo_node_status_result_normalize() { ST="FREE" fi fi - echo -e $"${IP}\v${ST}\v${BY}\v${INSTNAME}" + echo -e $"${IP}\v${ST}\v${BY}\v${INSTNAME}\v${PREEMPTIBLE}" } colo_node_status() { @@ -188,6 +188,7 @@ colo_node_requisition() { declare INSTANCE_NAME=${2} # shellcheck disable=SC2034 declare SSH_PRIVATE_KEY="${3}" + declare PREEMPTIBLE="${4}" declare INDEX INDEX=$(colo_res_index_from_ip "${IP}") @@ -196,6 +197,7 @@ colo_node_requisition() { colo_instance_run "${IP}" "$(cat <