Plumb pre-emptibility and associated overrides into colo allocation and automated testing (#8754)

automerge
This commit is contained in:
Dan Albert 2020-03-10 11:25:44 -07:00 committed by GitHub
parent 496999beba
commit 595c96b262
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 89 additions and 29 deletions

View File

@ -61,7 +61,6 @@ colo)
;;
esac
prefix=testnet-dev-${USER//[^A-Za-z0-9]/}
additionalValidatorCount=2
clientNodeCount=0
@ -154,6 +153,7 @@ Manage testnet instances
(by default preemptible instances are used to reduce
cost). Note that the bootstrap validator, archiver,
blockstreamer and client nodes are always dedicated.
Set this flag on colo to prevent your testnet from being pre-empted by nightly test automation.
--self-destruct-hours [number]
- Specify lifetime of the allocated instances in hours. 0 to
disable. Only supported on GCE. (default: $selfDestructHours)
@ -162,7 +162,11 @@ Manage testnet instances
-P - Use public network IP addresses (default: $publicNetwork)
delete-specific options:
none
--reclaim-preemptible-reservations
- If set, reclaims all reservations on colo nodes that were not created with --dedicated.
This behavior does not filter by testnet name or owner. Only implemented on colo.
--reclaim-all-reservations
- If set, reclaims all reservations on all colo nodes, regardless of owner, pre-emptibility, or creator.
info-specific options:
--eval - Output in a form that can be eval-ed by a shell: eval $(gce.sh info)
@ -215,6 +219,12 @@ while [[ -n $1 ]]; do
usage 1
fi
shift 2
elif [[ $1 == --reclaim-preemptible-reservations ]]; then
reclaimOnlyPreemptibleReservations=true
shift
elif [[ $1 == --reclaim-all-reservations ]]; then
reclaimAllReservations=true
shift
else
usage "Unknown long option: $1"
fi
@ -308,6 +318,26 @@ ec2|azure|colo)
;;
esac
# Pick the preemptibility argument that is later handed to
# cloud_CreateInstances: gce/ec2/azure always use the literal
# "never preemptible", while colo forwards the value of $preemptible.
if [[ $cloudProvider == gce || $cloudProvider == ec2 || $cloudProvider == azure ]]; then
  maybePreemptible="never preemptible"
elif [[ $cloudProvider == colo ]]; then
  maybePreemptible=$preemptible
else
  echo "Error: Unknown cloud provider: $cloudProvider"
fi

# The two reclaim flags are mutually exclusive; report via usage when both are set.
if [[ $reclaimOnlyPreemptibleReservations == "true" ]]; then
  if [[ $reclaimAllReservations == "true" ]]; then
    usage "Cannot set both --reclaim-preemptible-reservations and --reclaim-all-reservations. Set one or none"
  fi
fi

# Either reclaim flag switches the delete path into force mode.
[[ -z $reclaimAllReservations && -z $reclaimOnlyPreemptibleReservations ]] || forceDelete="true"
# cloud_ForEachInstance [cmd] [extra args to cmd]
#
# Execute a command for each element in the `instances` array
@ -594,16 +624,30 @@ EOF
delete() {
$metricsWriteDatapoint "testnet-deploy net-delete-begin=1"
# Filter for all nodes
filter="$prefix-"
case $cloudProvider in
gce | ec2 | azure)
# Filter for all nodes
filter="$prefix-"
;;
colo)
if [[ -n $forceDelete ]]; then
filter=".*-"
else
filter="$prefix-"
fi
;;
*)
echo "Error: Unknown cloud provider: $cloudProvider"
;;
esac
echo "Searching for instances: $filter"
cloud_FindInstances "$filter"
cloud_FindInstances "$filter" "$reclaimOnlyPreemptibleReservations"
if [[ ${#instances[@]} -eq 0 ]]; then
echo "No instances found matching '$filter'"
else
cloud_DeleteInstances true &
cloud_DeleteInstances $forceDelete
fi
wait
@ -817,7 +861,7 @@ EOF
cloud_CreateInstances "$prefix" "$prefix-bootstrap-validator" 1 \
"$enableGpu" "$bootstrapLeaderMachineType" "${zones[0]}" "$validatorBootDiskSizeInGb" \
"$startupScript" "$bootstrapLeaderAddress" "$bootDiskType" "$validatorAdditionalDiskSizeInGb" \
"never preemptible" "$sshPrivateKey"
"$maybePreemptible" "$sshPrivateKey"
fi
if [[ $additionalValidatorCount -gt 0 ]]; then
@ -847,19 +891,19 @@ EOF
if [[ $clientNodeCount -gt 0 ]]; then
cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
"$enableGpu" "$clientMachineType" "${zones[0]}" "$clientBootDiskSizeInGb" \
"$startupScript" "" "$bootDiskType" "" "never preemptible" "$sshPrivateKey"
"$startupScript" "" "$bootDiskType" "" "$maybePreemptible" "$sshPrivateKey"
fi
if $blockstreamer; then
cloud_CreateInstances "$prefix" "$prefix-blockstreamer" "1" \
"$enableGpu" "$blockstreamerMachineType" "${zones[0]}" "$validatorBootDiskSizeInGb" \
"$startupScript" "$blockstreamerAddress" "$bootDiskType" "" "$sshPrivateKey"
"$startupScript" "$blockstreamerAddress" "$bootDiskType" "" "$maybePreemptible" "$sshPrivateKey"
fi
if [[ $archiverNodeCount -gt 0 ]]; then
cloud_CreateInstances "$prefix" "$prefix-archiver" "$archiverNodeCount" \
false "$archiverMachineType" "${zones[0]}" "$archiverBootDiskSizeInGb" \
"$startupScript" "" "" "" "never preemptible" "$sshPrivateKey"
"$startupScript" "" "" "" "$maybePreemptible" "$sshPrivateKey"
fi
$metricsWriteDatapoint "testnet-deploy net-create-complete=1"

View File

@ -3,6 +3,7 @@
# These variables must be set before the main body is called
SOLANA_LOCK_FILE="${SOLANA_LOCK_FILE:?}"
INSTANCE_NAME="${INSTANCE_NAME:?}"
PREEMPTIBLE="${PREEMPTIBLE:?}"
SSH_AUTHORIZED_KEYS="${SSH_AUTHORIZED_KEYS:?}"
SSH_PRIVATE_KEY_TEXT="${SSH_PRIVATE_KEY_TEXT:?}"
SSH_PUBLIC_KEY_TEXT="${SSH_PUBLIC_KEY_TEXT:?}"
@ -16,6 +17,7 @@ if [[ ! -f "${SOLANA_LOCK_FILE}" ]]; then
{
echo "export SOLANA_LOCK_USER=${SOLANA_USER}"
echo "export SOLANA_LOCK_INSTANCENAME=${INSTANCE_NAME}"
echo "export PREEMPTIBLE=${PREEMPTIBLE}"
echo "[[ -v SSH_TTY && -f \"${HOME}/.solana-motd\" ]] && cat \"${HOME}/.solana-motd\" 1>&2"
} >&9
exec 9>&-

View File

@ -4,6 +4,7 @@
SOLANA_LOCK_FILE="${SOLANA_LOCK_FILE:?}"
SECONDARY_DISK_MOUNT_POINT="${SECONDARY_DISK_MOUNT_POINT:?}"
SSH_AUTHORIZED_KEYS="${SSH_AUTHORIZED_KEYS:?}"
FORCE_DELETE="${FORCE_DELETE}"
RC=false
if [[ -f "${SOLANA_LOCK_FILE}" ]]; then
@ -11,7 +12,7 @@ if [[ -f "${SOLANA_LOCK_FILE}" ]]; then
flock -x -n 9 || ( echo "Failed to acquire lock!" 1>&2 && exit 1 )
# shellcheck disable=SC1090
. "${SOLANA_LOCK_FILE}"
if [[ "${SOLANA_LOCK_USER}" = "${SOLANA_USER}" ]]; then
if [[ "${SOLANA_LOCK_USER}" = "${SOLANA_USER}" || -n "${FORCE_DELETE}" ]]; then
# Begin running process cleanup
CLEANUP_PID=$$
CLEANUP_PIDS=()

View File

@ -39,6 +39,7 @@ cloud_RestartPreemptedInstances() {
__cloud_FindInstances() {
declare HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME INSTANCES_TEXT
declare filter=${1}
declare onlyPreemptible=${2}
instances=()
if ! ${COLO_PARALLELIZE}; then
@ -47,10 +48,14 @@ __cloud_FindInstances() {
fi
INSTANCES_TEXT="$(
for AVAIL in "${COLO_RES_AVAILABILITY[@]}"; do
IFS=$'\v' read -r HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME <<<"${AVAIL}"
IFS=$'\v' read -r HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME PREEMPTIBLE <<<"${AVAIL}"
if [[ ${INSTNAME} =~ ${filter} ]]; then
printf "%-40s | publicIp=%-16s privateIp=%s zone=%s\n" "${INSTNAME}" "${IP}" "${PRIV_IP}" "${ZONE}" 1>&2
echo -e "${INSTNAME}:${IP}:${PRIV_IP}:${ZONE}"
if [[ -n $onlyPreemptible && $PREEMPTIBLE == "false" ]]; then
continue
else
printf "%-40s | publicIp=%-16s privateIp=%s zone=%s preemptible=%s\n" "${INSTNAME}" "${IP}" "${PRIV_IP}" "${ZONE}" "${PREEMPTIBLE}" 1>&2
echo -e "${INSTNAME}:${IP}:${PRIV_IP}:${ZONE}"
fi
fi
done | sort -t $'\v' -k1
)"
@ -77,7 +82,8 @@ __cloud_FindInstances() {
#
# Prefix search: wraps ${1} into the anchored regex "^${1}.*" and passes it,
# together with the optional onlyPreemptible flag (${2}), to
# __cloud_FindInstances.
# NOTE(review): two __cloud_FindInstances calls appear below, one without and
# one with onlyPreemptible. This text looks like a diff rendering whose +/-
# markers were lost, so the first call is presumably the pre-change line --
# confirm against the real file.
cloud_FindInstances() {
declare filter="^${1}.*"
__cloud_FindInstances "${filter}"
declare onlyPreemptible="${2}"
__cloud_FindInstances "${filter}" "${onlyPreemptible}"
}
#
@ -96,7 +102,8 @@ cloud_FindInstances() {
#
# Exact-name search: wraps ${1} into the fully anchored regex "^${1}$" and
# passes it, together with the optional onlyPreemptible flag (${2}), to
# __cloud_FindInstances.
# NOTE(review): two __cloud_FindInstances calls appear below, one without and
# one with onlyPreemptible. This text looks like a diff rendering whose +/-
# markers were lost, so the first call is presumably the pre-change line --
# confirm against the real file.
cloud_FindInstance() {
declare name="^${1}$"
__cloud_FindInstances "${name}"
declare onlyPreemptible="${2}"
__cloud_FindInstances "${name}" "${onlyPreemptible}"
}
#
@ -155,7 +162,7 @@ cloud_CreateInstances() {
#declare optionalAddress="${9}" # unused
#declare optionalBootDiskType="${10}" # unused
#declare optionalAdditionalDiskSize="${11}" # unused
#declare optionalPreemptible="${12}" # unused
declare optionalPreemptible="${12}"
declare sshPrivateKey="${13}"
declare -a nodes
@ -213,7 +220,7 @@ cloud_CreateInstances() {
RES_MACH="${COLO_RES_MACHINE[${RI}]}"
IP="${COLO_RES_IP[${RI}]}"
if colo_machine_types_compatible "${RES_MACH}" "${machineType}"; then
if colo_node_requisition "${IP}" "${node}" "${sshPrivateKey}" >/dev/null; then
if colo_node_requisition "${IP}" "${node}" "${sshPrivateKey}" "${optionalPreemptible}" >/dev/null; then
NI=$((NI+1))
fi
fi
@ -228,10 +235,11 @@ cloud_CreateInstances() {
# Deletes all the instances listed in the `instances` array
#
# Frees every node listed in the global `instances` array.
#   ${1} (forceDelete) - forwarded unchanged as the second argument of
#                        colo_node_free; may be empty.
# NOTE(review): two colo_node_free calls appear in the loop, one without and
# one with forceDelete. This text looks like a diff rendering whose +/-
# markers were lost, so the first call is presumably the pre-change line --
# confirm against the real file.
cloud_DeleteInstances() {
declare forceDelete="${1}"
declare _ IP _ _
for instance in "${instances[@]}"; do
# Entries are colon-separated; only the second field (the IP) is used here.
IFS=':' read -r _ IP _ _ <<< "${instance}"
colo_node_free "${IP}" >/dev/null
colo_node_free "${IP}" "${forceDelete}" >/dev/null
done
}
@ -270,13 +278,13 @@ cloud_FetchFile() {
}
# Prints one status line per entry of COLO_RES_AVAILABILITY to stdout.
# NOTE(review): the declare, read and printf lines each appear twice, once
# without and once with PREEMPTIBLE. This text looks like a diff rendering
# whose +/- markers were lost, so the first of each pair is presumably the
# pre-change line -- confirm against the real file.
cloud_StatusAll() {
declare HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME
declare HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME PREEMPTIBLE
# COLO_PARALLELIZE is executed as a command (presumably `true` or `false`).
# When it fails, resources and availability are reloaded here, with the
# availability cache bypassed by the explicit `false` argument.
if ! ${COLO_PARALLELIZE}; then
colo_load_resources
colo_load_availability false
fi
for AVAIL in "${COLO_RES_AVAILABILITY[@]}"; do
# Each entry is a vertical-tab (\v) separated record.
IFS=$'\v' read -r HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME <<<"${AVAIL}"
printf "%-30s | publicIp=%-16s privateIp=%s status=%s who=%s zone=%s inst=%s\n" "${HOST_NAME}" "${IP}" "${PRIV_IP}" "${STATUS}" "${LOCK_USER}" "${ZONE}" "${INSTNAME}"
IFS=$'\v' read -r HOST_NAME IP PRIV_IP STATUS ZONE LOCK_USER INSTNAME PREEMPTIBLE <<<"${AVAIL}"
printf "%-30s | publicIp=%-16s privateIp=%s status=%s who=%s zone=%s inst=%s preemptible=%s\n" "${HOST_NAME}" "${IP}" "${PRIV_IP}" "${STATUS}" "${LOCK_USER}" "${ZONE}" "${INSTNAME}" "${PREEMPTIBLE}"
done
}

View File

@ -46,17 +46,17 @@ declare COLO_RES_AVAILABILITY_CACHED=false
declare -ax COLO_RES_AVAILABILITY
colo_load_availability() {
declare USE_CACHE=${1:-${COLO_RES_AVAILABILITY_CACHED}}
declare LINE PRIV_IP STATUS LOCK_USER I IP HOST_NAME ZONE INSTNAME
declare LINE PRIV_IP STATUS LOCK_USER I IP HOST_NAME ZONE INSTNAME PREEMPTIBLE
if ! ${USE_CACHE}; then
COLO_RES_AVAILABILITY=()
COLO_RES_REQUISITIONED=()
while read -r LINE; do
IFS=$'\v' read -r IP STATUS LOCK_USER INSTNAME <<< "${LINE}"
IFS=$'\v' read -r IP STATUS LOCK_USER INSTNAME PREEMPTIBLE <<< "${LINE}"
I=$(colo_res_index_from_ip "${IP}")
PRIV_IP="${COLO_RES_IP_PRIV[${I}]}"
HOST_NAME="${COLO_RES_HOSTNAME[${I}]}"
ZONE="${COLO_RES_ZONE[${I}]}"
COLO_RES_AVAILABILITY+=( "$(echo -e "${HOST_NAME}\v${IP}\v${PRIV_IP}\v${STATUS}\v${ZONE}\v${LOCK_USER}\v${INSTNAME}")" )
COLO_RES_AVAILABILITY+=( "$(echo -e "${HOST_NAME}\v${IP}\v${PRIV_IP}\v${STATUS}\v${ZONE}\v${LOCK_USER}\v${INSTNAME}\v${PREEMPTIBLE}")" )
done < <(colo_node_status_all | sort -t $'\v' -k1)
COLO_RES_AVAILABILITY_CACHED=true
fi
@ -142,15 +142,15 @@ __colo_node_status_script() {
# the time due to ${SOLANA_LOCK_FILE} not existing and is running from a
# subshell where normal redirection doesn't work
exec 9<"${SOLANA_LOCK_FILE}" && flock -s 9 && . "${SOLANA_LOCK_FILE}" && exec 9>&-
echo -e "\${SOLANA_LOCK_USER}\\v\${SOLANA_LOCK_INSTANCENAME}\\vEOL"
echo -e "\${SOLANA_LOCK_USER}\\v\${SOLANA_LOCK_INSTANCENAME}\\v\${PREEMPTIBLE}\\vEOL"
exec 2>&3 # Restore stderr
EOF
}
__colo_node_status_result_normalize() {
declare IP RC US BY INSTNAME EOL
declare IP RC US BY INSTNAME PREEMPTIBLE EOL
declare ST="DOWN"
IFS=$'\v' read -r IP RC US INSTNAME EOL <<< "${1}"
IFS=$'\v' read -r IP RC US INSTNAME PREEMPTIBLE EOL <<< "${1}"
if [ "${RC}" -eq 0 ]; then
[[ "${EOL}" = "EOL" ]] || echo "${FUNCNAME[0]}: Unexpected input \"${1}\"" 1>&2
if [ -n "${US}" ]; then
@ -163,7 +163,7 @@ __colo_node_status_result_normalize() {
ST="FREE"
fi
fi
echo -e $"${IP}\v${ST}\v${BY}\v${INSTNAME}"
echo -e $"${IP}\v${ST}\v${BY}\v${INSTNAME}\v${PREEMPTIBLE}"
}
colo_node_status() {
@ -188,6 +188,7 @@ colo_node_requisition() {
declare INSTANCE_NAME=${2}
# shellcheck disable=SC2034
declare SSH_PRIVATE_KEY="${3}"
declare PREEMPTIBLE="${4}"
declare INDEX
INDEX=$(colo_res_index_from_ip "${IP}")
@ -196,6 +197,7 @@ colo_node_requisition() {
colo_instance_run "${IP}" "$(cat <<EOF
SOLANA_LOCK_FILE="${SOLANA_LOCK_FILE}"
INSTANCE_NAME="${INSTANCE_NAME}"
PREEMPTIBLE="${PREEMPTIBLE}"
SSH_AUTHORIZED_KEYS='$("${__colo_here}"/add-datacenter-solana-user-authorized_keys.sh 2> /dev/null)'
SSH_PRIVATE_KEY_TEXT="$(<"${SSH_PRIVATE_KEY}")"
SSH_PUBLIC_KEY_TEXT="$(<"${SSH_PRIVATE_KEY}.pub")"
@ -238,10 +240,12 @@ colo_machine_types_compatible() {
colo_node_free() {
declare IP=${1}
declare FORCE_DELETE=${2}
colo_instance_run "${IP}" "$(cat <<EOF
SOLANA_LOCK_FILE="${SOLANA_LOCK_FILE}"
SECONDARY_DISK_MOUNT_POINT="${SECONDARY_DISK_MOUNT_POINT}"
SSH_AUTHORIZED_KEYS='$("${__colo_here}"/add-datacenter-solana-user-authorized_keys.sh 2> /dev/null)'
FORCE_DELETE="${FORCE_DELETE}"
$(<"${__colo_here}"/colo-node-onfree.sh)
EOF
)"

View File

@ -160,6 +160,7 @@ function launchTestnet() {
${ADDITIONAL_FLAGS[@]/#/" "}
;;
colo)
net/colo.sh delete --reclaim-preemptible-reservations
# shellcheck disable=SC2068
# shellcheck disable=SC2086
net/colo.sh create \