
# |source| this file
#
# Utilities for working with gcloud
#
# gcloud_FindInstances [filter] [options]
# Find instances matching the specified pattern.
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:zone:public IP:private IP"
# filter - The instances to filter on
# options - If set to the string "show", the list of instances will be echoed
# to stdout
# examples:
# $ gcloud_FindInstances "name=exact-machine-name"
# $ gcloud_FindInstances "name~^all-machines-with-a-common-machine-prefix"
gcloud_FindInstances() {
  declare filter="$1"
  declare options="${2:-}"

  # Every search reports only what matches |filter|, so start the global
  # result list empty instead of appending to a previous search.
  instances=()

  declare name zone publicIp privateIp status
  # One record per instance is read from gcloud; the field order below must
  # match the order of the keys in the --format expression.
  while read -r name zone publicIp privateIp status; do
    if [[ $status != RUNNING ]]; then
      echo "Warning: $name is not RUNNING, ignoring it."
      continue
    fi
    if [[ $options = show ]]; then
      printf "%-30s | %-16s publicIp=%-16s privateIp=%s\n" "$name" "$zone" "$publicIp" "$privateIp"
    fi

    # Entry layout consumed by the other gcloud_* helpers:
    #   "name:zone:public IP:private IP"
    instances+=("$name:$zone:$publicIp:$privateIp")
  done < <(gcloud compute instances list \
    --filter="$filter" \
    --format 'value(name,zone,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
}
# gcloud_ForEachInstance [cmd] [extra args to cmd]
# Execute a command for each element in the `instances` array
# cmd - The command to execute on each instance
# The command will receive the arguments below, followed by any
# additional arguments supplied to gcloud_ForEachInstance:
# name - name of the instance
# zone - zone the instance is located in
# publicIp - The public IP address of this instance
# privateIp - The private IP address of this instance
# count - Monotonically increasing count for each
# invocation of cmd, starting at 1
# ... - Extra args to cmd..
gcloud_ForEachInstance() {
  declare cmd="${1:-}"
  [[ -n $cmd ]] || { echo gcloud_ForEachInstance: cmd not specified; exit 1; }
  # Drop |cmd| so that "$@" holds only the caller's extra arguments; without
  # this the command would receive its own name as the first extra argument.
  shift

  declare count=1
  declare info name zone publicIp privateIp
  for info in "${instances[@]}"; do
    # Split the "name:zone:publicIp:privateIp" entry on ':'.
    IFS=: read -r name zone publicIp privateIp <<<"$info"

    # NOTE: eval is kept so that |cmd| may itself be a multi-word command
    # string, but it re-parses every argument: values containing whitespace or
    # shell metacharacters are not passed through verbatim.  Only use this
    # helper with trusted commands and arguments.
    eval "$cmd" "$name" "$zone" "$publicIp" "$privateIp" "$count" "$@"
    count=$((count + 1))
  done
}
# gcloud_CreateInstances [namePrefix] [numNodes] [zone] [imageName]
# [machineType] [bootDiskSize] [accelerator]
# [startupScript] [address]
# Creates one or more identical instances.
# namePrefix - unique string to prefix all the instance names with
# numNodes - number of instances to create
# zone - zone to create the instances in
# imageName - Disk image for the instances
# machineType - GCE machine type
# bootDiskSize - Optional size of the boot disk
# accelerator - Optional accelerator to attach to the instance(s), in the
# format accepted by the --accelerator flag of |gcloud compute instances create|,
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
# startupScript - Optional startup script to execute when the instance boots
# address - Optional name of the GCE static IP address to attach to the
# instance. Requires that |numNodes| = 1 and that |address|
# has been provisioned in the GCE region that is hosting |zone|
# Tip: use gcloud_FindInstances to locate the instances once this function
# returns
gcloud_CreateInstances() {
  declare namePrefix="$1"
  declare numNodes="$2"
  declare zone="$3"
  declare imageName="$4"
  declare machineType="$5"
  declare optionalBootDiskSize="${6:-}"
  declare optionalAccelerator="${7:-}"
  declare optionalStartupScript="${8:-}"
  declare optionalAddress="${9:-}"

  # A single node is named exactly |namePrefix|; multiple nodes get a numeric
  # suffix starting at 1.  The names are built with a loop rather than
  # `read -ra nodes <<<$(seq ...)`: here-strings are not word-split, so that
  # form only captures the first of seq's newline-separated names.
  declare -a nodes=()
  if [[ $numNodes = 1 ]]; then
    nodes=("$namePrefix")
  else
    declare i
    for ((i = 1; i <= numNodes; i++)); do
      nodes+=("${namePrefix}${i}")
    done
  fi

  # NOTE(review): only flags derivable from this function's parameters are
  # passed; confirm whether the deployment needs further fixed flags (network
  # tags, restart policy, ...).
  declare -a args=(
    "--zone=$zone"
    "--image=$imageName"
    "--machine-type=$machineType"
  )
  if [[ -n $optionalBootDiskSize ]]; then
    args+=("--boot-disk-size=$optionalBootDiskSize")
  fi
  if [[ -n $optionalAccelerator ]]; then
    args+=(
      "--accelerator=$optionalAccelerator"
      --maintenance-policy TERMINATE
    )
  fi
  if [[ -n $optionalStartupScript ]]; then
    args+=(
      --metadata-from-file "startup-script=$optionalStartupScript"
    )
  fi

  if [[ -n $optionalAddress ]]; then
    # One static address cannot be shared by several instances.
    [[ $numNodes = 1 ]] || {
      echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
      exit 1
    }
    args+=("--address=$optionalAddress")
  fi

  # The subshell keeps `set -x` tracing from leaking into the caller.
  (
    set -x
    gcloud beta compute instances create "${nodes[@]}" "${args[@]}"
  )
}
# gcloud_DeleteInstances [yes]
# Deletes all the instances listed in the `instances` array
# If yes = "true", skip the delete confirmation
gcloud_DeleteInstances() {
  # --quiet suppresses gcloud's interactive confirmation prompt.
  declare maybeQuiet=
  if [[ ${1:-} = true ]]; then
    maybeQuiet=--quiet
  fi

  # Count the array elements (not the characters of the first element) and
  # bail out early: calling gcloud with no instance names would be an error.
  if [[ ${#instances[@]} -eq 0 ]]; then
    echo No instances to delete
    return
  fi

  # Strip everything from the first ':' on, leaving just the instance names.
  declare names=("${instances[@]/:*/}")

  # Assume all instances are in the same zone
  # TODO: One day this assumption will be invalid
  declare zone
  IFS=: read -r _ zone _ <<<"${instances[0]}"

  # The subshell keeps `set -x` tracing from leaking into the caller.
  (
    set -x
    # ${var:+...} expands to nothing at all when maybeQuiet is empty, so no
    # empty argument is handed to gcloud.
    gcloud beta compute instances delete --zone "$zone" ${maybeQuiet:+"$maybeQuiet"} "${names[@]}"
  )
}
# gcloud_FigureRemoteUsername [instanceInfo]
# The remote username when ssh-ing into GCP instances tends to not be the same
# as the user's local username, but it needs to be discovered by ssh-ing into an
# instance and examining the system.
# On success the gcloud_username global variable is updated
# instanceInfo - an entry from the `instances` array
# example:
# gcloud_FigureRemoteUsername "name:zone:..."
gcloud_FigureRemoteUsername() {
  # Discovery costs an ssh round trip; skip it once the answer is known.
  if [[ -n ${gcloud_username:-} ]]; then
    return
  fi

  declare instanceInfo="$1"
  declare name zone publicIp
  IFS=: read -r name zone publicIp _ <<<"$instanceInfo"

  echo "Detecting remote username using $name in $zone:"

  # The probe runs in a subshell, so its output is handed back through a file.
  # mktemp avoids a predictable, clobberable name in /tmp.
  declare whoamiFile
  whoamiFile=$(mktemp) || {
    echo "Unable to create temporary file"
    exit 1
  }

  # Figure the gcp ssh username.  The subshell keeps `set -x` and
  # `set -o pipefail` from leaking into the caller.
  (
    set -x

    # Try to ping the machine first.  There can be a delay between when the
    # instance is reported as RUNNING and when it's reachable over the network
    timeout 30s bash -c "set -o pipefail; until ping -c 3 $publicIp | tr - _; do echo .; done"

    # Try to ssh in a couple times, sshd may not yet be up even though the
    # machine can be pinged...
    set -o pipefail
    for i in {1..10}; do
      # \$USER is escaped so that it expands on the remote side; tr strips
      # carriage returns and spaces from the ssh output.
      if gcloud compute ssh "$name" \
        --zone "$zone" -- "echo whoami:\$USER:iamwho" \
        | tr -d $'\r ' | tee "$whoamiFile"; then
        break
      fi
      sleep 1
      echo "Retry $i..."
    done
  )

  # Only a line of the exact form whoami:<user>:iamwho is trusted; other ssh
  # output (banners, warnings) is skipped and never reaches the global.
  declare whoami candidate iamwho
  while IFS=: read -r whoami candidate iamwho; do
    if [[ $whoami == "whoami" && $iamwho == "iamwho" ]]; then
      gcloud_username=$candidate
      break
    fi
  done <"$whoamiFile"
  rm -f "$whoamiFile"

  if [[ -z ${gcloud_username:-} ]]; then
    echo Unable to figure remote user name
    exit 1
  fi

  echo "Remote username: $gcloud_username"
}
# gcloud_PrepInstancesForSsh [username] [privateKey]
# Prepares all the instances in the `instances` array for ssh with the specified
# keypair. This eliminates the need to use the restrictive |gcloud compute ssh|,
# use plain |ssh| instead.
# username - gcp ssh username as computed by gcloud_FigureRemoteUsername
# privateKey - private key to install on all the instances
gcloud_PrepInstancesForSsh() {
  declare username="$1"
  declare privateKey="$2"
  declare publicKey="$privateKey".pub
  declare logDir=log/

  # Start from a clean set of per-instance logs.
  mkdir -p "$logDir"
  rm -rf "$logDir"/gcloud_PrepInstancesForSsh-*

  [[ -r $publicKey ]] || {
    echo "Unable to read public key: $publicKey"
    exit 1
  }

  [[ -r $privateKey ]] || {
    echo "Unable to read private key: $privateKey"
    exit 1
  }

  [[ -d $logDir ]] || {
    echo "logDir does not exist: $logDir"
    exit 1
  }

  declare -a pids=()
  declare instanceInfo name zone publicIp logFile pid
  for instanceInfo in "${instances[@]}"; do
    IFS=: read -r name zone publicIp _ <<<"$instanceInfo"

    # NOTE(review): the per-instance log name is an assumption chosen to match
    # the cleanup glob above - confirm against any log consumers.
    logFile="$logDir/gcloud_PrepInstancesForSsh-$name.log"

    # TODO: This next subshell runs in series because for unknown reason running
    #       multiple |gcloud compute ssh| commands in parallel cause the macOS
    #       terminal to misbehave
    (
      set -x

      # Try to ping the machine first.  There can be a delay between when the
      # instance is reported as RUNNING and when it's reachable over the network
      timeout 60s bash -c "set -o pipefail; until ping -c 3 $publicIp | tr - _; do echo .; done"

      # The public key is expanded locally and embedded in the remote command;
      # the remote ssh config makes id_testnet the default identity.
      gcloud compute ssh --zone "$zone" "$name" -- "
        set -x;
        mkdir -p .ssh;
        echo \"$(cat "$publicKey")\" >> .ssh/authorized_keys;
        echo \"
          Host *
          BatchMode yes
          IdentityFile ~/.ssh/id_testnet
          StrictHostKeyChecking no
        \" > .ssh/config;
      "
    ) >>"$logFile" 2>&1

    # Copying the private key is safe to run in parallel, so background it.
    (
      set -x
      scp \
        -o StrictHostKeyChecking=no \
        -o UserKnownHostsFile=/dev/null \
        -i "$privateKey" \
        "$privateKey" "$username@$publicIp:.ssh/id_testnet"
    ) >>"$logFile" 2>&1 &
    pid=$!

    # Index the log by pid so a failed job's output can be found below.  A
    # relative symlink target resolves against the link's own directory, so
    # point at the bare file name rather than at "$logDir/...".
    ln -sfT "${logFile##*/}" "$logDir/gcloud_PrepInstancesForSsh-$pid.log"
    pids+=("$pid")
  done

  # Reap every background copy; dump the log of the first failure and abort.
  for pid in "${pids[@]}"; do
    if ! wait "$pid"; then
      cat "$logDir/gcloud_PrepInstancesForSsh-$pid.log"
      echo ^^^ +++
      exit 1
    fi
  done
}