zebra/.github/workflows/delete-gcp-resources.yml

274 lines
11 KiB
YAML

# TODO: rename this action name and filename to Delete infra resources
name: Delete GCP resources
on:
# Run daily, when most devs aren't working
# 0700 UTC is after AEST working hours but before ET working hours
schedule:
- cron: "0 7 * * *"
workflow_dispatch:
env:
# Delete all resources created before $DELETE_INSTANCE_DAYS days ago.
# We keep this short to reduce CPU, RAM, and storage costs.
DELETE_INSTANCE_DAYS: 3
# Delete all other resources created before $DELETE_AGE_DAYS days ago.
# We keep this short to reduce storage costs.
DELETE_AGE_DAYS: 2
# But keep the latest $KEEP_LATEST_IMAGE_COUNT images of each type.
# We keep this small to reduce storage costs.
KEEP_LATEST_IMAGE_COUNT: 2
# Delete all artifacts in registry created before $DELETE_IMAGE_HOURS hours ago.
# We keep this long enough for PRs that are still on the same commit can re-run with the same image.
DELETE_IMAGE_HOURS: 504h # 21 days
jobs:
delete-resources:
name: Delete old GCP resources
runs-on: ubuntu-latest
permissions:
contents: 'read'
id-token: 'write'
steps:
- uses: actions/checkout@v3.5.3
with:
persist-credentials: false
# Setup gcloud CLI
- name: Authenticate to Google Cloud
id: auth
uses: google-github-actions/auth@v1.1.1
with:
retries: '3'
workload_identity_provider: '${{ vars.GCP_WIF }}'
service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'
- name: Set up Cloud SDK
uses: google-github-actions/setup-gcloud@v1.1.1
# Deletes all mainnet and testnet instances older than $DELETE_INSTANCE_DAYS days.
#
# We only delete instances that end in 7 or more hex characters,
# to avoid deleting managed instance groups and manually created instances.
#
# ${INSTANCE_AND_ZONE} expands to:
# <instance-name> --zone=<zone-name>
# so it can't be shell-quoted.
- name: Delete old instances
run: |
DELETE_BEFORE_DATE=$(date --date="$DELETE_INSTANCE_DAYS days ago" '+%Y%m%d')
IFS=$'\n'
INSTANCES=$(gcloud compute instances list --sort-by=creationTimestamp --filter="name~-[0-9a-f]{7,}$ AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME,ZONE)' | \
sed 's/\(.*\)\t\(.*\)/\1 --zone=\2/')
for INSTANCE_AND_ZONE in $INSTANCES
do
IFS=$' '
gcloud compute instances delete --verbosity=info ${INSTANCE_AND_ZONE} --delete-disks=all || continue
IFS=$'\n'
done
# Deletes all the instance templates older than $DELETE_AGE_DAYS days.
- name: Delete old instance templates
run: |
DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
TEMPLATES=$(gcloud compute instance-templates list --sort-by=creationTimestamp --filter="name~-[0-9a-f]{7,}$ AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
for TEMPLATE in $TEMPLATES
do
gcloud compute instance-templates delete "${TEMPLATE}" || continue
done
# Deletes all mainnet and testnet disks older than $DELETE_AGE_DAYS days.
#
# Disks that are attached to an instance template can't be deleted, so it is safe to try to delete all disks here.
#
# ${DISK_AND_LOCATION} expands to:
# <disk-name> --[zone|region]=<location-name>
# so it can't be shell-quoted.
- name: Delete old disks
run: |
DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
IFS=$'\n'
# Disks created by PR jobs, and other jobs that use a commit hash
COMMIT_DISKS=$(gcloud compute disks list --sort-by=creationTimestamp --filter="name~-[0-9a-f]{7,}$ AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME,LOCATION,LOCATION_SCOPE)' | \
sed 's/\(.*\)\t\(.*\)\t\(.*\)/\1 --\3=\2/')
for DISK_AND_LOCATION in $COMMIT_DISKS
do
IFS=$' '
gcloud compute disks delete --verbosity=info ${DISK_AND_LOCATION} || continue
IFS=$'\n'
done
IFS=$'\n'
# Disks created by managed instance groups, and other jobs that start with "zebrad-"
ZEBRAD_DISKS=$(gcloud compute disks list --sort-by=creationTimestamp --filter="name~^zebrad- AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME,LOCATION,LOCATION_SCOPE)' | \
sed 's/\(.*\)\t\(.*\)\t\(.*\)/\1 --\3=\2/')
for DISK_AND_LOCATION in $ZEBRAD_DISKS
do
IFS=$' '
gcloud compute disks delete --verbosity=info ${DISK_AND_LOCATION} || continue
IFS=$'\n'
done
# Deletes mainnet and testnet cache images older than $DELETE_AGE_DAYS days.
#
# Keeps all images younger than $DELETE_AGE_DAYS.
# Also keeps $KEEP_LATEST_IMAGE_COUNT older images of each type, for each network:
# - zebrad checkpoint cache
# - zebrad tip cache
# - lightwalletd + zebrad tip cache
#
# TODO:
# - refactor out repeated shell script code
- name: Delete old cache disks
run: |
DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
# As of April 2023, these disk names look like:
# zebrad-cache-6039-merge-62c8ecc-v25-mainnet-checkpoint-053559
#
# Mainnet zebrad checkpoint
ZEBRAD_MAINNET_CHECKPOINT_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^zebrad-cache-.*-mainnet-checkpoint AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
KEPT_IMAGES=0
for IMAGE in $ZEBRAD_MAINNET_CHECKPOINT_IMAGES
do
if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
then
KEPT_IMAGES=$((KEPT_IMAGES+1))
echo "Keeping image $KEPT_IMAGES named $IMAGE"
continue
fi
gcloud compute images delete "${IMAGE}" || continue
done
# Testnet zebrad checkpoint
ZEBRAD_TESTNET_CHECKPOINT_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^zebrad-cache-.*-testnet-checkpoint AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
KEPT_IMAGES=0
for IMAGE in $ZEBRAD_TESTNET_CHECKPOINT_IMAGES
do
if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
then
KEPT_IMAGES=$((KEPT_IMAGES+1))
echo "Keeping image $KEPT_IMAGES named $IMAGE"
continue
fi
gcloud compute images delete "${IMAGE}" || continue
done
# As of April 2023, these disk names look like:
# zebrad-cache-6556-merge-a2ca4de-v25-mainnet-tip(-u)?-140654
#
# Mainnet zebrad tip
ZEBRAD_MAINNET_TIP_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^zebrad-cache-.*-mainnet-tip AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
KEPT_IMAGES=0
for IMAGE in $ZEBRAD_MAINNET_TIP_IMAGES
do
if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
then
KEPT_IMAGES=$((KEPT_IMAGES+1))
echo "Keeping image $KEPT_IMAGES named $IMAGE"
continue
fi
gcloud compute images delete "${IMAGE}" || continue
done
# Testnet zebrad tip
ZEBRAD_TESTNET_TIP_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^zebrad-cache-.*-testnet-tip AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
KEPT_IMAGES=0
for IMAGE in $ZEBRAD_TESTNET_TIP_IMAGES
do
if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
then
KEPT_IMAGES=$((KEPT_IMAGES+1))
echo "Keeping image $KEPT_IMAGES named $IMAGE"
continue
fi
gcloud compute images delete "${IMAGE}" || continue
done
# As of April 2023, these disk names look like:
# lwd-cache-main-fb3fec0-v25-mainnet-tip(-u)?-061314
#
# Mainnet lightwalletd tip
LWD_MAINNET_TIP_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^lwd-cache-.*-mainnet-tip AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
KEPT_IMAGES=0
for IMAGE in $LWD_MAINNET_TIP_IMAGES
do
if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
then
KEPT_IMAGES=$((KEPT_IMAGES+1))
echo "Keeping image $KEPT_IMAGES named $IMAGE"
continue
fi
gcloud compute images delete "${IMAGE}" || continue
done
# Testnet lightwalletd tip
LWD_TESTNET_TIP_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^lwd-cache-.*-testnet-tip AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
KEPT_IMAGES=0
for IMAGE in $LWD_TESTNET_TIP_IMAGES
do
if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
then
KEPT_IMAGES=$((KEPT_IMAGES+1))
echo "Keeping image $KEPT_IMAGES named $IMAGE"
continue
fi
gcloud compute images delete "${IMAGE}" || continue
done
# We're using a generic approach here, which allows multiple registries to be included,
# even those not related to GCP. Enough reason to create a separate job.
#
# The same artifacts are used for both mainnet and testnet.
clean-registries:
name: Delete unused artifacts in registry
runs-on: ubuntu-latest
permissions:
contents: 'read'
id-token: 'write'
steps:
- uses: actions/checkout@v3.5.3
with:
persist-credentials: false
# Setup gcloud CLI
- name: Authenticate to Google Cloud
id: auth
uses: google-github-actions/auth@v1.1.1
with:
retries: '3'
workload_identity_provider: '${{ vars.GCP_WIF }}'
service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'
token_format: 'access_token'
- name: Login to Google Artifact Registry
uses: docker/login-action@v2.2.0
with:
registry: us-docker.pkg.dev
username: oauth2accesstoken
password: ${{ steps.auth.outputs.access_token }}
# Deletes all images older than $DELETE_IMAGE_HOURS days.
- uses: 'docker://us-docker.pkg.dev/gcr-cleaner/gcr-cleaner/gcr-cleaner-cli'
continue-on-error: true # TODO: remove after fixig https://github.com/ZcashFoundation/zebra/issues/5933
# Refer to the official documentation to understand available arguments:
# https://github.com/GoogleCloudPlatform/gcr-cleaner
with:
args: >-
-repo=us-docker.pkg.dev/${{ vars.GCP_PROJECT }}/zebra/zebrad-test
-repo=us-docker.pkg.dev/${{ vars.GCP_PROJECT }}/zebra/lightwalletd
-grace=${{ env.DELETE_IMAGE_HOURS }}
-keep=${{ env.KEEP_LATEST_IMAGE_COUNT }}