fix(ci): delete GCP resources, but keep some recent cached state images (#5082)

* Fix delete GCP resources commands

* Don't create a GCP credentials file

* Keep the latest 2 images

* Explain time

* Show the names of disks that are being deleted

* Actually run the image delete steps

* Only delete commit-based instance templates

* Document automated deletion
This commit is contained in:
teor 2022-09-06 12:51:46 +10:00 committed by GitHub
parent c081fd9873
commit fec012a006
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 105 additions and 21 deletions

View File

@ -1,10 +1,19 @@
name: Delete GCP resources
on:
# Run right before Teor's week starts (0500 in UTC+10)
schedule:
- cron: "0 0 1 * *"
- cron: "0 19 * * 0"
workflow_dispatch:
env:
# Delete all resources created before $DELETE_AGE_DAYS days ago.
DELETE_AGE_DAYS: 7
# But keep the latest $KEEP_LATEST_IMAGE_COUNT images of each type.
#
# TODO: reduce this to 1 or 2 after "The resource is not ready" errors get fixed?
KEEP_LATEST_IMAGE_COUNT: 3
jobs:
delete-resources:
name: Delete old GCP resources
@ -13,6 +22,10 @@ jobs:
contents: 'read'
id-token: 'write'
steps:
- uses: actions/checkout@v3.0.2
with:
persist-credentials: false
# Setup gcloud CLI
- name: Authenticate to Google Cloud
id: auth
@ -23,40 +36,91 @@ jobs:
service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
token_format: 'access_token'
# Deletes all the instance templates older than 30 days
# Deletes all the instance templates older than $DELETE_AGE_DAYS days.
- name: Delete old instance templates
run: |
# NOTE(review): this listing appears to show the lines removed and added by the commit
# interleaved without +/- markers, so the TEMPLATES assignment and the delete command each
# occur twice (a hard-coded 30-day variant and a $DELETE_AGE_DAYS variant) — confirm
# against the actual workflow file before relying on it.
TEMPLATES=$(gcloud compute instance-templates list --sort-by=creationTimestamp --filter="creationTimestamp < $(date --date='30 days ago' '+%Y%m%d')" --format='value(NAME)')
# Cutoff date in YYYYMMDD form, $DELETE_AGE_DAYS days before the current date.
DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
# Only list templates whose name ends in "-" followed by lowercase hex digits
# (a commit hash suffix) and that were created before the cutoff date.
TEMPLATES=$(gcloud compute instance-templates list --sort-by=creationTimestamp --filter="name~-[0-9a-f]+$ AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
for TEMPLATE in $TEMPLATES
do
# If a delete fails, `|| continue` moves on to the next template.
gcloud compute instance-templates delete ${TEMPLATE} --quiet || continue
gcloud compute instance-templates delete ${TEMPLATE} || continue
done
# Deletes cached images older than 90 days
# Deletes all the disks older than $DELETE_AGE_DAYS days.
#
# A search is done for each of these images:
# - Images created on Pull Requests older than 30 days
# - Images created on the `main` branch older than 60 days
# - Any other remaining image older than 90 days
# TODO: we should improve this approach and filter by disk type, and just keep the 2 latest images of each type (zebra checkpoint, zebra tip, lwd tip)
# Disks that are attached to an instance template can't be deleted, so it is safe to delete all disks here.
- name: Delete old disks
run: |
# Cutoff date in YYYYMMDD form, $DELETE_AGE_DAYS days before the current date.
DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
# Disks created by PR jobs, and other jobs that use a commit hash
# (name ends in "-" followed by lowercase hex digits, created before the cutoff date).
COMMIT_DISKS=$(gcloud compute disks list --sort-by=creationTimestamp --filter="name~-[0-9a-f]+$ AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
for DISK in $COMMIT_DISKS
do
# If a delete fails, `|| continue` moves on to the next disk.
# NOTE(review): --verbosity=info is presumably what makes the deleted disk names
# show up in the job log — confirm against the gcloud output.
gcloud compute disks delete --verbosity=info ${DISK} || continue
done
# Disks created by managed instance groups, and other jobs that start with "zebrad-"
# (name begins with "zebrad-", created before the cutoff date).
ZEBRAD_DISKS=$(gcloud compute disks list --sort-by=creationTimestamp --filter="name~^zebrad- AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
for DISK in $ZEBRAD_DISKS
do
gcloud compute disks delete --verbosity=info ${DISK} || continue
done
# Deletes cache images older than $DELETE_AGE_DAYS days.
#
# Keeps the latest $KEEP_LATEST_IMAGE_COUNT images of each type:
# - zebrad checkpoint cache
# - zebrad tip cache
# - lightwalletd + zebrad tip cache
#
# TODO: when we add testnet to the workflows, keep the latest $KEEP_LATEST_IMAGE_COUNT testnet images,
# and the latest $KEEP_LATEST_IMAGE_COUNT mainnet images.
- name: Delete old cache disks
run: |
# NOTE(review): this listing appears to show the lines removed and added by the commit
# interleaved without +/- markers. The *_OLD_CACHE_DISKS assignments, the `for DISK in ...`
# headers and the `gcloud compute image delete ${DISK}` commands look like the removed
# version; the *_IMAGES assignments and `for IMAGE in ...` loops look like the added
# version — confirm against the actual workflow file before relying on it.
#
# Each `for IMAGE` loop below follows the same pattern:
# - the list is sorted with `--sort-by=~creationTimestamp` (descending, so newest first),
# - the first $KEEP_LATEST_IMAGE_COUNT names in the list are skipped (kept),
# - every remaining name is deleted, moving on to the next one if a delete fails.
# The list only contains images created before $DELETE_BEFORE_DATE, so the kept images
# are the newest ones among those already older than the cutoff.
PR_OLD_CACHE_DISKS=$(gcloud compute images list --sort-by=creationTimestamp --filter="name~-cache-.+[0-9a-f]+-merge AND creationTimestamp < $(date --date='30 days ago' '+%Y%m%d')" --format='value(NAME)')
for DISK in $PR_OLD_CACHE_DISKS
# Cutoff date in YYYYMMDD form, $DELETE_AGE_DAYS days before the current date.
DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
# zebrad checkpoint cache images: names matching ^zebrad-cache-.*net-checkpoint
ZEBRAD_CHECKPOINT_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^zebrad-cache-.*net-checkpoint AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
# Number of images skipped (kept) so far in this list.
KEPT_IMAGES=0
for IMAGE in $ZEBRAD_CHECKPOINT_IMAGES
do
gcloud compute image delete ${DISK} --quiet || continue
if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
then
KEPT_IMAGES=$((KEPT_IMAGES+1))
echo "Keeping image $KEPT_IMAGES named $IMAGE"
continue
fi
gcloud compute images delete ${IMAGE} || continue
done
MAIN_OLD_CACHE_DISKS=$(gcloud compute images list --sort-by=creationTimestamp --filter="name~-cache-main AND creationTimestamp < $(date --date='60 days ago' '+%Y%m%d')" --format='value(NAME)')
for DISK in $MAIN_OLD_CACHE_DISKS
# zebrad tip cache images: names matching ^zebrad-cache-.*net-tip
ZEBRAD_TIP_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^zebrad-cache-.*net-tip AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
# Reset the kept counter for this image type.
KEPT_IMAGES=0
for IMAGE in $ZEBRAD_TIP_IMAGES
do
gcloud compute image delete ${DISK} --quiet || continue
if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
then
KEPT_IMAGES=$((KEPT_IMAGES+1))
echo "Keeping image $KEPT_IMAGES named $IMAGE"
continue
fi
gcloud compute images delete ${IMAGE} || continue
done
ALL_OLD_CACHE_DISKS=$(gcloud compute images list --sort-by=creationTimestamp --filter="name~-cache- AND creationTimestamp < $(date --date='90 days ago' '+%Y%m%d')" --format='value(NAME)')
for DISK in $ALL_OLD_CACHE_DISKS
# lightwalletd + zebrad tip cache images: names matching ^lwd-cache-.*net-tip
LWD_TIP_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^lwd-cache-.*net-tip AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
# Reset the kept counter for this image type.
KEPT_IMAGES=0
for IMAGE in $LWD_TIP_IMAGES
do
gcloud compute image delete ${DISK} --quiet || continue
if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
then
KEPT_IMAGES=$((KEPT_IMAGES+1))
echo "Keeping image $KEPT_IMAGES named $IMAGE"
continue
fi
gcloud compute images delete ${IMAGE} || continue
done

View File

@ -20,6 +20,26 @@ any branch and commit, as long as the state version is the same.
Zebra also does [a smaller set of tests](https://github.com/ZcashFoundation/zebra/blob/main/.github/workflows/continous-integration-os.yml) on tier 2 platforms using GitHub actions runners.
## Manually Using Google Cloud
Some Zebra developers have access to the Zcash Foundation's Google Cloud instance, which also runs our automatic CI.
Please shut down large instances when they are not being used.
### Automated Deletion
The [Delete GCP Resources](https://github.com/ZcashFoundation/zebra/blob/main/.github/workflows/delete-gcp-resources.yml)
workflow automatically deletes instance templates, disks, and images older than 1 week.
Running instances and their disks are protected from deletion.
If you want to keep instance templates, disks, or images in Google Cloud, name them so they don't match the automated names:
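- deleted instance templates and disks end in a commit hash, so use a name ending in `-` or `-[^0-9a-f]+`; disks whose names start with `zebrad-` are also deleted, so avoid that prefix for disks you want to keep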
- deleted images start with `zebrad-cache` or `lwd-cache`, so use a name starting with anything else
Our other Google Cloud projects don't have automated deletion, so you can also use them for experiments or production deployments.
## Troubleshooting
To improve CI performance, some Docker tests are stateful.