fix(ci): delete GCP resources, but keep some recent cached state images (#5082)
* Fix delete GCP resources commands
* Don't create a GCP credentials file
* Keep the latest 2 images
* Explain time
* Show the names of disks that are being deleted
* Actually run the image delete steps
* Only delete commit-based instance templates
* Document automated deletion
This commit is contained in:
parent
c081fd9873
commit
fec012a006
|
@ -1,10 +1,19 @@
|
|||
name: Delete GCP resources
|
||||
|
||||
on:
|
||||
# Run right before Teor's week starts (0500 in UTC+10)
|
||||
schedule:
|
||||
- cron: "0 0 1 * *"
|
||||
- cron: "0 19 * * 0"
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
# Delete all resources created before $DELETE_AGE_DAYS days ago.
|
||||
DELETE_AGE_DAYS: 7
|
||||
# But keep the latest $KEEP_LATEST_IMAGE_COUNT images of each type.
|
||||
#
|
||||
# TODO: reduce this to 1 or 2 after "The resource is not ready" errors get fixed?
|
||||
KEEP_LATEST_IMAGE_COUNT: 3
|
||||
|
||||
jobs:
|
||||
delete-resources:
|
||||
name: Delete old GCP resources
|
||||
|
@ -13,6 +22,10 @@ jobs:
|
|||
contents: 'read'
|
||||
id-token: 'write'
|
||||
steps:
|
||||
- uses: actions/checkout@v3.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
# Setup gcloud CLI
|
||||
- name: Authenticate to Google Cloud
|
||||
id: auth
|
||||
|
@ -23,40 +36,91 @@ jobs:
|
|||
service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
|
||||
token_format: 'access_token'
|
||||
|
||||
# Deletes all the instances template older than 30 days
|
||||
# Deletes all the instance templates older than $DELETE_AGE_DAYS days.
|
||||
- name: Delete old instance templates
|
||||
run: |
|
||||
TEMPLATES=$(gcloud compute instance-templates list --sort-by=creationTimestamp --filter="creationTimestamp < $(date --date='30 days ago' '+%Y%m%d')" --format='value(NAME)')
|
||||
DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
|
||||
TEMPLATES=$(gcloud compute instance-templates list --sort-by=creationTimestamp --filter="name~-[0-9a-f]+$ AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
|
||||
|
||||
for TEMPLATE in $TEMPLATES
|
||||
do
|
||||
gcloud compute instance-templates delete ${TEMPLATE} --quiet || continue
|
||||
gcloud compute instance-templates delete ${TEMPLATE} || continue
|
||||
done
|
||||
|
||||
# Deletes cached images older than 90 days
|
||||
# Deletes all the disks older than $DELETE_AGE_DAYS days.
|
||||
#
|
||||
# A search is done for each of these images:
|
||||
# - Images created on Pull Requests older than 30 days
|
||||
# - Images created on the `main` branch older than 60 days
|
||||
# - Any other remaining image older than 90 days
|
||||
# TODO: we should improve this approach and filter by disk type, and just keep the 2 latest images of each type (zebra checkpoint, zebra tip, lwd tip)
|
||||
# Disks that are attached to an instance template can't be deleted, so it is safe to delete all disks here.
|
||||
- name: Delete old disks
|
||||
run: |
|
||||
DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
|
||||
|
||||
# Disks created by PR jobs, and other jobs that use a commit hash
|
||||
COMMIT_DISKS=$(gcloud compute disks list --sort-by=creationTimestamp --filter="name~-[0-9a-f]+$ AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
|
||||
|
||||
for DISK in $COMMIT_DISKS
|
||||
do
|
||||
gcloud compute disks delete --verbosity=info ${DISK} || continue
|
||||
done
|
||||
|
||||
# Disks created by managed instance groups, and other jobs that start with "zebrad-"
|
||||
ZEBRAD_DISKS=$(gcloud compute disks list --sort-by=creationTimestamp --filter="name~^zebrad- AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
|
||||
|
||||
for DISK in $ZEBRAD_DISKS
|
||||
do
|
||||
gcloud compute disks delete --verbosity=info ${DISK} || continue
|
||||
done
|
||||
|
||||
# Deletes cache images older than $DELETE_AGE_DAYS days.
|
||||
#
|
||||
# Keeps the latest $KEEP_LATEST_IMAGE_COUNT images of each type:
|
||||
# - zebrad checkpoint cache
|
||||
# - zebrad tip cache
|
||||
# - lightwalletd + zebrad tip cache
|
||||
#
|
||||
# TODO: when we add testnet to the workflows, keep the latest $KEEP_LATEST_IMAGE_COUNT testnet images,
|
||||
# and the latest $KEEP_LATEST_IMAGE_COUNT mainnet images.
|
||||
- name: Delete old cache disks
|
||||
run: |
|
||||
PR_OLD_CACHE_DISKS=$(gcloud compute images list --sort-by=creationTimestamp --filter="name~-cache-.+[0-9a-f]+-merge AND creationTimestamp < $(date --date='30 days ago' '+%Y%m%d')" --format='value(NAME)')
|
||||
for DISK in $PR_OLD_CACHE_DISKS
|
||||
DELETE_BEFORE_DATE=$(date --date="$DELETE_AGE_DAYS days ago" '+%Y%m%d')
|
||||
|
||||
ZEBRAD_CHECKPOINT_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^zebrad-cache-.*net-checkpoint AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
|
||||
KEPT_IMAGES=0
|
||||
for IMAGE in $ZEBRAD_CHECKPOINT_IMAGES
|
||||
do
|
||||
gcloud compute image delete ${DISK} --quiet || continue
|
||||
if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
|
||||
then
|
||||
KEPT_IMAGES=$((KEPT_IMAGES+1))
|
||||
echo "Keeping image $KEPT_IMAGES named $IMAGE"
|
||||
continue
|
||||
fi
|
||||
|
||||
gcloud compute images delete ${IMAGE} || continue
|
||||
done
|
||||
|
||||
MAIN_OLD_CACHE_DISKS=$(gcloud compute images list --sort-by=creationTimestamp --filter="name~-cache-main AND creationTimestamp < $(date --date='60 days ago' '+%Y%m%d')" --format='value(NAME)')
|
||||
for DISK in $MAIN_OLD_CACHE_DISKS
|
||||
ZEBRAD_TIP_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^zebrad-cache-.*net-tip AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
|
||||
KEPT_IMAGES=0
|
||||
for IMAGE in $ZEBRAD_TIP_IMAGES
|
||||
do
|
||||
gcloud compute image delete ${DISK} --quiet || continue
|
||||
if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
|
||||
then
|
||||
KEPT_IMAGES=$((KEPT_IMAGES+1))
|
||||
echo "Keeping image $KEPT_IMAGES named $IMAGE"
|
||||
continue
|
||||
fi
|
||||
|
||||
gcloud compute images delete ${IMAGE} || continue
|
||||
done
|
||||
|
||||
|
||||
ALL_OLD_CACHE_DISKS=$(gcloud compute images list --sort-by=creationTimestamp --filter="name~-cache- AND creationTimestamp < $(date --date='90 days ago' '+%Y%m%d')" --format='value(NAME)')
|
||||
for DISK in $ALL_OLD_CACHE_DISKS
|
||||
LWD_TIP_IMAGES=$(gcloud compute images list --sort-by=~creationTimestamp --filter="name~^lwd-cache-.*net-tip AND creationTimestamp < $DELETE_BEFORE_DATE" --format='value(NAME)')
|
||||
KEPT_IMAGES=0
|
||||
for IMAGE in $LWD_TIP_IMAGES
|
||||
do
|
||||
gcloud compute image delete ${DISK} --quiet || continue
|
||||
if [[ "$KEPT_IMAGES" -lt "$KEEP_LATEST_IMAGE_COUNT" ]];
|
||||
then
|
||||
KEPT_IMAGES=$((KEPT_IMAGES+1))
|
||||
echo "Keeping image $KEPT_IMAGES named $IMAGE"
|
||||
continue
|
||||
fi
|
||||
|
||||
gcloud compute images delete ${IMAGE} || continue
|
||||
done
|
||||
|
|
|
@ -20,6 +20,26 @@ any branch and commit, as long as the state version is the same.
|
|||
Zebra also does [a smaller set of tests](https://github.com/ZcashFoundation/zebra/blob/main/.github/workflows/continous-integration-os.yml) on tier 2 platforms using GitHub actions runners.
|
||||
|
||||
|
||||
## Manually Using Google Cloud
|
||||
|
||||
Some Zebra developers have access to the Zcash Foundation's Google Cloud instance, which also runs our automatic CI.
|
||||
|
||||
Please shut down large instances when they are not being used.
|
||||
|
||||
### Automated Deletion
|
||||
|
||||
The [Delete GCP Resources](https://github.com/ZcashFoundation/zebra/blob/main/.github/workflows/delete-gcp-resources.yml)
|
||||
workflow automatically deletes instance templates, disks, and images older than 1 week, except for the most recent cached state images of each type, which are kept.
|
||||
|
||||
Running instances and their disks are protected from deletion.
|
||||
|
||||
If you want to keep instance templates, disks, or images in Google Cloud, name them so they don't match the automated names:
|
||||
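- deleted instance templates and disks end in a commit hash, so use a name ending in `-` or `-[^0-9a-f]+`; disks whose names start with `zebrad-` are also deleted, so avoid that prefix for disks you want to keep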
|
||||
- deleted images start with `zebrad-cache` or `lwd-cache`, so use a name starting with anything else
|
||||
|
||||
Our other Google Cloud projects don't have automated deletion, so you can also use them for experiments or production deployments.
|
||||
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
To improve CI performance, some Docker tests are stateful.
|
||||
|
|
Loading…
Reference in New Issue