fix(ci): Wait 1 day before creating cached state image updates (#5088)

* Increase search range for sync height

* Update sync height regexes for zebrad and lwd cached states

* Add labels to cached state images

* Update deploy-gcp-tests.yml

* Don't create new cached states for lwd updates

* Add a missing line continuation

* Fix a comment

* Revert a mistaken comment change

* Clarify a TODO comment

* Partially revert to old docker height log handling

* Use an output for the cached disk name
This commit is contained in:
teor 2022-09-09 06:25:00 +10:00 committed by GitHub
parent c6fd7aa96d
commit a58b72c92b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 100 additions and 38 deletions

View File

@ -273,6 +273,7 @@ jobs:
uses: ./.github/workflows/deploy-gcp-tests.yml
if: ${{ !fromJSON(needs.get-available-disks.outputs.zebra_checkpoint_disk) || github.event.inputs.regenerate-disks == 'true' }}
with:
app_name: zebrad
test_id: sync-to-checkpoint
test_description: Test sync up to mandatory checkpoint
test_variables: '-e TEST_DISK_REBUILD=1 -e ZEBRA_FORCE_USE_COLOR=1'
@ -291,6 +292,7 @@ jobs:
uses: ./.github/workflows/deploy-gcp-tests.yml
if: ${{ !cancelled() && !failure() && github.event.inputs.regenerate-disks != 'true' && github.event.inputs.run-full-sync != 'true' }}
with:
app_name: zebrad
test_id: sync-past-checkpoint
test_description: Test full validation sync from a cached state
test_variables: '-e TEST_CHECKPOINT_SYNC=1 -e ZEBRA_FORCE_USE_COLOR=1'
@ -318,6 +320,7 @@ jobs:
# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#running-your-workflow-based-on-the-head-or-base-branch-of-a-pull-request-1
if: ${{ (github.event_name == 'push' && github.ref_name == 'main') || !fromJSON(needs.get-available-disks.outputs.zebra_tip_disk) || github.event.inputs.run-full-sync == 'true' }}
with:
app_name: zebrad
test_id: full-sync-to-tip
test_description: Test a full sync up to the tip
test_variables: '-e TEST_FULL_SYNC=1 -e ZEBRA_FORCE_USE_COLOR=1 -e FULL_SYNC_MAINNET_TIMEOUT_MINUTES=600'
@ -417,8 +420,8 @@ jobs:
test_variables: '-e TEST_LWD_UPDATE_SYNC=1 -e ZEBRA_TEST_LIGHTWALLETD=1 -e ZEBRA_FORCE_USE_COLOR=1 -e ZEBRA_CACHED_STATE_DIR=/var/cache/zebrad-cache -e LIGHTWALLETD_DATA_DIR=/var/cache/lwd-cache'
needs_zebra_state: true
needs_lwd_state: true
# update the disk on every PR, to increase CI speed
saves_to_disk: true
# since we do a full sync in every PR, the new cached state will only be a few minutes newer than the original one
saves_to_disk: false
disk_prefix: lwd-cache
disk_suffix: tip
root_state_path: '/var/cache'

View File

@ -78,7 +78,7 @@ on:
required: false
type: string
default: 'zebra'
description: 'Application name for Google Cloud instance metadata'
description: 'Application name, used to work out when a job is an update job'
env:
# where we get the Docker image from
@ -94,6 +94,9 @@ env:
# but we don't know how long it will be between jobs.
# 200 lines is about 6-15 minutes of sync logs, or one panic log.
EXTRA_LOG_LINES: 200
# How many blocks to wait before creating an updated cached state image.
# 1 day is approximately 1152 blocks.
CACHED_STATE_UPDATE_LIMIT: 1152
jobs:
# set up the test, if it doesn't use any cached state
@ -228,6 +231,8 @@ jobs:
name: Setup ${{ inputs.test_id }} test
if: ${{ inputs.needs_zebra_state }}
runs-on: ubuntu-latest
outputs:
cached_disk_name: ${{ steps.get-disk-name.outputs.cached_disk_name }}
permissions:
contents: 'read'
id-token: 'write'
@ -340,6 +345,7 @@ jobs:
fi
echo "Selected Disk: $CACHED_DISK_NAME"
echo "::set-output name=cached_disk_name::$CACHED_DISK_NAME"
echo "STATE_VERSION=$LOCAL_STATE_VERSION" >> $GITHUB_ENV
echo "CACHED_DISK_NAME=$CACHED_DISK_NAME" >> $GITHUB_ENV
@ -1065,7 +1071,7 @@ jobs:
create-state-image:
name: Create ${{ inputs.test_id }} cached state image
runs-on: ubuntu-latest
needs: [ test-result ]
needs: [ test-result, setup-with-cached-state ]
# We run exactly one of without-cached-state or with-cached-state, and we always skip the other one.
# Normally, if a job is skipped, all the jobs that depend on it are also skipped.
# So we need to override the default success() check to make this job run.
@ -1120,31 +1126,8 @@ jobs:
echo "STATE_VERSION: $LOCAL_STATE_VERSION"
echo "STATE_VERSION=$LOCAL_STATE_VERSION" >> $GITHUB_ENV
# Get the sync height from the test logs, which is later used as part of the
# disk description.
#
# The regex used to grep the sync height is provided by ${{ inputs.height_grep_text }},
# this allows to dynamically change the height as needed by different situations or
# based on the logs output from different tests
#
# Passes the sync height to subsequent steps using $SYNC_HEIGHT env variable
- name: Get sync height from logs
run: |
SYNC_HEIGHT=""
DOCKER_LOGS=$(\
gcloud compute ssh \
${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--command="docker logs ${{ inputs.test_id }} --tail 200")
SYNC_HEIGHT=$(echo $DOCKER_LOGS | grep -oE '${{ inputs.height_grep_text }}[0-9]+' | grep -oE '[0-9]+' | tail -1 || [[ $? == 1 ]])
echo "SYNC_HEIGHT=$SYNC_HEIGHT" >> $GITHUB_ENV
# Sets the $UPDATE_SUFFIX env var to "-u" if using cached state,
# Sets the $UPDATE_SUFFIX env var to "-u" if updating a previous cached state,
# and the empty string otherwise.
#
# Also sets a unique date and time suffix $TIME_SUFFIX.
@ -1152,21 +1135,92 @@ jobs:
run: |
UPDATE_SUFFIX=""
if [[ "${{ inputs.needs_zebra_state }}" == "true" ]]; then
if [[ "${{ inputs.needs_zebra_state }}" == "true" ]] && [[ "${{ inputs.app_name }}" == "zebrad" ]]; then
UPDATE_SUFFIX="-u"
fi
# TODO: find a better logic for the lwd-full-sync case
if [[ "${{ inputs.needs_lwd_state }}" == "true" ]] && [[ "${{ inputs.app_name }}" == "lightwalletd" ]] && [[ "${{ inputs.test_id }}" != 'lwd-full-sync' ]]; then
UPDATE_SUFFIX="-u"
fi
# We're going to delete old images after a month, so we don't need the year here
TIME_SUFFIX=$(date '+%m%d%H%M%S' --utc)
echo "UPDATE_SUFFIX=$UPDATE_SUFFIX" >> $GITHUB_ENV
echo "TIME_SUFFIX=$TIME_SUFFIX" >> $GITHUB_ENV
# Get the sync height from the test logs, which is later used as part of the
# disk description and labels.
#
# The regex used to grep the sync height is provided by ${{ inputs.height_grep_text }},
# this allows to dynamically change the height as needed by different situations or
# based on the logs output from different tests.
#
# If the sync height is missing from the logs, the job fails.
#
# Passes the sync height to subsequent steps using $SYNC_HEIGHT env variable.
- name: Get sync height from logs
run: |
SYNC_HEIGHT=""
DOCKER_LOGS=$( \
gcloud compute ssh \
${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--command=" \
docker logs ${{ inputs.test_id }} --tail 200 \
")
SYNC_HEIGHT=$( \
echo "$DOCKER_LOGS" | \
grep --extended-regexp --only-matching '${{ inputs.height_grep_text }}[0-9]+' | \
grep --extended-regexp --only-matching '[0-9]+' | \
tail -1 || \
[[ $? == 1 ]] \
)
if [[ -z "$SYNC_HEIGHT" ]]; then
echo "Missing sync height in logs: $SYNC_HEIGHT"
# Fail the tests, because Zebra and lightwalletd didn't log their sync heights,
# or the CI workflow sync height regex is wrong.
false
fi
echo "Found sync height in logs: $SYNC_HEIGHT"
echo "SYNC_HEIGHT=$SYNC_HEIGHT" >> $GITHUB_ENV
# Get the original cached state height from google cloud.
#
# If the height is missing from the image labels, uses zero instead.
#
# TODO: fail the job if needs_zebra_state but the height is missing
# we can make this change after all the old images have been deleted, this should happen around 15 September 2022
# we'll also need to do a manual checkpoint rebuild before opening the PR for this change
#
# Passes the original height to subsequent steps using $ORIGINAL_HEIGHT env variable.
- name: Get original cached state height from google cloud
run: |
ORIGINAL_HEIGHT="0"
if [[ -n "${{ format('{0}', needs.setup-with-cached-state.outputs.cached_disk_name) }}" ]]; then
ORIGINAL_HEIGHT=$(gcloud compute images list --filter="name=${{ needs.setup-with-cached-state.outputs.cached_disk_name }}" --format="value(labels.height)")
ORIGINAL_HEIGHT=${ORIGINAL_HEIGHT:-0}
echo "$CACHED_DISK_NAME height: $ORIGINAL_HEIGHT"
fi
echo "ORIGINAL_HEIGHT=$ORIGINAL_HEIGHT" >> $GITHUB_ENV
# Create an image from the state disk, which will be used for any tests that start
# after it is created. These tests can be in the same workflow, or in a different PR.
#
# Using the newest image makes future jobs faster, because it is closer to the chain tip.
#
# Skips creating updated images if the original image is less than $CACHED_STATE_UPDATE_LIMIT behind the current tip.
# Full sync images are always created.
#
# The image can contain:
# - Zebra cached state, or
# - Zebra + lightwalletd cached state.
@ -1189,14 +1243,19 @@ jobs:
# used by the container.
- name: Create image from state disk
run: |
gcloud compute images create \
"${{ inputs.disk_prefix }}-${SHORT_GITHUB_REF}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${{ env.NETWORK }}-${{ inputs.disk_suffix }}${UPDATE_SUFFIX}-${TIME_SUFFIX}" \
--force \
--source-disk=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \
--source-disk-zone=${{ env.ZONE }} \
--storage-location=us \
--description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }}" \
--labels="height=${{ env.SYNC_HEIGHT }},purpose=${{ inputs.disk_prefix }},commit=${{ env.GITHUB_SHA_SHORT }},state-version=${{ env.STATE_VERSION }},network=${{ env.NETWORK }},target-height=${{ inputs.disk_suffix }},update-flag=${UPDATE_SUFFIX},test-id=${{ inputs.test_id }},app-name=${{ inputs.app_name }}"
MINIMUM_UPDATE_HEIGHT=$((ORIGINAL_HEIGHT+CACHED_STATE_UPDATE_LIMIT))
if [[ -z "$UPDATE_SUFFIX" ]] || [[ "$SYNC_HEIGHT" -gt "$MINIMUM_UPDATE_HEIGHT" ]]; then
gcloud compute images create \
"${{ inputs.disk_prefix }}-${SHORT_GITHUB_REF}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${{ env.NETWORK }}-${{ inputs.disk_suffix }}${UPDATE_SUFFIX}-${TIME_SUFFIX}" \
--force \
--source-disk=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \
--source-disk-zone=${{ env.ZONE }} \
--storage-location=us \
--description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }}" \
--labels="height=${{ env.SYNC_HEIGHT }},purpose=${{ inputs.disk_prefix }},commit=${{ env.GITHUB_SHA_SHORT }},state-version=${{ env.STATE_VERSION }},network=${{ env.NETWORK }},target-height-kind=${{ inputs.disk_suffix }},update-flag=${UPDATE_SUFFIX},updated-from-height=${ORIGINAL_HEIGHT},test-id=${{ inputs.test_id }},app-name=${{ inputs.app_name }}"
else
echo "Skipped cached state update because the new sync height $SYNC_HEIGHT was less than $CACHED_STATE_UPDATE_LIMIT blocks above the original height $ORIGINAL_HEIGHT"
fi
# delete the Google Cloud instance for this test
delete-instance: