fix(ci): Write cached state images after update syncs, and use the latest image from any commit (#4949)

* Save cached state on full syncs and updates

* Add an -update suffix to CI images created by updating cached state

* Make disk image names unique by adding a time suffix

* Use the latest image from any branch, but prefer the current commit if available

* Document Zebra's continuous integration tests

* Fix typos in environment variable names

* Expand documentation

* Fix variable name typo

* Fix shell syntax
Author: teor, 2022-08-25 23:09:20 +10:00 (committed by GitHub)
parent 7fc3cdd2b2
commit 0a39011b88
3 changed files with 88 additions and 22 deletions

File: .github/workflows/continous-integration-docker.yml

@@ -330,11 +330,10 @@ jobs:
       test_description: Test syncing to tip with a Zebra tip state
       test_variables: '-e TEST_UPDATE_SYNC=1 -e ZEBRA_FORCE_USE_COLOR=1 -e ZEBRA_CACHED_STATE_DIR=/var/cache/zebrad-cache'
       needs_zebra_state: true
-      # TODO: do we want to update the disk on every PR, to increase CI speed?
-      saves_to_disk: false
+      # update the disk on every PR, to increase CI speed
+      saves_to_disk: true
       disk_suffix: tip
       root_state_path: '/var/cache'
-      # TODO: do we also want to test the `zebrad` part of the `lwd-cache`? (But not update it.)
       zebra_state_dir: 'zebrad-cache'
   # Test that Zebra can answer a synthetic RPC call, using a cached Zebra tip state
@@ -403,7 +402,6 @@ jobs:
     # to also run on Mergify head branches,
     # add `|| (github.event_name == 'push' && startsWith(github.head_ref, 'mergify/merge-queue/'))`:
     # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#running-your-workflow-based-on-the-head-or-base-branch-of-a-pull-request-1
-    # TODO: this test is unreliable, in the meanwhile we'll only generate a new lwd cached state when a full sync is also triggered
     if: ${{ (!cancelled() && !failure() && github.event.inputs.regenerate-disks != 'true' && github.event.inputs.run-full-sync != 'true') || !fromJSON(needs.get-available-disks.outputs.lwd_tip_disk) }}
     with:
       app_name: lightwalletd
@@ -438,8 +436,8 @@ jobs:
       test_variables: '-e TEST_LWD_UPDATE_SYNC=1 -e ZEBRA_TEST_LIGHTWALLETD=1 -e ZEBRA_FORCE_USE_COLOR=1 -e ZEBRA_CACHED_STATE_DIR=/var/cache/zebrad-cache -e LIGHTWALLETD_DATA_DIR=/var/cache/lwd-cache'
       needs_zebra_state: true
       needs_lwd_state: true
-      # TODO: do we want to update the disk on every PR, to increase CI speed?
-      saves_to_disk: false
+      # update the disk on every PR, to increase CI speed
+      saves_to_disk: true
       disk_prefix: lwd-cache
       disk_suffix: tip
       root_state_path: '/var/cache'

File: reusable Google Cloud test workflow

@@ -59,6 +59,12 @@ on:
         required: false
         type: boolean
         description: 'Does the test use Lightwalletd and Zebra cached state?'
+      # main branch states can be outdated and slower, but they can also be more reliable
+      prefer_main_cached_state:
+        required: false
+        type: boolean
+        default: false
+        description: 'Does the test prefer to use a main branch cached state?'
       saves_to_disk:
         required: true
         type: boolean
@@ -259,7 +265,10 @@ jobs:
       # - To ${{ inputs.zebra_state_dir || inputs.disk_prefix }} if not
       #
       # If there are multiple disks:
-      # - prefer images generated from this branch, then the `main` branch, then any other branch
+      # - prefer images generated from this branch and commit, then
+      # - if prefer_main_cached_state is true, prefer images from the `main` branch, then
+      # - use images from any other branch.
+      # Within each of these categories:
       # - prefer newer images to older images
       #
       # Passes the disk name to subsequent steps using $CACHED_DISK_NAME env variable
@@ -278,31 +287,46 @@ jobs:
           # Try to find an image generated from this branch and commit
           # Fields are listed in the "Create image from state disk" step
-          BRANCH_DISK_NAME="${DISK_PREFIX}-${GITHUB_REF_SLUG_URL}-${GITHUB_SHA_SHORT}-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}"
-          CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${BRANCH_DISK_NAME}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
-          echo "${GITHUB_REF_SLUG_URL}-${GITHUB_SHA_SHORT} Disk: $CACHED_DISK_NAME"
-          if [[ -z "$CACHED_DISK_NAME" ]]; then
-            # Try to find an image generated from the main branch
-            CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${DISK_PREFIX}-main-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
-            echo "main Disk: $CACHED_DISK_NAME"
+          COMMIT_DISK_PREFIX="${DISK_PREFIX}-${GITHUB_REF_SLUG_URL}-${GITHUB_SHA_SHORT}-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}"
+          COMMIT_CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${COMMIT_DISK_PREFIX}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
+          echo "${GITHUB_REF_SLUG_URL}-${GITHUB_SHA_SHORT} Disk: $COMMIT_CACHED_DISK_NAME"
+          if [[ -n "$COMMIT_CACHED_DISK_NAME" ]]; then
+            echo "Description: $(gcloud compute images describe $COMMIT_CACHED_DISK_NAME --format='value(DESCRIPTION)')"
+          fi
+          # Try to find an image generated from the main branch
+          MAIN_CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${DISK_PREFIX}-main-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
+          echo "main Disk: $MAIN_CACHED_DISK_NAME"
+          if [[ -n "$MAIN_CACHED_DISK_NAME" ]]; then
+            echo "Description: $(gcloud compute images describe $MAIN_CACHED_DISK_NAME --format='value(DESCRIPTION)')"
+          fi
+          # Try to find an image generated from any other branch
+          ANY_CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${DISK_PREFIX}-.+-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
+          echo "any branch Disk: $ANY_CACHED_DISK_NAME"
+          if [[ -n "$ANY_CACHED_DISK_NAME" ]]; then
+            echo "Description: $(gcloud compute images describe $ANY_CACHED_DISK_NAME --format='value(DESCRIPTION)')"
+          fi
+          # Select a cached disk based on the job settings
+          CACHED_DISK_NAME="$COMMIT_CACHED_DISK_NAME"
+          if [[ -z "$CACHED_DISK_NAME" ]] && [[ "${{ inputs.prefer_main_cached_state }}" == "true" ]]; then
+            echo "Preferring main branch cached state to other branches..."
+            CACHED_DISK_NAME="$MAIN_CACHED_DISK_NAME"
           fi
           if [[ -z "$CACHED_DISK_NAME" ]]; then
-            # Try to find an image generated from any other branch
-            CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${DISK_PREFIX}-.+-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
-            echo "any branch Disk: $CACHED_DISK_NAME"
+            CACHED_DISK_NAME="$ANY_CACHED_DISK_NAME"
           fi
           if [[ -z "$CACHED_DISK_NAME" ]]; then
            echo "No cached state disk available"
-            echo "Expected ${BRANCH_DISK_NAME}"
-            echo "Also searched for any commit on main, and any commit on any branch"
+            echo "Expected ${COMMIT_DISK_PREFIX}"
+            echo "Also searched for cached disks from other branches"
             echo "Cached state test jobs must depend on the cached state rebuild job"
             exit 1
           fi
-          echo "Description: $(gcloud compute images describe $CACHED_DISK_NAME --format='value(DESCRIPTION)')"
+          echo "Selected Disk: $CACHED_DISK_NAME"
           echo "STATE_VERSION=$LOCAL_STATE_VERSION" >> $GITHUB_ENV
           echo "CACHED_DISK_NAME=$CACHED_DISK_NAME" >> $GITHUB_ENV
@@ -956,6 +980,23 @@ jobs:
           SYNC_HEIGHT=$(echo $DOCKER_LOGS | grep -oE '${{ inputs.height_grep_text }}\([0-9]+\)' | grep -oE '[0-9]+' | tail -1 || [[ $? == 1 ]])
           echo "SYNC_HEIGHT=$SYNC_HEIGHT" >> $GITHUB_ENV
+      # Sets the $UPDATE_SUFFIX env var to "-update" if using cached state,
+      # and the empty string otherwise.
+      #
+      # Also sets a unique date and time suffix $TIME_SUFFIX.
+      - name: Set update and time suffixes
+        run: |
+          UPDATE_SUFFIX=""
+          if [[ "${{ inputs.needs_zebra_state }}" == "true" ]]; then
+            UPDATE_SUFFIX="-update"
+          fi
+          TIME_SUFFIX=$(date '+%Y-%m-%d-%H-%M-%S' --utc)
+          echo "UPDATE_SUFFIX=$UPDATE_SUFFIX" >> $GITHUB_ENV
+          echo "TIME_SUFFIX=$TIME_SUFFIX" >> $GITHUB_ENV
       # Create an image from disk that will be used for following/other tests
       # This image can contain:
       # - Zebra cached state
@@ -966,7 +1007,8 @@ jobs:
       # used by the container
       - name: Create image from state disk
         run: |
-          gcloud compute images create ${{ inputs.disk_prefix }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${{ env.NETWORK }}-${{ inputs.disk_suffix }} \
+          gcloud compute images create \
+            "${{ inputs.disk_prefix }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${{ env.NETWORK }}-${{ inputs.disk_suffix }}$UPDATE_SUFFIX-$TIME_SUFFIX" \
             --force \
             --source-disk=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \
             --source-disk-zone=${{ env.ZONE }} \
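
For illustration, with a hypothetical branch `my-branch`, commit `a1b2c3d`, and state version 25, an update sync on mainnet now creates an image named:

    zebrad-cache-my-branch-a1b2c3d-v25-mainnet-tip-update-2022-08-25-13-09-20

The `-update` suffix marks images written after updating cached state, and the UTC time suffix keeps every image name unique.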

File: new Zebra continuous integration documentation page

@@ -0,0 +1,26 @@
# Zebra Continuous Integration

Zebra has extensive continuous integration tests for node syncing and `lightwalletd` integration.

On every PR change, Zebra runs [these Docker tests](https://github.com/ZcashFoundation/zebra/blob/main/.github/workflows/continous-integration-docker.yml), sketched below:

- Zebra update syncs from a cached state Google Cloud tip image
- `lightwalletd` full syncs from a cached state Google Cloud tip image
- `lightwalletd` update syncs from a cached state Google Cloud tip image
- `lightwalletd` integration with Zebra JSON-RPC and Light Wallet gRPC calls
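
Each of these jobs calls a shared reusable workflow with per-test inputs. Here is a minimal sketch of a call site: the job name, `test_id` value, and the workflow path `.github/workflows/deploy-gcp-tests.yml` are assumptions for illustration, while the other inputs are taken from this commit.

```yaml
jobs:
  # Hypothetical job name; the inputs below match this commit's Zebra update sync job.
  zebrad-update-sync:
    # Assumed path for the reusable Google Cloud test workflow.
    uses: ./.github/workflows/deploy-gcp-tests.yml
    with:
      test_id: update-to-tip # illustrative
      test_description: Test syncing to tip with a Zebra tip state
      test_variables: '-e TEST_UPDATE_SYNC=1 -e ZEBRA_FORCE_USE_COLOR=1 -e ZEBRA_CACHED_STATE_DIR=/var/cache/zebrad-cache'
      needs_zebra_state: true
      # update the disk on every PR, to increase CI speed
      saves_to_disk: true
      disk_suffix: tip
      root_state_path: '/var/cache'
      zebra_state_dir: 'zebrad-cache'
```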
When a PR is merged to the `main` branch, we also run a Zebra full sync test from genesis.

Some Docker tests are stateful. They can depend on:

- built Zebra and `lightwalletd` Docker images
- cached state images in Google Cloud
- jobs that launch Google Cloud instances for each test
- multiple jobs that follow the logs from Google Cloud (to work around the 6 hour GitHub Actions limit)
- a final "Run" job that checks the exit status of the Rust acceptance test
To support this test state, some Docker tests depend on other tests finishing first, as in the sketch below.
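
A minimal sketch of that dependency wiring, assuming hypothetical job names; only `get-available-disks` and its `lwd_tip_disk` output are taken from this commit's `if:` conditions.

```yaml
jobs:
  # Hypothetical job names; the conditions shown are illustrative.
  lightwalletd-update-sync:
    # wait for the disk check, and for the full sync that can rebuild the lwd tip disk
    needs: [ get-available-disks, lightwalletd-full-sync ]
    # only run when the cached lwd tip disk already exists
    if: ${{ !cancelled() && !failure() && fromJSON(needs.get-available-disks.outputs.lwd_tip_disk) }}
```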
Currently, each Zebra and `lightwalletd` sync updates the cached state images, which are shared by all tests.

Tests prefer the latest image generated from the same branch and commit. If none is available, they fall back to the latest image from any branch and commit, as long as the state version is the same.
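
The image names encode everything needed to pick a compatible disk. A sketch of the naming scheme and lookup order from this change, with illustrative example values:

```yaml
# Image name fields, in order:
#   {disk_prefix}-{branch}-{short_commit_sha}-v{state_version}-{network}-{disk_suffix}[-update]-{utc_time}
# For example (illustrative values):
#   zebrad-cache-main-0a39011-v25-mainnet-tip-update-2022-08-25-13-09-20
#
# Lookup order in the "Find cached state disk" step:
#   1. the newest image from this branch and commit
#   2. if prefer_main_cached_state is true, the newest image from the `main` branch
#   3. the newest image from any branch
```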
Zebra also runs [a smaller set of tests](https://github.com/ZcashFoundation/zebra/blob/main/.github/workflows/continous-integration-os.yml) on tier 2 platforms using GitHub Actions runners.