From 0a39011b8868d6275f546b71c0ccd4b0b03c58f0 Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Thu, 25 Aug 2022 23:09:20 +1000
Subject: [PATCH] fix(ci): Write cached state images after update syncs, and
 use the latest image from any commit (#4949)

* Save cached state on full syncs and updates

* Add an -update suffix to CI images created by updating cached state

* Make disk image names unique by adding a time suffix

* Use the latest image from any branch, but prefer the current commit if available

* Document Zebra's continuous integration tests

* Fix typos in environmental variable names

* Expand documentation

* Fix variable name typo

* Fix shell syntax
---
 .../continous-integration-docker.yml          | 10 +--
 .github/workflows/deploy-gcp-tests.yml        | 74 +++++++++++++++----
 book/src/dev/continuous-integration.md        | 26 +++++++
 3 files changed, 88 insertions(+), 22 deletions(-)
 create mode 100644 book/src/dev/continuous-integration.md

diff --git a/.github/workflows/continous-integration-docker.yml b/.github/workflows/continous-integration-docker.yml
index 4b6a0e4f2..e15a46896 100644
--- a/.github/workflows/continous-integration-docker.yml
+++ b/.github/workflows/continous-integration-docker.yml
@@ -330,11 +330,10 @@ jobs:
       test_description: Test syncing to tip with a Zebra tip state
       test_variables: '-e TEST_UPDATE_SYNC=1 -e ZEBRA_FORCE_USE_COLOR=1 -e ZEBRA_CACHED_STATE_DIR=/var/cache/zebrad-cache'
       needs_zebra_state: true
-      # TODO: do we want to update the disk on every PR, to increase CI speed?
-      saves_to_disk: false
+      # update the disk on every PR, to increase CI speed
+      saves_to_disk: true
       disk_suffix: tip
       root_state_path: '/var/cache'
-      # TODO: do we also want to test the `zebrad` part of the `lwd-cache`? (But not update it.)
       zebra_state_dir: 'zebrad-cache'
 
   # Test that Zebra can answer a synthetic RPC call, using a cached Zebra tip state
@@ -403,7 +402,6 @@ jobs:
     # to also run on Mergify head branches,
     # add `|| (github.event_name == 'push' && startsWith(github.head_ref, 'mergify/merge-queue/'))`:
     # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#running-your-workflow-based-on-the-head-or-base-branch-of-a-pull-request-1
-    # TODO: this test is unreliable, in the meanwhile we'll only generate a new lwd cached state when a full sync is also triggered
     if: ${{ (!cancelled() && !failure() && github.event.inputs.regenerate-disks != 'true' && github.event.inputs.run-full-sync != 'true') || !fromJSON(needs.get-available-disks.outputs.lwd_tip_disk) }}
     with:
       app_name: lightwalletd
@@ -438,8 +436,8 @@ jobs:
       test_variables: '-e TEST_LWD_UPDATE_SYNC=1 -e ZEBRA_TEST_LIGHTWALLETD=1 -e ZEBRA_FORCE_USE_COLOR=1 -e ZEBRA_CACHED_STATE_DIR=/var/cache/zebrad-cache -e LIGHTWALLETD_DATA_DIR=/var/cache/lwd-cache'
       needs_zebra_state: true
       needs_lwd_state: true
-      # TODO: do we want to update the disk on every PR, to increase CI speed?
-      saves_to_disk: false
+      # update the disk on every PR, to increase CI speed
+      saves_to_disk: true
       disk_prefix: lwd-cache
       disk_suffix: tip
       root_state_path: '/var/cache'
diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index 0cdea53bd..f6d54524d 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -59,6 +59,12 @@ on:
         required: false
         type: boolean
         description: 'Does the test use Lightwalletd and Zebra cached state?'
+      # main branch states can be outdated and slower, but they can also be more reliable
+      prefer_main_cached_state:
+        required: false
+        type: boolean
+        default: false
+        description: 'Does the test prefer to use a main branch cached state?'
       saves_to_disk:
         required: true
         type: boolean
@@ -259,7 +265,10 @@ jobs:
       # - To ${{ inputs.zebra_state_dir || inputs.disk_prefix }} if not
       #
       # If there are multiple disks:
-      # - prefer images generated from this branch, then the `main` branch, then any other branch
+      # - prefer images generated from this branch and commit, then
+      # - if prefer_main_cached_state is true, prefer images from the `main` branch, then
+      # - use images from any branch (the search also matches this branch and `main`).
+      # Within each of these categories:
       # - prefer newer images to older images
       #
       # Passes the disk name to subsequent steps using $CACHED_DISK_NAME env variable
@@ -278,31 +287,46 @@ jobs:
 
           # Try to find an image generated from this branch and commit
           # Fields are listed in the "Create image from state disk" step
-          BRANCH_DISK_NAME="${DISK_PREFIX}-${GITHUB_REF_SLUG_URL}-${GITHUB_SHA_SHORT}-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}"
-          CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${BRANCH_DISK_NAME}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
-          echo "${GITHUB_REF_SLUG_URL}-${GITHUB_SHA_SHORT} Disk: $CACHED_DISK_NAME"
-
-          if [[ -z "$CACHED_DISK_NAME" ]]; then
-              # Try to find an image generated from the main branch
-              CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${DISK_PREFIX}-main-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
-              echo "main Disk: $CACHED_DISK_NAME"
+          COMMIT_DISK_PREFIX="${DISK_PREFIX}-${GITHUB_REF_SLUG_URL}-${GITHUB_SHA_SHORT}-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}"
+          COMMIT_CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${COMMIT_DISK_PREFIX}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
+          echo "${GITHUB_REF_SLUG_URL}-${GITHUB_SHA_SHORT} Disk: $COMMIT_CACHED_DISK_NAME"
+          if [[ -n "$COMMIT_CACHED_DISK_NAME" ]]; then
+              echo "Description: $(gcloud compute images describe $COMMIT_CACHED_DISK_NAME --format='value(DESCRIPTION)')"
           fi
 
+          # Try to find an image generated from the main branch
+          MAIN_CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${DISK_PREFIX}-main-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
+          echo "main Disk: $MAIN_CACHED_DISK_NAME"
+          if [[ -n "$MAIN_CACHED_DISK_NAME" ]]; then
+              echo "Description: $(gcloud compute images describe $MAIN_CACHED_DISK_NAME --format='value(DESCRIPTION)')"
+          fi
+
+          # Try to find an image generated from any other branch
+          ANY_CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${DISK_PREFIX}-.+-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
+          echo "any branch Disk: $ANY_CACHED_DISK_NAME"
+          if [[ -n "$ANY_CACHED_DISK_NAME" ]]; then
+              echo "Description: $(gcloud compute images describe $ANY_CACHED_DISK_NAME --format='value(DESCRIPTION)')"
+          fi
+
+          # Select a cached disk based on the job settings
+          CACHED_DISK_NAME="$COMMIT_CACHED_DISK_NAME"
+          if [[ -z "$CACHED_DISK_NAME" ]] && [[ "${{ inputs.prefer_main_cached_state }}" == "true" ]]; then
+              echo "Preferring main branch cached state to other branches..."
+              CACHED_DISK_NAME="$MAIN_CACHED_DISK_NAME"
+          fi
           if [[ -z "$CACHED_DISK_NAME" ]]; then
-              # Try to find an image generated from any other branch
-              CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${DISK_PREFIX}-.+-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
-              echo "any branch Disk: $CACHED_DISK_NAME"
+              CACHED_DISK_NAME="$ANY_CACHED_DISK_NAME"
           fi
 
           if [[ -z "$CACHED_DISK_NAME" ]]; then
               echo "No cached state disk available"
-              echo "Expected ${BRANCH_DISK_NAME}"
-              echo "Also searched for any commit on main, and any commit on any branch"
+              echo "Expected ${COMMIT_DISK_PREFIX}"
+              echo "Also searched for cached disks from other branches"
               echo "Cached state test jobs must depend on the cached state rebuild job"
               exit 1
           fi
 
-          echo "Description: $(gcloud compute images describe $CACHED_DISK_NAME --format='value(DESCRIPTION)')"
+          echo "Selected Disk: $CACHED_DISK_NAME"
 
           echo "STATE_VERSION=$LOCAL_STATE_VERSION" >> $GITHUB_ENV
           echo "CACHED_DISK_NAME=$CACHED_DISK_NAME" >> $GITHUB_ENV
@@ -956,6 +980,23 @@ jobs:
           SYNC_HEIGHT=$(echo $DOCKER_LOGS | grep -oE '${{ inputs.height_grep_text }}\([0-9]+\)' | grep -oE '[0-9]+' | tail -1 || [[ $? == 1 ]])
           echo "SYNC_HEIGHT=$SYNC_HEIGHT" >> $GITHUB_ENV
 
+      # Sets $UPDATE_SUFFIX to "-update" if inputs.needs_zebra_state is true
+      # (the test started from a cached state), and to the empty string otherwise.
+      # Also sets $TIME_SUFFIX to the current UTC date and time, which makes
+      # image names unique. Both are written to $GITHUB_ENV for later steps.
+      - name: Set update and time suffixes
+        run: |
+          UPDATE_SUFFIX=""
+
+          if [[ "${{ inputs.needs_zebra_state }}" == "true" ]]; then
+              UPDATE_SUFFIX="-update"
+          fi
+
+          TIME_SUFFIX=$(date '+%Y-%m-%d-%H-%M-%S' --utc)
+
+          echo "UPDATE_SUFFIX=$UPDATE_SUFFIX" >> $GITHUB_ENV
+          echo "TIME_SUFFIX=$TIME_SUFFIX" >> $GITHUB_ENV
+
       # Create an image from disk that will be used for following/other tests
       # This image can contain:
       # - Zebra cached state
@@ -966,7 +1007,8 @@ jobs:
       # used by the container
       - name: Create image from state disk
         run: |
-          gcloud compute images create ${{ inputs.disk_prefix }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${{ env.NETWORK }}-${{ inputs.disk_suffix }} \
+          gcloud compute images create \
+          "${{ inputs.disk_prefix }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${{ env.NETWORK }}-${{ inputs.disk_suffix }}$UPDATE_SUFFIX-$TIME_SUFFIX" \
           --force \
           --source-disk=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \
           --source-disk-zone=${{ env.ZONE }} \
diff --git a/book/src/dev/continuous-integration.md b/book/src/dev/continuous-integration.md
new file mode 100644
index 000000000..16089ff9d
--- /dev/null
+++ b/book/src/dev/continuous-integration.md
@@ -0,0 +1,26 @@
+# Zebra Continuous Integration
+
+Zebra has extensive continuous integration tests for node syncing and `lightwalletd` integration.
+
+On every PR change, Zebra runs [these Docker tests](https://github.com/ZcashFoundation/zebra/blob/main/.github/workflows/continous-integration-docker.yml):
+- Zebra update syncs from a cached state Google Cloud tip image
+- lightwalletd full syncs from a cached state Google Cloud tip image
+- lightwalletd update syncs from a cached state Google Cloud tip image
+- lightwalletd integration with Zebra JSON-RPC and Light Wallet gRPC calls
+
+When a PR is merged to the `main` branch, we also run a Zebra full sync test from genesis.
+
+Some Docker tests are stateful. They can depend on:
+- built Zebra and `lightwalletd` docker images
+- cached state images in Google Cloud
+- jobs that launch Google Cloud instances for each test
+- multiple jobs that follow the logs from Google Cloud (to work around the 6 hour GitHub Actions limit)
+- a final "Run" job that checks the exit status of the Rust acceptance test
+
+To support this test state, some Docker tests depend on other tests finishing first.
+
+Currently, each Zebra and lightwalletd sync updates the cached images, which are shared by all tests.
+Tests prefer the latest image generated from the same branch and commit. If no such image is available, they use the latest image from any branch and commit, as long as the state version is the same.
+
+Zebra also does [a smaller set of tests](https://github.com/ZcashFoundation/zebra/blob/main/.github/workflows/continous-integration-os.yml) on tier 2 platforms using GitHub Actions runners.
+