zebra/.github/workflows/sub-deploy-integration-test...

name: Deploy Tests to GCP

on:
  workflow_call:
    inputs:
      # Status and logging
      test_id:
        required: true
        type: string
        description: 'Unique identifier for the test'
      test_description:
        required: true
        type: string
        description: 'Explains what the test does'
      height_grep_text:
        required: false
        type: string
        description: 'Regular expression to find the tip height in test logs, and add it to newly created cached state image metadata'

      # Test selection and parameters
      test_variables:
        required: true
        type: string
        description: 'Environmental variables used to select and configure the test'
      network:
        required: false
        type: string
        default: Mainnet
        description: 'Zcash network to test against'
      is_long_test:
        required: false
        type: boolean
        default: false
        description: 'Does this test need multiple run jobs? (Does it run longer than 6 hours?)'

      # Cached state
      #
      # TODO: find a better name
      root_state_path:
        required: false
        type: string
        default: '/zebrad-cache'
        description: 'Cached state base directory path'
      # TODO: find a better name
      zebra_state_dir:
        required: false
        type: string
        default: ''
        description: 'Zebra cached state directory and input image prefix to search in GCP'
      # TODO: find a better name
      lwd_state_dir:
        required: false
        type: string
        default: ''
        description: 'Lightwalletd cached state directory and input image prefix to search in GCP'
      disk_prefix:
        required: false
        type: string
        default: 'zebrad-cache'
        description: 'Image name prefix, and `zebra_state_dir` name for newly created cached states'
      disk_suffix:
        required: false
        type: string
        description: 'Image name suffix'
      needs_zebra_state:
        required: true
        type: boolean
        description: 'Does the test use Zebra cached state?'
      needs_lwd_state:
        required: false
        type: boolean
        description: 'Does the test use Lightwalletd and Zebra cached state?'
      # main branch states can be outdated and slower, but they can also be more reliable
      prefer_main_cached_state:
        required: false
        type: boolean
        default: false
        description: 'Does the test prefer to use a main branch cached state?'
      saves_to_disk:
        required: true
        type: boolean
        description: 'Can this test create new or updated cached state disks?'
      force_save_to_disk:
        required: false
        type: boolean
        default: false
        description: 'Force this test to create a new or updated cached state disk'
      app_name:
        required: false
        type: string
        default: 'zebra'
        description: 'Application name, used to work out when a job is an update job'

env:
  # How many previous log lines we show at the start of each new log job.
  # Increase this number if some log lines are skipped between jobs
  #
  # We want to show all the logs since the last job finished,
  # but we don't know how long it will be between jobs.
  # 200 lines is about 6-15 minutes of sync logs, or one panic log.
  EXTRA_LOG_LINES: 200
  # How many blocks to wait before creating an updated cached state image.
  # 1 day is approximately 1152 blocks.
  CACHED_STATE_UPDATE_LIMIT: 576

jobs:
  # Show all the test logs, then follow the logs of the test we just launched, until it finishes.
  # Then check the result of the test.
  #
  # If `inputs.is_long_test` is `true`, the timeout is 5 days, otherwise it's 3 hours.
  test-result:
    name: Run ${{ inputs.test_id }} test
    runs-on: zfnd-runners
    timeout-minutes: ${{ inputs.is_long_test && 7200 || 180 }}
    outputs:
      cached_disk_name: ${{ steps.get-disk-name.outputs.cached_disk_name }}
    permissions:
      contents: 'read'
      id-token: 'write'
    steps:
      - uses: actions/checkout@v4.1.1
        with:
          persist-credentials: false
          fetch-depth: '2'
      - uses: r7kamura/rust-problem-matchers@v1.4.0

      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4
        with:
          short-length: 7

      - name: Downcase network name for disks and labels
        run: |
          NETWORK_CAPS="${{ inputs.network }}"
          echo "NETWORK=${NETWORK_CAPS,,}" >> "$GITHUB_ENV"

      # Install our SSH secret
      - name: Install private SSH key
        uses: shimataro/ssh-key-action@v2.7.0
        with:
          key: ${{ secrets.GCP_SSH_PRIVATE_KEY }}
          name: google_compute_engine
          known_hosts: unnecessary

      - name: Generate public SSH key
        run: |
          sudo apt-get update && sudo apt-get -qq install -y --no-install-recommends openssh-client
          ssh-keygen -y -f ~/.ssh/google_compute_engine > ~/.ssh/google_compute_engine.pub

      # Setup gcloud CLI
      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@v2.1.1
        with:
          retries: '3'
          workload_identity_provider: '${{ vars.GCP_WIF }}'
          service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2.1.0

      # Find a cached state disk for this job, matching all of:
      # - disk cached state (lwd_state_dir/zebra_state_dir or disk_prefix) - zebrad-cache or lwd-cache
      # - state version (from the source code) - v{N}
      # - network (network) - mainnet or testnet
      # - disk target height kind (disk_suffix) - checkpoint or tip
      #
      # If the test needs a lightwalletd state (needs_lwd_state) set the variable DISK_PREFIX accordingly
      # - To ${{ inputs.lwd_state_dir }}" if needed
      # - To ${{ inputs.zebra_state_dir || inputs.disk_prefix }} if not
      #
      # If there are multiple disks:
      # - prefer images generated from the same commit, then
      # - if prefer_main_cached_state is true, prefer images from the `main` branch, then
      # - use any images from any other branch or commit.
      # Within each of these categories:
      # - prefer newer images to older images
      #
      # Passes the disk name to subsequent steps using $CACHED_DISK_NAME env variable
      # Passes the state version to subsequent steps using $STATE_VERSION env variable
      #
      # TODO: move this script into a file, and call it from sub-find-cached-disks.yml as well.
      - name: Find ${{ inputs.test_id }} cached state disk
        id: get-disk-name
        if: ${{ inputs.needs_zebra_state || inputs.needs_lwd_state }}
        env:
          GITHUB_SHA_SHORT: ${{ env.GITHUB_SHA_SHORT }}
          NEEDS_LWD_STATE: ${{ inputs.needs_lwd_state }}
          LWD_STATE_DIR: ${{ inputs.lwd_state_dir }}
          ZEBRA_STATE_DIR: ${{ inputs.zebra_state_dir }}
          DISK_PREFIX: ${{ inputs.disk_prefix }}
          NETWORK: ${{ env.NETWORK }} # use lowercase version from env, not input
          DISK_SUFFIX: ${{ inputs.disk_suffix }}
          PREFER_MAIN_CACHED_STATE: ${{ inputs.prefer_main_cached_state }}
        run: |
          source ./.github/workflows/scripts/gcp-get-cached-disks.sh
          echo "STATE_VERSION=${LOCAL_STATE_VERSION}" >> "${GITHUB_ENV}"
          echo "CACHED_DISK_NAME=${CACHED_DISK_NAME}" >> "${GITHUB_ENV}"
          echo "cached_disk_name=${CACHED_DISK_NAME}" >> "${GITHUB_OUTPUT}"

      # Create a Compute Engine virtual machine and attach a cached state disk using the
      # $CACHED_DISK_NAME variable as the source image to populate the disk cached state
      # if the test needs it.
      - name: Create ${{ inputs.test_id }} GCP compute instance
        id: create-instance
        shell: /usr/bin/bash -x {0}
        run: |
          NAME="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}"
          DISK_PARAMS="size=400GB,type=pd-ssd,name=${NAME},device-name=${NAME}"
          if [ -n "${{ env.CACHED_DISK_NAME }}" ]; then
            DISK_PARAMS+=",image=${{ env.CACHED_DISK_NAME }}"
          fi
          gcloud compute instances create-with-container "${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" \
          --boot-disk-size 50GB \
          --boot-disk-type pd-ssd \
          --image-project=cos-cloud \
          --image-family=cos-stable \
          --create-disk="${DISK_PARAMS}" \
          --container-image=gcr.io/google-containers/busybox \
          --machine-type ${{ vars.GCP_LARGE_MACHINE }} \
          --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \
          --scopes cloud-platform \
          --metadata=google-monitoring-enabled=TRUE,google-logging-enabled=TRUE \
          --metadata-from-file=startup-script=.github/workflows/scripts/gcp-vm-startup-script.sh \
          --labels=app=${{ inputs.app_name }},environment=test,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }},test=${{ inputs.test_id }} \
          --tags ${{ inputs.app_name }} \
          --zone ${{ vars.GCP_ZONE }}

      # Format the mounted disk if the test doesn't use a cached state.
      - name: Format ${{ inputs.test_id }} volume
        if: ${{ !inputs.needs_zebra_state && !inputs.needs_lwd_state }}
        shell: /usr/bin/bash -ex {0}
        run: |
          gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
          --zone ${{ vars.GCP_ZONE }} \
          --ssh-flag="-o ServerAliveInterval=5" \
          --ssh-flag="-o ConnectionAttempts=20" \
          --ssh-flag="-o ConnectTimeout=5" \
          --command=' \
          set -ex;
          # Extract the correct disk name based on the device-name
          DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-);
          sudo mkfs.ext4 -v /dev/$DISK_NAME \
          '

      # Launch the test with the previously created disk or cached state.
      #
      # This step uses a $MOUNT_FLAGS variable to mount the disk to the docker container.
      # If the test needs Lightwalletd state, we add the Lightwalletd state mount to the $MOUNT_FLAGS variable.
      #
      # SSH into the just created VM, and create a Docker container to run the incoming test
      # from ${{ inputs.test_id }}, then mount the sudo docker volume created in the previous job.
      #
      # In this step we're using the same disk for simplicity, as mounting multiple disks to the
      # VM and to the container might require more steps in this workflow, and additional
      # considerations.
      #
      # The disk mounted in the VM is located at /dev/$DISK_NAME, we mount the root `/` of this disk to the docker
      # container, and might have two different paths (if lightwalletd state is needed):
      # - /var/cache/zebrad-cache -> ${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} -> $ZEBRA_CACHED_STATE_DIR
      # - /var/cache/lwd-cache -> ${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} -> $LIGHTWALLETD_DATA_DIR
      #
      # Currently we do this by mounting the same disk at both paths.
      #
      # This doesn't cause any path conflicts, because Zebra and lightwalletd create different
      # subdirectories for their data. (But Zebra, lightwalletd, and the test harness must not
      # delete the whole cache directory.)
      #
      # These paths must match the variables used by the tests in Rust, which are also set in
      # `ci-unit-tests-docker.yml` to be able to run this tests.
      #
      # Although we're mounting the disk root to both directories, Zebra and Lightwalletd
      # will only respect the values from $ZEBRA_CACHED_STATE_DIR and $LIGHTWALLETD_DATA_DIR,
      # the inputs like ${{ inputs.zebra_state_dir }} and ${{ inputs.lwd_state_dir }}
      # are only used to match those variables paths.
      - name: Launch ${{ inputs.test_id }} test
        id: launch-test
        shell: /usr/bin/bash -x {0}
        run: |
          gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
          --zone ${{ vars.GCP_ZONE }} \
          --ssh-flag="-o ServerAliveInterval=5" \
          --ssh-flag="-o ConnectionAttempts=20" \
          --ssh-flag="-o ConnectTimeout=5" \
          --command=' \

          # Extract the correct disk name based on the device-name
          DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-)

          MOUNT_FLAGS="--mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }}"

          # Check if we need to mount for Lightwalletd state
          # lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially.
          if [[ "${{ inputs.needs_lwd_state }}" == "true" || "${{ inputs.test_id }}" == "lwd-full-sync" ]]; then
            MOUNT_FLAGS="$MOUNT_FLAGS --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }}"
          fi

          sudo docker run \
          --name ${{ inputs.test_id }} \
          --tty \
          --detach \
          ${{ inputs.test_variables }} \
          ${MOUNT_FLAGS} \
          ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \
          '

      # Show debug logs if previous job failed
      - name: Show debug logs if previous job failed
        if: ${{ failure() }}
        shell: /usr/bin/bash -x {0}
        run: |
          gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
          --zone ${{ vars.GCP_ZONE }} \
          --ssh-flag="-o ServerAliveInterval=5" \
          --ssh-flag="-o ConnectionAttempts=20" \
          --ssh-flag="-o ConnectTimeout=5" \
          --command=' \
          lsblk;
          sudo lsof /dev/$DISK_NAME;
          sudo dmesg;
          sudo journalctl -b \
          '

      # Show all the logs since the container launched,
      # following until we see zebrad startup messages.
      #
      # This check limits the number of log lines, so tests running on the wrong network don't
      # run until the job timeout. If Zebra does a complete recompile, there are a few hundred log
      # lines before the startup logs. So that's what we use here.
      #
      # The log pipeline ignores the exit status of `docker logs`.
      # It also ignores the expected 'broken pipe' error from `tee`,
      # which happens when `grep` finds a matching output and moves on to the next job.
      #
      # Errors in the tests are caught by the final test status job.
      - name: Check startup logs for ${{ inputs.test_id }}
        shell: /usr/bin/bash -x {0}
        run: |
          gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
          --zone ${{ vars.GCP_ZONE }} \
          --ssh-flag="-o ServerAliveInterval=5" \
          --ssh-flag="-o ConnectionAttempts=20" \
          --ssh-flag="-o ConnectTimeout=5" \
          --command=' \
          sudo docker logs \
          --tail all \
          --follow \
          ${{ inputs.test_id }} | \
          head -700 | \
          tee --output-error=exit-nopipe /dev/stderr | \
          grep --max-count=1 --extended-regexp --color=always \
          "Zcash network: ${{ inputs.network }}"; \
          '

      # Check that the container executed at least 1 Rust test harness test, and that all tests passed.
      # Then wait for the container to finish, and exit with the test's exit status.
      # Also shows all the test logs.
      #
      # If the container has already finished, `docker wait` should return its status.
      # But sometimes this doesn't work, so we use `docker inspect` as a fallback.
      #
      # `docker wait` prints the container exit status as a string, but we need to exit the `ssh` command
      # with that status.
      # (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.)
      - name: Result of ${{ inputs.test_id }} test
        shell: /usr/bin/bash -x {0}
        run: |
          gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
          --zone ${{ vars.GCP_ZONE }} \
          --ssh-flag="-o ServerAliveInterval=5" \
          --ssh-flag="-o ConnectionAttempts=20" \
          --ssh-flag="-o ConnectTimeout=5" \
          --command=' \
          sudo docker logs \
          --tail all \
          --follow \
          ${{ inputs.test_id }} | \
          tee --output-error=exit-nopipe /dev/stderr | \
          grep --max-count=1 --extended-regexp --color=always \
          "test result: .*ok.* [1-9][0-9]* passed.*finished in";
          LOGS_EXIT_STATUS=$?;

          EXIT_STATUS=$(sudo docker wait ${{ inputs.test_id }} || echo "Error retrieving exit status");
          echo "sudo docker exit status: $EXIT_STATUS";

          # If grep found the pattern, exit with the Docker container"s exit status
          if [ $LOGS_EXIT_STATUS -eq 0 ]; then
              exit $EXIT_STATUS;
          fi

          # Handle other potential errors here
          echo "An error occurred while processing the logs.";
          exit 1; \
          '

  # create a state image from the instance's state disk, if requested by the caller
  create-state-image:
    name: Create ${{ inputs.test_id }} cached state image
    runs-on: ubuntu-latest
    needs: [ test-result ]
    # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one.
    # Normally, if a job is skipped, all the jobs that depend on it are also skipped.
    # So we need to override the default success() check to make this job run.
    if: ${{ !cancelled() && !failure() && (inputs.saves_to_disk || inputs.force_save_to_disk) }}
    permissions:
      contents: 'read'
      id-token: 'write'
    steps:
      - uses: actions/checkout@v4.1.1
        with:
          persist-credentials: false
          fetch-depth: '2'
      - uses: r7kamura/rust-problem-matchers@v1.4.0

      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4
        with:
          short-length: 7

      # Performs formatting on disk name components.
      #
      # Disk images in GCP are required to be in lowercase, but the blockchain network
      # uses sentence case, so we need to downcase ${{ inputs.network }}.
      #
      # Disk image names in GCP are limited to 63 characters, so we need to limit
      # branch names to 12 characters.
      #
      # Passes ${{ inputs.network }} to subsequent steps using $NETWORK env variable.
      # Passes ${{ env.GITHUB_REF_SLUG_URL }} to subsequent steps using $SHORT_GITHUB_REF env variable.
      - name: Format network name and branch name for disks
        run: |
          NETWORK_CAPS="${{ inputs.network }}"
          echo "NETWORK=${NETWORK_CAPS,,}" >> "$GITHUB_ENV"
          LONG_GITHUB_REF="${{ env.GITHUB_REF_SLUG_URL }}"
          echo "SHORT_GITHUB_REF=${LONG_GITHUB_REF:0:12}" >> "$GITHUB_ENV"

      # Install our SSH secret
      - name: Install private SSH key
        uses: shimataro/ssh-key-action@v2.7.0
        with:
          key: ${{ secrets.GCP_SSH_PRIVATE_KEY }}
          name: google_compute_engine
          known_hosts: unnecessary

      - name: Generate public SSH key
        run: |
          sudo apt-get update && sudo apt-get -qq install -y --no-install-recommends openssh-client
          ssh-keygen -y -f ~/.ssh/google_compute_engine > ~/.ssh/google_compute_engine.pub

      # Setup gcloud CLI
      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@v2.1.1
        with:
          workload_identity_provider: '${{ vars.GCP_WIF }}'
          service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2.1.0

      # Get the state version from the local constants.rs file to be used in the image creation,
      # as the state version is part of the disk image name.
      #
      # Passes the state version to subsequent steps using $STATE_VERSION env variable
      - name: Get state version from constants.rs
        run: |
          LOCAL_STATE_VERSION=$(grep -oE "DATABASE_FORMAT_VERSION: .* [0-9]+" $GITHUB_WORKSPACE/zebra-state/src/constants.rs | grep -oE "[0-9]+" | tail -n1)
          echo "STATE_VERSION: $LOCAL_STATE_VERSION"

          echo "STATE_VERSION=$LOCAL_STATE_VERSION" >> "$GITHUB_ENV"

      # Sets the $UPDATE_SUFFIX env var to "-u" if updating a previous cached state,
      # and the empty string otherwise.
      #
      # Also sets a unique date and time suffix $TIME_SUFFIX.
      - name: Set update and time suffixes
        run: |
          UPDATE_SUFFIX=""

          if [[ "${{ inputs.needs_zebra_state }}" == "true" ]] && [[ "${{ inputs.app_name }}" == "zebrad" ]]; then
              UPDATE_SUFFIX="-u"
          fi

          # TODO: find a better logic for the lwd-full-sync case
          if [[ "${{ inputs.needs_lwd_state }}" == "true" ]] && [[ "${{ inputs.app_name }}" == "lightwalletd" ]] && [[ "${{ inputs.test_id }}" != 'lwd-full-sync' ]]; then
              UPDATE_SUFFIX="-u"
          fi

          # We're going to delete old images after a few days, so we only need the time here
          TIME_SUFFIX=$(date '+%H%M%S' --utc)

          echo "UPDATE_SUFFIX=$UPDATE_SUFFIX" >> "$GITHUB_ENV"
          echo "TIME_SUFFIX=$TIME_SUFFIX" >> "$GITHUB_ENV"

      # Get the full initial and running database versions from the test logs.
      # These versions are used as part of the disk description and labels.
      #
      # If these versions are missing from the logs, the job fails.
      #
      # Typically, the database versions are around line 20 in the logs..
      # But we check the first 1000 log lines, just in case the test harness recompiles all the
      # dependencies before running the test. (This can happen if the cache is invalid.)
      #
      # Passes the versions to subsequent steps using the $INITIAL_DISK_DB_VERSION,
      # $RUNNING_DB_VERSION, and $DB_VERSION_SUMMARY env variables.
      - name: Get database versions from logs
        shell: /usr/bin/bash -x {0}
        run: |
          INITIAL_DISK_DB_VERSION=""
          RUNNING_DB_VERSION=""
          DB_VERSION_SUMMARY=""

          DOCKER_LOGS=$( \
          gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
          --zone ${{ vars.GCP_ZONE }} \
          --ssh-flag="-o ServerAliveInterval=5" \
          --ssh-flag="-o ConnectionAttempts=20" \
          --ssh-flag="-o ConnectTimeout=5" \
          --command=' \
          sudo docker logs ${{ inputs.test_id }} | head -1000 \
          ')

          # either a semantic version or "creating new database"
          INITIAL_DISK_DB_VERSION=$( \
          echo "$DOCKER_LOGS" | \
          grep --extended-regexp --only-matching 'initial disk state version: [0-9a-z\.]+' | \
          grep --extended-regexp --only-matching '[0-9a-z\.]+' | \
          tail -1 || \
          [[ $? == 1 ]] \
          )

          if [[ -z "$INITIAL_DISK_DB_VERSION" ]]; then
              echo "Checked logs:"
              echo ""
              echo "$DOCKER_LOGS"
              echo ""
              echo "Missing initial disk database version in logs: $INITIAL_DISK_DB_VERSION"
              # Fail the tests, because Zebra didn't log the initial disk database version,
              # or the regex in this step is wrong.
              false
          fi

          if [[ "$INITIAL_DISK_DB_VERSION" = "creating.new.database" ]]; then
              INITIAL_DISK_DB_VERSION="new"
          else
              INITIAL_DISK_DB_VERSION="v${INITIAL_DISK_DB_VERSION//./-}"
          fi

          echo "Found initial disk database version in logs: $INITIAL_DISK_DB_VERSION"
          echo "INITIAL_DISK_DB_VERSION=$INITIAL_DISK_DB_VERSION" >> "$GITHUB_ENV"

          RUNNING_DB_VERSION=$( \
          echo "$DOCKER_LOGS" | \
          grep --extended-regexp --only-matching 'running state version: [0-9\.]+' | \
          grep --extended-regexp --only-matching '[0-9\.]+' | \
          tail -1 || \
          [[ $? == 1 ]] \
          )

          if [[ -z "$RUNNING_DB_VERSION" ]]; then
              echo "Checked logs:"
              echo ""
              echo "$DOCKER_LOGS"
              echo ""
              echo "Missing running database version in logs: $RUNNING_DB_VERSION"
              # Fail the tests, because Zebra didn't log the running database version,
              # or the regex in this step is wrong.
              false
          fi

          RUNNING_DB_VERSION="v${RUNNING_DB_VERSION//./-}"
          echo "Found running database version in logs: $RUNNING_DB_VERSION"
          echo "RUNNING_DB_VERSION=$RUNNING_DB_VERSION" >> "$GITHUB_ENV"

          if [[ "$INITIAL_DISK_DB_VERSION" = "$RUNNING_DB_VERSION" ]]; then
              DB_VERSION_SUMMARY="$RUNNING_DB_VERSION"
          elif [[ "$INITIAL_DISK_DB_VERSION" = "new" ]]; then
              DB_VERSION_SUMMARY="$RUNNING_DB_VERSION in new database"
          else
              DB_VERSION_SUMMARY="$INITIAL_DISK_DB_VERSION changing to $RUNNING_DB_VERSION"
          fi

          echo "Summarised database versions from logs: $DB_VERSION_SUMMARY"
          echo "DB_VERSION_SUMMARY=$DB_VERSION_SUMMARY" >> "$GITHUB_ENV"

      # Get the sync height from the test logs, which is later used as part of the
      # disk description and labels.
      #
      # The regex used to grep the sync height is provided by ${{ inputs.height_grep_text }},
      # this allows to dynamically change the height as needed by different situations or
      # based on the logs output from different tests.
      #
      # If the sync height is missing from the logs, the job fails.
      #
      # Passes the sync height to subsequent steps using the $SYNC_HEIGHT env variable.
      - name: Get sync height from logs
        shell: /usr/bin/bash -x {0}
        run: |
          SYNC_HEIGHT=""

          DOCKER_LOGS=$( \
          gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
          --zone ${{ vars.GCP_ZONE }} \
          --ssh-flag="-o ServerAliveInterval=5" \
          --ssh-flag="-o ConnectionAttempts=20" \
          --ssh-flag="-o ConnectTimeout=5" \
          --command=' \
          sudo docker logs ${{ inputs.test_id }} --tail 200 \
          ')

          SYNC_HEIGHT=$( \
          echo "$DOCKER_LOGS" | \
          grep --extended-regexp --only-matching '${{ inputs.height_grep_text }}[0-9]+' | \
          grep --extended-regexp --only-matching '[0-9]+' | \
          tail -1 || \
          [[ $? == 1 ]] \
          )

          if [[ -z "$SYNC_HEIGHT" ]]; then
              echo "Checked logs:"
              echo ""
              echo "$DOCKER_LOGS"
              echo ""
              echo "Missing sync height in logs: $SYNC_HEIGHT"
              # Fail the tests, because Zebra and lightwalletd didn't log their sync heights,
              # or the CI workflow sync height regex is wrong.
              false
          fi

          echo "Found sync height in logs: $SYNC_HEIGHT"
          echo "SYNC_HEIGHT=$SYNC_HEIGHT" >> "$GITHUB_ENV"

      # Get the original cached state height from google cloud.
      #
      # If the height is missing from the image labels, uses zero instead.
      #
      # TODO: fail the job if needs_zebra_state but the height is missing
      #       we can make this change after all the old images have been deleted, this should happen around 15 September 2022
      #       we'll also need to do a manual checkpoint rebuild before opening the PR for this change
      #
      # Passes the original height to subsequent steps using $ORIGINAL_HEIGHT env variable.
      - name: Get original cached state height from google cloud
        run: |
          ORIGINAL_HEIGHT="0"
          ORIGINAL_DISK_NAME="${{ format('{0}', needs.test-result.outputs.cached_disk_name) }}"

          if [[ -n "$ORIGINAL_DISK_NAME" ]]; then
              ORIGINAL_HEIGHT=$(gcloud compute images list --filter="status=READY AND name=$ORIGINAL_DISK_NAME" --format="value(labels.height)")
              ORIGINAL_HEIGHT=${ORIGINAL_HEIGHT:-0}
              echo "$ORIGINAL_DISK_NAME height: $ORIGINAL_HEIGHT"
          else
              ORIGINAL_DISK_NAME="new-disk"
              echo "newly created disk, original height set to 0"
          fi

          echo "ORIGINAL_HEIGHT=$ORIGINAL_HEIGHT" >> "$GITHUB_ENV"
          echo "ORIGINAL_DISK_NAME=$ORIGINAL_DISK_NAME" >> "$GITHUB_ENV"

      # Create an image from the state disk, which will be used for any tests that start
      # after it is created. These tests can be in the same workflow, or in a different PR.
      #
      # Using the newest image makes future jobs faster, because it is closer to the chain tip.
      #
      # Skips creating updated images if the original image is less than $CACHED_STATE_UPDATE_LIMIT behind the current tip.
      # Full sync images are always created.
      #
      # The image can contain:
      # - Zebra cached state, or
      # - Zebra + lightwalletd cached state.
      # Which cached state is being saved to the disk is defined by ${{ inputs.disk_prefix }}.
      #
      # Google Cloud doesn't have an atomic image replacement operation.
      # We don't want to delete and re-create the image, because that causes a ~5 minute
      # window where might be no recent image. So we add an extra image with a unique name,
      # which gets selected because it has a later creation time.
      # This also simplifies the process of deleting old images,
      # because we don't have to worry about accidentally deleting all the images.
      #
      # The timestamp makes images from the same commit unique,
      # as long as they don't finish in the same second.
      # (This is unlikely, because each image created by a workflow has a different name.)
      #
      # The image name must also be 63 characters or less.
      #
      # Force the image creation (--force) as the disk is still attached even though is not being
      # used by the container.
      - name: Create image from state disk
        run: |
          MINIMUM_UPDATE_HEIGHT=$((ORIGINAL_HEIGHT+CACHED_STATE_UPDATE_LIMIT))
          if [[ -z "$UPDATE_SUFFIX" ]] || [[ "$SYNC_HEIGHT" -gt "$MINIMUM_UPDATE_HEIGHT" ]] || [[ "${{ inputs.force_save_to_disk }}" == "true" ]]; then
             gcloud compute images create \
              "${{ inputs.disk_prefix }}-${SHORT_GITHUB_REF}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${NETWORK}-${{ inputs.disk_suffix }}${UPDATE_SUFFIX}-${TIME_SUFFIX}" \
              --force \
              --source-disk=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \
              --source-disk-zone=${{ vars.GCP_ZONE }} \
              --storage-location=us \
              --description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }} and database format ${{ env.DB_VERSION_SUMMARY }}" \
              --labels="height=${{ env.SYNC_HEIGHT }},purpose=${{ inputs.disk_prefix }},commit=${{ env.GITHUB_SHA_SHORT }},state-version=${{ env.STATE_VERSION }},state-running-version=${RUNNING_DB_VERSION},initial-state-disk-version=${INITIAL_DISK_DB_VERSION},network=${NETWORK},target-height-kind=${{ inputs.disk_suffix }},update-flag=${UPDATE_SUFFIX},force-save=${{ inputs.force_save_to_disk }},updated-from-height=${ORIGINAL_HEIGHT},updated-from-disk=${ORIGINAL_DISK_NAME},test-id=${{ inputs.test_id }},app-name=${{ inputs.app_name }}"
          else
              echo "Skipped cached state update because the new sync height $SYNC_HEIGHT was less than $CACHED_STATE_UPDATE_LIMIT blocks above the original height $ORIGINAL_HEIGHT of $ORIGINAL_DISK_NAME"
          fi

  # delete the Google Cloud instance for this test
  delete-instance:
    name: Delete ${{ inputs.test_id }} instance
    runs-on: ubuntu-latest
    needs: [ create-state-image ]
    # If a disk generation step timeouts (+6 hours) the previous job (creating the image) will be skipped.
    # Even if the instance continues running, no image will be created, so it's better to delete it.
    if: always()
    continue-on-error: true
    permissions:
      contents: 'read'
      id-token: 'write'
    steps:
      - uses: actions/checkout@v4.1.1
        with:
          persist-credentials: false
          fetch-depth: '2'
      - uses: r7kamura/rust-problem-matchers@v1.4.0

      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4
        with:
          short-length: 7

      # Setup gcloud CLI
      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@v2.1.1
        with:
          workload_identity_provider: '${{ vars.GCP_WIF }}'
          service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2.1.0

      # Deletes the instances that has been recently deployed in the actual commit after all
      # previous jobs have run, no matter the outcome of the job.
      - name: Delete test instance
        continue-on-error: true
        run: |
          INSTANCE=$(gcloud compute instances list --filter=${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} --format='value(NAME)')
          if [ -z "${INSTANCE}" ]; then
            echo "No instance to delete"
          else
            gcloud compute instances delete "${INSTANCE}" --zone "${{ vars.GCP_ZONE }}" --delete-disks all --quiet
          fi