fix(ci): Split Docker logs into sprout, other checkpoints, and full validation (#4704)

* Checkout zebra in each job to avoid warnings But put TODOs where we might be able to skip checkouts * Split log following into sprout checkpoints, sapling/orchard checkpoints, and full validation * Make job IDs shorter * Use /dev/stderr because docker doesn't have a tty * remove pipefail * Revert "remove pipefail" This reverts commit a7ee37bebdc107a4215e7dd307b189d925969234. * Make tee ignore errors writing to a grep pipe * Avoid launching multiple docker instances for duplicate jobs * Ignore broken pipe error messages and statuses * fix(ci): docker wait not finding container We had this issue before, I can't recall if this was a parsing error between GitHub Actions and gcloud `--command` parsing, but we had to change this into two pieces. This implementation keeps it how we did it before 9b9578c999/.github/workflows/test.yml (L235-L243) * docs: remove pending TODO We can't remove `actions/checkout` nor set `create_credentials_file` to `false` as next steps won't be able to authenticate to GCP. We can surely remove `actions/checkout` and leave `create_credentials_file` as `true`, but this will raise a warning on each step, and there's no benefit of doing so. * Show `docker wait` and `gcloud ssh` output * If `docker wait` fails, get the exit code using `docker inspect` Co-authored-by: Conrado Gouvea <conrado@zfnd.org> Co-authored-by: Gustavo Valverde <gustavo@iterativo.do> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
2022-06-30 20:33:01 +10:00 · 2022-06-30 20:33:01 +10:00 · 67dc26fbb5
parent c8cdf0617c
commit 67dc26fbb5
1 changed files with 168 additions and 22 deletions
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@ -75,10 +75,19 @@ on:
        description: 'Application name for Google Cloud instance metadata'

 env:
+  # where we get the Docker image from
  IMAGE_NAME: zebrad-test
  GAR_BASE: us-docker.pkg.dev/zealous-zebra/zebra
+  # what kind of Google Cloud instance we want to launch
  ZONE: us-central1-a
  MACHINE_TYPE: c2d-standard-16
+  # How many previous log lines we show at the start of each new log job.
+  # Increase this number if some log lines are skipped between jobs
+  #
+  # We want to show all the logs since the last job finished,
+  # but we don't know how long it will be between jobs.
+  # 200 lines is about 6-15 minutes of sync logs, or one panic log.
+  EXTRA_LOG_LINES: 200

 jobs:
  # set up the test, if it doesn't use any cached state
@ -94,6 +103,7 @@ jobs:
      - uses: actions/checkout@v3.0.2
        with:
          persist-credentials: false
+          fetch-depth: '2'

      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4
@ -150,9 +160,8 @@ jobs:
  launch-without-cached-state:
    name: Launch ${{ inputs.test_id }} test
    needs: [ setup-without-cached-state ]
-    # If the previous job fails, we also want to run and fail this job,
-    # so that the branch protection rule fails in Mergify and GitHub.
-    if: ${{ !cancelled() && !inputs.needs_zebra_state }}
+    # If creating the Google Cloud instance fails, we don't want to launch another docker instance.
+    if: ${{ !cancelled() && !failure() && !inputs.needs_zebra_state }}
    runs-on: ubuntu-latest
    permissions:
      contents: 'read'
@ -161,6 +170,7 @@ jobs:
      - uses: actions/checkout@v3.0.2
        with:
          persist-credentials: false
+          fetch-depth: '2'

      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4
@ -324,9 +334,8 @@ jobs:
  launch-with-cached-state:
    name: Launch ${{ inputs.test_id }} test
    needs: [ setup-with-cached-state ]
-    # If the previous job fails, we also want to run and fail this job,
-    # so that the branch protection rule fails in Mergify and GitHub.
-    if: ${{ !cancelled() && inputs.needs_zebra_state }}
+    # If creating the Google Cloud instance fails, we don't want to launch another docker instance.
+    if: ${{ !cancelled() && !failure() && inputs.needs_zebra_state }}
    runs-on: ubuntu-latest
    permissions:
      contents: 'read'
@ -445,13 +454,12 @@ jobs:
          "


-  # follow the logs of the test we just launched
-  follow-logs:
-    name: Show logs for ${{ inputs.test_id }} test
-    needs: [ launch-with-cached-state, launch-without-cached-state ]
+  # follow the logs of the test we just launched, up to Sapling activation (or the test finishing)
+  logs-sprout:
+    name: Log ${{ inputs.test_id }} test (sprout)
    # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one.
-    # If the previous job fails, we also want to run and fail this job,
-    # so that the branch protection rule fails in Mergify and GitHub.
+    needs: [ launch-with-cached-state, launch-without-cached-state ]
+    # If the previous job fails, we still want to show the logs.
    if: ${{ !cancelled() }}
    runs-on: ubuntu-latest
    permissions:
@ -461,6 +469,7 @@ jobs:
      - uses: actions/checkout@v3.0.2
        with:
          persist-credentials: false
+          fetch-depth: '2'

      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4
@ -481,8 +490,12 @@ jobs:
          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
          token_format: 'access_token'

-      # Show all the logs since the container launched
-      - name: Show logs for ${{ inputs.test_id }} test
+      # Show all the logs since the container launched,
+      # following until Sapling activation (or the test finishes).
+      #
+      # The log pipeline ignores the exit status of `docker logs`.
+      # Errors in the tests are caught by the final test status job.
+      - name: Show logs for ${{ inputs.test_id }} test (sprout)
        run: |
          gcloud compute ssh \
          ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
@ -494,14 +507,127 @@ jobs:
          docker logs \
          --tail all \
          --follow \
-          ${{ inputs.test_id }} \
+          ${{ inputs.test_id }} | \
+          tee --output-error=exit /dev/stderr | \
+          grep --max-count=1 --extended-regexp --color=always \
+          '(estimated progress.*network_upgrade.*=.*Sapling)|(test result:.*finished in)' \
          "

+  # follow the logs of the test we just launched, up to the last checkpoint (or the test finishing)
+  # TODO: split out sapling logs when the mandatory checkpoint is above NU5 activation
+  logs-checkpoint:
+    name: Log ${{ inputs.test_id }} test (checkpoint)
+    needs: [ logs-sprout ]
+    # If the previous job fails, we still want to show the logs.
+    if: ${{ !cancelled() }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: 'read'
+      id-token: 'write'
+    steps:
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: '2'
+
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4
+        with:
+          short-length: 7
+
+      - name: Downcase network name for disks
+        run: |
+          NETWORK_CAPS=${{ inputs.network }}
+          echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV
+
+      # Setup gcloud CLI
+      - name: Authenticate to Google Cloud
+        id: auth
+        uses: google-github-actions/auth@v0.8.0
+        with:
+          workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc'
+          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
+          token_format: 'access_token'
+
+      # Show recent logs, following until the last checkpoint (or the test finishes)
+      - name: Show logs for ${{ inputs.test_id }} test (checkpoint)
+        run: |
+          gcloud compute ssh \
+          ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
+          --zone ${{ env.ZONE }} \
+          --quiet \
+          --ssh-flag="-o ServerAliveInterval=5" \
+          --command \
+          "\
+          docker logs \
+          --tail ${{ env.EXTRA_LOG_LINES }} \
+          --follow \
+          ${{ inputs.test_id }} | \
+          tee --output-error=exit /dev/stderr | \
+          grep --max-count=1 --extended-regexp --color=always \
+          '(verified final checkpoint)|(test result:.*finished in)' \
+          "
+
+  # follow the logs of the test we just launched, until the test finishes
+  logs-end:
+    name: Log ${{ inputs.test_id }} test (end)
+    needs: [ logs-checkpoint ]
+    # If the previous job fails, we still want to show the logs.
+    if: ${{ !cancelled() }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: 'read'
+      id-token: 'write'
+    steps:
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: '2'
+
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4
+        with:
+          short-length: 7
+
+      - name: Downcase network name for disks
+        run: |
+          NETWORK_CAPS=${{ inputs.network }}
+          echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV
+
+      # Setup gcloud CLI
+      - name: Authenticate to Google Cloud
+        id: auth
+        uses: google-github-actions/auth@v0.8.0
+        with:
+          workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc'
+          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
+          token_format: 'access_token'
+
+      # Show recent logs, following until the test finishes
+      - name: Show logs for ${{ inputs.test_id }} test (end)
+        run: |
+          gcloud compute ssh \
+          ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
+          --zone ${{ env.ZONE }} \
+          --quiet \
+          --ssh-flag="-o ServerAliveInterval=5" \
+          --command \
+          "\
+          docker logs \
+          --tail ${{ env.EXTRA_LOG_LINES }} \
+          --follow \
+          ${{ inputs.test_id }} | \
+          tee --output-error=exit /dev/stderr | \
+          grep --max-count=1 --extended-regexp --color=always \
+          'test result:.*finished in' \
+          "
+
+
  # wait for the result of the test
  test-result:
    # TODO: update the job name here, and in the branch protection rules
    name: Run ${{ inputs.test_id }} test
-    needs: [ follow-logs ]
+    needs: [ logs-end ]
    # If the previous job fails, we also want to run and fail this job,
    # so that the branch protection rule fails in Mergify and GitHub.
    if: ${{ !cancelled() }}
@ -513,6 +639,7 @@ jobs:
      - uses: actions/checkout@v3.0.2
        with:
          persist-credentials: false
+          fetch-depth: '2'

      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4
@ -535,8 +662,12 @@ jobs:

      # Wait for the container to finish, then exit with the test's exit status.
      #
-      # `docker wait` prints the container exit status as a string, but we need to exit `ssh` with that status.
-      # `docker wait` can also wait for multiple containers, but we only ever wait for a single container.
+      # If the container has already finished, `docker wait` should return its status.
+      # But sometimes this doesn't work, so we use `docker inspect` as a fallback.
+      #
+      # `docker wait` prints the container exit status as a string, but we need to exit the `ssh` command
+      # with that status.
+      # (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.)
      - name: Result of ${{ inputs.test_id }} test
        run: |
          gcloud compute ssh \
@ -544,10 +675,15 @@ jobs:
          --zone ${{ env.ZONE }} \
          --quiet \
          --ssh-flag="-o ServerAliveInterval=5" \
-          --command \
-          "\
-          exit $(docker wait ${{ inputs.test_id }}) \
-          "
+          --command=' \
+          EXIT_STATUS=$( \
+          docker wait ${{ inputs.test_id }} || \
+          docker inspect --format "{{.State.ExitCode}}" ${{ inputs.test_id }} || \
+          echo "missing container, or missing exit status for container" \
+          ); \
+          echo "docker exit status: $EXIT_STATUS"; \
+          exit "$EXIT_STATUS" \
+          '


  # create a state image from the instance's state disk, if requested by the caller
@ -563,6 +699,11 @@ jobs:
      contents: 'read'
      id-token: 'write'
    steps:
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: '2'
+
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4
        with:
@ -650,6 +791,11 @@ jobs:
      contents: 'read'
      id-token: 'write'
    steps:
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: '2'
+
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4
        with: