refactor(ci): Split `docker run` into launch, `logs`, and `wait` (#4690)

* Put arguments to "docker run" on different lines And update some comments. * Split docker run into launch, logs, and wait * Remove mistaken "needs state" condition on log and results job * Exit the ssh and the job with the container test's exit status
2022-06-28 10:36:18 +10:00 · 2022-06-28 10:36:18 +10:00 · cbd703b3fc
parent 6b22794396
commit cbd703b3fc
1 changed files with 143 additions and 21 deletions
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@ -81,7 +81,7 @@ env:
  MACHINE_TYPE: c2d-standard-16

 jobs:
-  # set up the test without any cached state
+  # set up the test, if it doesn't use any cached state
  # each test runs one of the *-with/without-cached-state job series, and skips the other
  setup-without-cached-state:
    name: Setup ${{ inputs.test_id }} test
@ -146,11 +146,12 @@ jobs:
          ${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \
          "

-  test-without-cached-state:
-    name: Run ${{ inputs.test_id }} test
+  # launch the test, if it doesn't use any cached state
+  launch-without-cached-state:
+    name: Launch ${{ inputs.test_id }} test
    needs: [ setup-without-cached-state ]
-    # if the previous step fails, we also want to run and fail this step,
-    # so that the branch protection rule fails in Mergify and GitHub
+    # If the previous job fails, we also want to run and fail this job,
+    # so that the branch protection rule fails in Mergify and GitHub.
    if: ${{ !cancelled() && !inputs.needs_zebra_state }}
    runs-on: ubuntu-latest
    permissions:
@ -180,7 +181,8 @@ jobs:
          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
          token_format: 'access_token'

-      - name: Run ${{ inputs.test_id }} test
+      # Launch the test without any cached state
+      - name: Launch ${{ inputs.test_id }} test
        run: |
          gcloud compute ssh \
          ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
@ -189,12 +191,17 @@ jobs:
          --ssh-flag="-o ServerAliveInterval=5" \
          --command \
          "\
-          docker run ${{ inputs.test_variables }} -t --name ${{ inputs.test_id }} \
+          docker run \
+          --name ${{ inputs.test_id }} \
+          --tty \
+          --detach \
+          ${{ inputs.test_variables }} \
          --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \
          ${{ env.GAR_BASE }}/${{ env.IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \
          "

-  # set up the test using cached state
+
+  # set up the test, if it uses cached state
  # each test runs one of the *-with/without-cached-state job series, and skips the other
  setup-with-cached-state:
    name: Setup ${{ inputs.test_id }} test
@ -313,11 +320,12 @@ jobs:
          ${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \
          "

-  test-with-cached-state:
-    name: Run ${{ inputs.test_id }} test
+  # launch the test, if it uses cached state
+  launch-with-cached-state:
+    name: Launch ${{ inputs.test_id }} test
    needs: [ setup-with-cached-state ]
-    # if the previous step fails, we also want to run and fail this step,
-    # so that the branch protection rule fails in Mergify and GitHub
+    # If the previous job fails, we also want to run and fail this job,
+    # so that the branch protection rule fails in Mergify and GitHub.
    if: ${{ !cancelled() && inputs.needs_zebra_state }}
    runs-on: ubuntu-latest
    permissions:
@ -348,8 +356,8 @@ jobs:
          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
          token_format: 'access_token'

-      # Run the test with the previously created Zebra-only cached state.
-      # Each test runs one of the "Run test" steps, and skips the other.
+      # Launch the test with the previously created Zebra-only cached state.
+      # Each test runs one of the "Launch test" steps, and skips the other.
      #
      # SSH into the just created VM, and create a Docker container to run the incoming test
      # from ${{ inputs.test_id }}, then mount the docker volume created in the previous job.
@ -364,7 +372,7 @@ jobs:
      # Although we're mounting the disk root, Zebra will only respect the values from
      # $ZEBRA_CACHED_STATE_DIR. The inputs like ${{ inputs.zebra_state_dir }} are only used
      # to match that variable paths.
-      - name: Run ${{ inputs.test_id }} test
+      - name: Launch ${{ inputs.test_id }} test
        # This step only runs for tests that just read or write a Zebra state.
        #
        # lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially.
@ -378,13 +386,17 @@ jobs:
          --ssh-flag="-o ServerAliveInterval=5" \
          --command \
          "\
-          docker run ${{ inputs.test_variables }} -t --name ${{ inputs.test_id }} \
+          docker run \
+          --name ${{ inputs.test_id }} \
+          --tty \
+          --detach \
+          ${{ inputs.test_variables }} \
          --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \
          ${{ env.GAR_BASE }}/${{ env.IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \
          "

-      # Run the test with the previously created Lightwalletd and Zebra cached state.
-      # Each test runs one of the "Run test" steps, and skips the other.
+      # Launch the test with the previously created Lightwalletd and Zebra cached state.
+      # Each test runs one of the "Launch test" steps, and skips the other.
      #
      # SSH into the just created VM, and create a Docker container to run the incoming test
      # from ${{ inputs.test_id }}, then mount the docker volume created in the previous job.
@ -408,7 +420,7 @@ jobs:
      # Although we're mounting the disk root to both directories, Zebra and Lightwalletd
      # will only respect the values from $ZEBRA_CACHED_STATE_DIR and $LIGHTWALLETD_DATA_DIR,
      # the inputs like ${{ inputs.lwd_state_dir }} are only used to match those variables paths.
-      - name: Run ${{ inputs.test_id }} test
+      - name: Launch ${{ inputs.test_id }} test
        # This step only runs for tests that read or write Lightwalletd and Zebra states.
        #
        # lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially.
@ -422,20 +434,130 @@ jobs:
          --ssh-flag="-o ServerAliveInterval=5" \
          --command \
          "\
-          docker run ${{ inputs.test_variables }} -t --name ${{ inputs.test_id }} \
+          docker run \
+          --name ${{ inputs.test_id }} \
+          --tty \
+          --detach \
+          ${{ inputs.test_variables }} \
          --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \
          --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \
          ${{ env.GAR_BASE }}/${{ env.IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \
          "

+
+  # Stream the logs of the test container that was just launched
+  follow-logs:
+    name: Show logs for ${{ inputs.test_id }} test
+    runs-on: ubuntu-latest
+    # Each test runs exactly one of the two launch jobs, and always skips the other one.
+    # When the launch job fails, this job should still run and fail too,
+    # so that the branch protection rule fails in Mergify and GitHub.
+    needs: [launch-with-cached-state, launch-without-cached-state]
+    if: ${{ !cancelled() }}
+    permissions:
+      id-token: "write"
+      contents: "read"
+    steps:
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+
+      - uses: rlespinasse/github-slug-action@v4
+        name: Inject slug/short variables
+        with:
+          short-length: 7
+
+      - name: Downcase network name for disks
+        run: |
+          NETWORK_CAPS=${{ inputs.network }}
+          echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV
+
+      # Get Google Cloud credentials, so the gcloud CLI can be used below
+      - id: auth
+        name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@v0.8.0
+        with:
+          token_format: "access_token"
+          service_account: "github-service-account@zealous-zebra.iam.gserviceaccount.com"
+          workload_identity_provider: "projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc"
+
+      # Print the container's complete log output, and keep following it
+      - name: Show logs for ${{ inputs.test_id }} test
+        run: |
+          gcloud compute ssh \
+          ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
+          --zone ${{ env.ZONE }} \
+          --quiet \
+          --ssh-flag="-o ServerAliveInterval=5" \
+          --command \
+          "\
+          docker logs \
+          --tail all \
+          --follow \
+          ${{ inputs.test_id }} \
+          "
+
+  # wait for the test container to finish, and pass or fail based on its exit status
+  test-result:
+    # TODO: update the job name here, and in the branch protection rules
+    name: Run ${{ inputs.test_id }} test
+    needs: [ follow-logs ]
+    # If the previous job fails, we also want to run and fail this job,
+    # so that the branch protection rule fails in Mergify and GitHub.
+    if: ${{ !cancelled() }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: 'read'
+      id-token: 'write'
+    steps:
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4
+        with:
+          short-length: 7
+
+      - name: Downcase network name for disks
+        run: |
+          NETWORK_CAPS=${{ inputs.network }}
+          echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV
+
+      # Setup gcloud CLI
+      - name: Authenticate to Google Cloud
+        id: auth
+        uses: google-github-actions/auth@v0.8.0
+        with:
+          workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc'
+          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
+          token_format: 'access_token'
+
+      # Wait for the container to finish, then exit `ssh` and this step with the test's exit status.
+      #
+      # `docker wait` prints the exit status of the single container we wait for as a string.
+      # The `$` is escaped so the VM's shell runs `docker wait`, not the runner's shell (which would send a bare `exit`).
+      - name: Result of ${{ inputs.test_id }} test
+        run: |
+          gcloud compute ssh \
+          ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
+          --zone ${{ env.ZONE }} \
+          --quiet \
+          --ssh-flag="-o ServerAliveInterval=5" \
+          --command \
+          "\
+          exit \$(docker wait ${{ inputs.test_id }}) \
+          "
+
+
  # create a state image from the instance's state disk, if requested by the caller
  create-state-image:
    name: Create ${{ inputs.test_id }} cached state image
    runs-on: ubuntu-latest
+    needs: [ test-result ]
    # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one.
    # Normally, if a job is skipped, all the jobs that depend on it are also skipped.
    # So we need to override the default success() check to make this job run.
-    needs: [ test-without-cached-state, test-with-cached-state ]
    if: ${{ !cancelled() && !failure() && inputs.saves_to_disk }}
    permissions:
      contents: 'read'