fix(ci): handle disk mounting and logs reading edge-cases (#7690)

* fix: use `exit-nopipe` with consistent `shell` usage

Temporarily disabled the `set -e` option around the `docker logs` command to handle the broken pipe error gracefully (a sketch of the resulting pattern follows this commit list).

Handle more complex scenarios in our `Result of ${{ inputs.test_id }} test` job

* fix: Use single quotes for the outer command

* fix: use same approach for CD

* test: check launch failure logs

* fix: revert CD changes

* fix: do not try to increase the disk size, and wait for the disk to mount

* fix: increase GB a bit more

* fix: do not fail on pipe failure

* fix: use plain `tee /dev/stderr`

If this does not work, try `(tee … || true)`.

* fix: `tee` not stopping on CD config tests

* fix: match logic with GCP tests

* fix(cd): handle pipe and other errors correctly

* try `tee --output-error=exit-nopipe`

* fix: TRAP without pipefail

* test: pipefail with exit and trap

* fix: use a subshell

* fix(ci): wait for mounting and show system logs on failure

* fix(ci): GCP is not always mounting disks in the same order (see the disk-name sketch after this list)

* fix: use `grep` instead of `awk`

* fix: typo

* fix: use simpler `grep` command

* fix: do not sleep if not required

* chore: reduce diff
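
A minimal sketch of the log-check pattern these commits converge on, for reference. The container name `default-conf-tests` and the shortened grep pattern are illustrative; the real steps run this over `gcloud compute ssh` or plain `docker` with the workflow's `${{ … }}` values:

```bash
#!/usr/bin/env bash
# Sketch only: names and the grep pattern are illustrative, not the exact
# workflow values. Deliberately no `pipefail`, so the pipeline's status is
# grep's status (0 = pattern found, 1 = logs ended without a match).
set -e

CONTAINER=default-conf-tests
PATTERN='estimated progress to chain tip.*BeforeOverwinter'

LOGS_EXIT_STATUS=0
(
  # Ignore SIGPIPE so nothing in the subshell is killed by the signal when
  # `grep --max-count=1` exits after the first match; `tee --output-error=exit`
  # then stops on the write error, which also ends `docker logs --follow`.
  trap "" PIPE
  docker logs --tail all --follow "$CONTAINER" |
    tee --output-error=exit /dev/stderr |
    grep --max-count=1 --extended-regexp "$PATTERN"
) || LOGS_EXIT_STATUS=$?

docker stop "$CONTAINER"

# `docker wait` prints the container's exit code once it has stopped.
EXIT_STATUS=$(docker wait "$CONTAINER" || echo "Error retrieving exit status")
echo "docker exit status: $EXIT_STATUS"

if [ "$LOGS_EXIT_STATUS" -eq 0 ]; then
  # grep saw the expected line: report the container's own exit status.
  exit "$EXIT_STATUS"
fi

echo "An error occurred while processing the logs."
exit 1
```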
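And a sketch of the disk-name lookup added for the GCP test jobs; `DEVICE_NAME` is a hypothetical stand-in for the `${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}` device name the workflow uses when attaching the disk:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Hypothetical attach-time device name; the workflow builds it from
# ${{ inputs.test_id }} and ${{ env.GITHUB_SHA_SHORT }}.
DEVICE_NAME="my-test-abcdef1"

# GCE exposes attached disks as stable symlinks under /dev/disk/by-id,
# e.g. "google-my-test-abcdef1 -> ../../sdc", so resolving the name here
# works even when the kernel enumerates /dev/sdb, /dev/sdc, ... in a
# different order on each boot.
DISK_NAME=$(ls -l /dev/disk/by-id |
  grep -oE "google-${DEVICE_NAME} -> ../../[^ ]+" |
  grep -oE "/[^/]+$" |
  cut -c 2-)

echo "resolved disk: /dev/${DISK_NAME}"

# The Format step then creates the filesystem on the resolved device:
sudo mkfs.ext4 -v "/dev/${DISK_NAME}"
```

`readlink -f /dev/disk/by-id/google-${DEVICE_NAME}` would resolve the same device, but the `grep`/`cut` chain above mirrors what the workflow runs inside its single-quoted `--command` string.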
Gustavo Valverde 2023-10-09 18:59:59 +01:00 committed by GitHub
parent a2b7859e8e
commit 8d0a17ee1c
2 changed files with 162 additions and 81 deletions

View File

@@ -29,7 +29,7 @@ on:
type: boolean
default: false
# Temporarily disabled to reduce network load, see #6894.
# TODO: Temporarily disabled to reduce network load, see #6894.
#push:
# branches:
# - main
@@ -132,29 +132,37 @@ jobs:
# Make sure Zebra can sync at least one full checkpoint on mainnet
- name: Run tests using the default config
shell: /usr/bin/bash -exo pipefail {0}
run: |
set -ex
docker pull ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }}
docker run --detach --name default-conf-tests -t ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }}
# show the logs, even if the job times out
docker logs --tail all --follow default-conf-tests | \
tee --output-error=exit /dev/stderr | \
grep --max-count=1 --extended-regexp --color=always \
'net.*=.*Main.*estimated progress to chain tip.*BeforeOverwinter'
# Use a subshell to handle the broken pipe error gracefully
(
trap "" PIPE;
docker logs \
--tail all \
--follow \
default-conf-tests | \
tee --output-error=exit /dev/stderr | \
grep --max-count=1 --extended-regexp --color=always \
-e "net.*=.*Main.*estimated progress to chain tip.*BeforeOverwinter"
) || true
LOGS_EXIT_STATUS=$?
docker stop default-conf-tests
# get the exit status from docker
EXIT_STATUS=$( \
docker wait default-conf-tests || \
docker inspect --format "{{.State.ExitCode}}" default-conf-tests || \
echo "missing container, or missing exit status for container" \
)
docker logs default-conf-tests
echo "docker exit status: $EXIT_STATUS"
if [[ "$EXIT_STATUS" = "137" ]]; then
echo "ignoring expected signal status"
exit 0
EXIT_STATUS=$(docker wait default-conf-tests || echo "Error retrieving exit status");
echo "docker exit status: $EXIT_STATUS";
# If grep found the pattern, exit with the Docker container exit status
if [ $LOGS_EXIT_STATUS -eq 0 ]; then
exit $EXIT_STATUS;
fi
exit "$EXIT_STATUS"
# Handle other potential errors here
echo "An error occurred while processing the logs.";
exit 1;
# Test reconfiguring the docker image for testnet.
test-configuration-file-testnet:
@@ -172,30 +180,37 @@ jobs:
# Make sure Zebra can sync the genesis block on testnet
- name: Run tests using a testnet config
shell: /usr/bin/bash -exo pipefail {0}
run: |
set -ex
docker pull ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }}
docker run --env "NETWORK=Testnet" --detach --name testnet-conf-tests -t ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }}
# show the logs, even if the job times out
docker logs --tail all --follow testnet-conf-tests | \
tee --output-error=exit /dev/stderr | \
grep --max-count=1 --extended-regexp --color=always \
-e 'net.*=.*Test.*estimated progress to chain tip.*Genesis' \
-e 'net.*=.*Test.*estimated progress to chain tip.*BeforeOverwinter'
# Use a subshell to handle the broken pipe error gracefully
(
trap "" PIPE;
docker logs \
--tail all \
--follow \
testnet-conf-tests | \
tee --output-error=exit /dev/stderr | \
grep --max-count=1 --extended-regexp --color=always \
-e "net.*=.*Test.*estimated progress to chain tip.*Genesis" \
-e "net.*=.*Test.*estimated progress to chain tip.*BeforeOverwinter";
) || true
LOGS_EXIT_STATUS=$?
docker stop testnet-conf-tests
# get the exit status from docker
EXIT_STATUS=$( \
docker wait testnet-conf-tests || \
docker inspect --format "{{.State.ExitCode}}" testnet-conf-tests || \
echo "missing container, or missing exit status for container" \
)
docker logs testnet-conf-tests
echo "docker exit status: $EXIT_STATUS"
if [[ "$EXIT_STATUS" = "137" ]]; then
echo "ignoring expected signal status"
exit 0
EXIT_STATUS=$(docker wait testnet-conf-tests || echo "Error retrieving exit status");
echo "docker exit status: $EXIT_STATUS";
# If grep found the pattern, exit with the Docker container exit status
if [ $LOGS_EXIT_STATUS -eq 0 ]; then
exit $EXIT_STATUS;
fi
exit "$EXIT_STATUS"
# Handle other potential errors here
echo "An error occurred while processing the logs.";
exit 1;
# Deploy Managed Instance Groups (MiGs) for Mainnet and Testnet,
# with one node in the configured GCP region.

View File

@@ -183,39 +183,56 @@ jobs:
# Format the mounted disk if the test doesn't use a cached state.
- name: Format ${{ inputs.test_id }} volume
shell: /usr/bin/bash -exo pipefail {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command \
"\
while sudo lsof /dev/sdb; do \
echo 'Waiting for /dev/sdb to be free...'; \
sleep 10; \
done; \
sudo mkfs.ext4 -v /dev/sdb \
"
--command=' \
set -ex;
# Extract the correct disk name based on the device-name
export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \
sudo mkfs.ext4 -v /dev/$DISK_NAME \
'
# Launch the test without any cached state
- name: Launch ${{ inputs.test_id }} test
id: launch-test
shell: /usr/bin/bash -exo pipefail {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command \
"\
--command=' \
sudo docker run \
--name ${{ inputs.test_id }} \
--tty \
--detach \
${{ inputs.test_variables }} \
--mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \
--mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \
${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \
"
'
# Show debug logs if previous job failed
- name: Show debug logs if previous job failed
if: ${{ failure() }}
shell: /usr/bin/bash -exo pipefail {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
lsblk;
sudo lsof /dev/sdb;
sudo dmesg;
sudo journalctl -b \
'
# set up and launch the test, if it uses cached state
# each test runs one of the *-with/without-cached-state job series, and skips the other
@@ -381,7 +398,6 @@ jobs:
--labels=app=${{ inputs.app_name }},environment=test,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }},test=${{ inputs.test_id }} \
--tags ${{ inputs.app_name }} \
--zone ${{ vars.GCP_ZONE }}
sleep 60
# Launch the test with the previously created Zebra-only cached state.
# Each test runs one of the "Launch test" steps, and skips the other.
@@ -405,22 +421,43 @@ jobs:
# lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially.
# TODO: we should find a better logic for this use cases
if: ${{ (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }}
shell: /usr/bin/bash -exo pipefail {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command \
"\
--command=' \
set -ex;
# Extract the correct disk name based on the device-name
export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \
sudo docker run \
--name ${{ inputs.test_id }} \
--tty \
--detach \
${{ inputs.test_variables }} \
--mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \
--mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \
${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \
"
'
# Show debug logs if previous job failed
- name: Show debug logs if previous job failed
if: ${{ failure() && (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }}
shell: /usr/bin/bash -exo pipefail {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
lsblk;
sudo lsof /dev/$DISK_NAME;
sudo dmesg;
sudo journalctl -b \
'
# Launch the test with the previously created Lightwalletd and Zebra cached state.
# Each test runs one of the "Launch test" steps, and skips the other.
@@ -455,23 +492,44 @@ jobs:
# lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially.
# TODO: we should find a better logic for this use cases
if: ${{ (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }}
shell: /usr/bin/bash -exo pipefail {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command \
"\
--command=' \
set -ex;
# Extract the correct disk name based on the device-name
export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \
sudo docker run \
--name ${{ inputs.test_id }} \
--tty \
--detach \
${{ inputs.test_variables }} \
--mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \
--mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \
--mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \
--mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \
${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \
"
'
# Show debug logs if previous job failed
- name: Show debug logs if previous job failed
if: ${{ failure() && (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }}
shell: /usr/bin/bash -exo pipefail {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
lsblk;
sudo lsof /dev/$DISK_NAME;
sudo dmesg;
sudo journalctl -b \
'
# Show all the test logs, then follow the logs of the test we just launched, until it finishes.
# Then check the result of the test.
@@ -538,23 +596,23 @@ jobs:
#
# Errors in the tests are caught by the final test status job.
- name: Check startup logs for ${{ inputs.test_id }}
shell: /usr/bin/bash -exo pipefail {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command \
"\
--command=' \
sudo docker logs \
--tail all \
--follow \
${{ inputs.test_id }} | \
head -700 | \
tee --output-error=exit /dev/stderr | \
tee --output-error=exit-nopipe /dev/stderr | \
grep --max-count=1 --extended-regexp --color=always \
-e 'Zcash network: ${{ inputs.network }}' \
"
-e "Zcash network: ${{ inputs.network }}" \
'
# Check that the container executed at least 1 Rust test harness test, and that all tests passed.
# Then wait for the container to finish, and exit with the test's exit status.
@@ -567,6 +625,7 @@ jobs:
# with that status.
# (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.)
- name: Result of ${{ inputs.test_id }} test
shell: /usr/bin/bash -exo pipefail {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
@@ -574,26 +633,31 @@ jobs:
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
set -e;
set -o pipefail;
trap '' PIPE;
trap "" PIPE;
# Temporarily disable "set -e" to handle the broken pipe error gracefully
set +e;
sudo docker logs \
--tail all \
--follow \
${{ inputs.test_id }} | \
tee --output-error=exit /dev/stderr | \
tee --output-error=exit-nopipe /dev/stderr | \
grep --max-count=1 --extended-regexp --color=always \
"test result: .*ok.* [1-9][0-9]* passed.*finished in"; \
"test result: .*ok.* [1-9][0-9]* passed.*finished in";
LOGS_EXIT_STATUS=$?;
set -e;
EXIT_STATUS=$( \
sudo docker wait ${{ inputs.test_id }} || \
sudo docker inspect --format "{{.State.ExitCode}}" ${{ inputs.test_id }} || \
echo "missing container, or missing exit status for container" \
); \
EXIT_STATUS=$(sudo docker wait ${{ inputs.test_id }} || echo "Error retrieving exit status");
echo "sudo docker exit status: $EXIT_STATUS";
echo "sudo docker exit status: $EXIT_STATUS"; \
exit "$EXIT_STATUS" \
# If grep found the pattern, exit with the Docker container"s exit status
if [ $LOGS_EXIT_STATUS -eq 0 ]; then
exit $EXIT_STATUS;
fi
# Handle other potential errors here
echo "An error occurred while processing the logs.";
exit 1; \
'
# create a state image from the instance's state disk, if requested by the caller
@@ -707,6 +771,7 @@ jobs:
# Passes the versions to subsequent steps using the $INITIAL_DISK_DB_VERSION,
# $RUNNING_DB_VERSION, and $DB_VERSION_SUMMARY env variables.
- name: Get database versions from logs
shell: /usr/bin/bash -exo pipefail {0}
run: |
INITIAL_DISK_DB_VERSION=""
RUNNING_DB_VERSION=""
@@ -718,9 +783,9 @@ jobs:
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=" \
--command=' \
sudo docker logs ${{ inputs.test_id }} | head -1000 \
")
')
# either a semantic version or "creating new database"
INITIAL_DISK_DB_VERSION=$( \
@@ -796,6 +861,7 @@ jobs:
#
# Passes the sync height to subsequent steps using the $SYNC_HEIGHT env variable.
- name: Get sync height from logs
shell: /usr/bin/bash -exo pipefail {0}
run: |
SYNC_HEIGHT=""
@@ -805,9 +871,9 @@ jobs:
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=" \
--command=' \
sudo docker logs ${{ inputs.test_id }} --tail 200 \
")
')
SYNC_HEIGHT=$( \
echo "$DOCKER_LOGS" | \