diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 61344b93a..ecd9f10fc 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -145,11 +145,11 @@ jobs: --boot-disk-size 300GB \ --boot-disk-type pd-ssd \ --create-disk name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \ - --container-image debian-11 \ + --container-image gcr.io/google-containers/busybox \ --container-restart-policy=never \ --machine-type ${{ env.MACHINE_TYPE }} \ --scopes cloud-platform \ - --metadata=google-monitoring-enabled=true,google-logging-enabled=true \ + --metadata=google-monitoring-enabled=TRUE,google-logging-enabled=TRUE,enable-oslogin=TRUE \ --metadata-from-file=startup-script=.github/workflows/scripts/gcp-vm-startup-script.sh \ --tags ${{ inputs.app_name }} \ --zone ${{ env.ZONE }} @@ -160,10 +160,9 @@ jobs: # SSH into the just created VM, and create a docker volume with the newly created disk. - name: Create ${{ inputs.test_id }} Docker volume run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -211,10 +210,9 @@ jobs: # Launch the test without any cached state - name: Launch ${{ inputs.test_id }} test run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -366,11 +364,11 @@ jobs: --boot-disk-size 300GB \ --boot-disk-type pd-ssd \ --create-disk image=${{ env.CACHED_DISK_NAME }},name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \ - --container-image debian-11 \ + --container-image gcr.io/google-containers/busybox \ --container-restart-policy=never \ --machine-type ${{ env.MACHINE_TYPE }} \ --scopes cloud-platform \ - --metadata=google-monitoring-enabled=true,google-logging-enabled=true \ + --metadata=google-monitoring-enabled=TRUE,google-logging-enabled=TRUE,enable-oslogin=TRUE \ --metadata-from-file=startup-script=.github/workflows/scripts/gcp-vm-startup-script.sh \ --tags ${{ inputs.app_name }} \ --zone ${{ env.ZONE }} @@ -383,10 +381,9 @@ jobs: # but the cached state can be smaller if we just increased the disk size.) - name: Create ${{ inputs.test_id }} Docker volume run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -452,10 +449,9 @@ jobs: # TODO: we should find a better logic for this use cases if: ${{ (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }} run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -502,10 +498,9 @@ jobs: # TODO: we should find a better logic for this use cases if: ${{ (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }} run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -568,10 +563,9 @@ jobs: # Errors in the tests are caught by the final test status job. - name: Show logs for ${{ inputs.test_id }} test (sprout) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -627,10 +621,9 @@ jobs: # Show recent logs, following until Canopy activation (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (heartwood) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -683,10 +676,9 @@ jobs: # Show recent logs, following until NU5 activation (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (canopy) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -741,10 +733,9 @@ jobs: # Show recent logs, following until block 1,740,000 (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (1740k) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -801,10 +792,9 @@ jobs: # Show recent logs, following until block 1,760,000 (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (1760k) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -861,10 +851,9 @@ jobs: # Show recent logs, following until block 1,780,000 (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (1780k) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -922,10 +911,9 @@ jobs: # Show recent logs, following until block 1,800,000 (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (1800k) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -982,10 +970,9 @@ jobs: # Show recent logs, following until block 1,820,000 (or the test finishes) - name: Show logs for ${{ inputs.test_id }} test (1820k) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -1041,10 +1028,9 @@ jobs: # TODO: when doing obtain/extend tips, log the verifier in use, and check for full verification here - name: Show logs for ${{ inputs.test_id }} test (checkpoint) run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -1111,10 +1097,9 @@ jobs: # (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.) - name: Result of ${{ inputs.test_id }} test run: | - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ @@ -1237,15 +1222,14 @@ jobs: SYNC_HEIGHT="" DOCKER_LOGS=$( \ - gcloud compute ssh \ - github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --ssh-key-expire-after=30s \ --zone ${{ env.ZONE }} \ - --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ --command=" \ - docker logs ${{ inputs.test_id }} --tail 200 \ + sudo docker logs ${{ inputs.test_id }} --tail 200 \ ") SYNC_HEIGHT=$( \ @@ -1376,3 +1360,15 @@ jobs: else gcloud compute instances delete "${INSTANCE}" --zone "${{ env.ZONE }}" --delete-disks all --quiet fi + + # Deletes SSH keys generated during this workflow run, as GCP has a limit of SSH keys + # that can exist at the same time in the OS Login metadata. Not deleting this keys + # could cause the following error: + # `Login profile size exceeds 32 KiB. Delete profile values to make additional space` + - name: Delete temporal SSH keys + continue-on-error: true + run: | + for i in $(gcloud compute os-login ssh-keys list --format="table[no-heading](value.fingerprint)") --impersonate-service-account=github-service-account@zealous-zebra.iam.gserviceaccount.com; do + echo "$i"; + gcloud compute os-login ssh-keys remove --key "$i" --impersonate-service-account=github-service-account@zealous-zebra.iam.gserviceaccount.com || true; + done