feat(ssh): enable OS Login for GCP test instances (#5602)

* feat(ssh): enable OS Login for GCP test instances

* fix(ssh): force service account impersonation for OS Login

* debug: show actual user trying to impersonate SA

* fix(glcloud): configure gcloud before running commands

* fix(ssh): add VM zone to ssh command

* fix(auth): bringing changes from #5614

* fix(auth): impersonation is working as expected now

* fix(gcloud): setup the GCP CLI after authenticating (#5606)

Previous behavior:
`gcloud` commands have been running without an appropiate authentication
as the `auth` auction was sucessfully executed, but the actual gcloud
CLI being used in further jobs was not using the correct configuration
nor credentials

Expected behavior:
All `gcloud` commands should be properly configured and authenticated.

Solution:
Add the `google-github-actions/setup-gcloud` action after each
`google-github-actions/auth` invocation, and before running any `gcloud`
command.

Remove the need of an OAuth Access token when not required by following
steps

* fix(auth): revert to latest version

* fix: wrong replace

* fix(ci): use a specific debian image for VM containers

* fix(ssh): delete generated SSH keys by CI after 30 seconds

* debug: remove debug commands

* fix(compute): use a lightweight container image

* fix(ci): add missing sudo to docker command

* Update .github/workflows/deploy-gcp-tests.yml

Co-authored-by: Deirdre Connolly <durumcrustulum@gmail.com>

* fix(ssh): delete ssh-keys for the specific GHA service account

Co-authored-by: Deirdre Connolly <durumcrustulum@gmail.com>
This commit is contained in:
Gustavo Valverde 2022-11-16 10:27:09 -04:00 committed by GitHub
parent 04df10fc7d
commit 844ebf0dbd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 49 additions and 53 deletions

View File

@ -145,11 +145,11 @@ jobs:
--boot-disk-size 300GB \
--boot-disk-type pd-ssd \
--create-disk name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \
--container-image debian-11 \
--container-image gcr.io/google-containers/busybox \
--container-restart-policy=never \
--machine-type ${{ env.MACHINE_TYPE }} \
--scopes cloud-platform \
--metadata=google-monitoring-enabled=true,google-logging-enabled=true \
--metadata=google-monitoring-enabled=TRUE,google-logging-enabled=TRUE,enable-oslogin=TRUE \
--metadata-from-file=startup-script=.github/workflows/scripts/gcp-vm-startup-script.sh \
--tags ${{ inputs.app_name }} \
--zone ${{ env.ZONE }}
@ -160,10 +160,9 @@ jobs:
# SSH into the just created VM, and create a docker volume with the newly created disk.
- name: Create ${{ inputs.test_id }} Docker volume
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -211,10 +210,9 @@ jobs:
# Launch the test without any cached state
- name: Launch ${{ inputs.test_id }} test
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -366,11 +364,11 @@ jobs:
--boot-disk-size 300GB \
--boot-disk-type pd-ssd \
--create-disk image=${{ env.CACHED_DISK_NAME }},name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \
--container-image debian-11 \
--container-image gcr.io/google-containers/busybox \
--container-restart-policy=never \
--machine-type ${{ env.MACHINE_TYPE }} \
--scopes cloud-platform \
--metadata=google-monitoring-enabled=true,google-logging-enabled=true \
--metadata=google-monitoring-enabled=TRUE,google-logging-enabled=TRUE,enable-oslogin=TRUE \
--metadata-from-file=startup-script=.github/workflows/scripts/gcp-vm-startup-script.sh \
--tags ${{ inputs.app_name }} \
--zone ${{ env.ZONE }}
@ -383,10 +381,9 @@ jobs:
# but the cached state can be smaller if we just increased the disk size.)
- name: Create ${{ inputs.test_id }} Docker volume
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -452,10 +449,9 @@ jobs:
# TODO: we should find a better logic for this use cases
if: ${{ (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }}
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -502,10 +498,9 @@ jobs:
# TODO: we should find a better logic for this use cases
if: ${{ (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }}
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -568,10 +563,9 @@ jobs:
# Errors in the tests are caught by the final test status job.
- name: Show logs for ${{ inputs.test_id }} test (sprout)
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -627,10 +621,9 @@ jobs:
# Show recent logs, following until Canopy activation (or the test finishes)
- name: Show logs for ${{ inputs.test_id }} test (heartwood)
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -683,10 +676,9 @@ jobs:
# Show recent logs, following until NU5 activation (or the test finishes)
- name: Show logs for ${{ inputs.test_id }} test (canopy)
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -741,10 +733,9 @@ jobs:
# Show recent logs, following until block 1,740,000 (or the test finishes)
- name: Show logs for ${{ inputs.test_id }} test (1740k)
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -801,10 +792,9 @@ jobs:
# Show recent logs, following until block 1,760,000 (or the test finishes)
- name: Show logs for ${{ inputs.test_id }} test (1760k)
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -861,10 +851,9 @@ jobs:
# Show recent logs, following until block 1,780,000 (or the test finishes)
- name: Show logs for ${{ inputs.test_id }} test (1780k)
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -922,10 +911,9 @@ jobs:
# Show recent logs, following until block 1,800,000 (or the test finishes)
- name: Show logs for ${{ inputs.test_id }} test (1800k)
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -982,10 +970,9 @@ jobs:
# Show recent logs, following until block 1,820,000 (or the test finishes)
- name: Show logs for ${{ inputs.test_id }} test (1820k)
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -1041,10 +1028,9 @@ jobs:
# TODO: when doing obtain/extend tips, log the verifier in use, and check for full verification here
- name: Show logs for ${{ inputs.test_id }} test (checkpoint)
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -1111,10 +1097,9 @@ jobs:
# (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.)
- name: Result of ${{ inputs.test_id }} test
run: |
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
@ -1237,15 +1222,14 @@ jobs:
SYNC_HEIGHT=""
DOCKER_LOGS=$( \
gcloud compute ssh \
github-service-account@${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--ssh-key-expire-after=30s \
--zone ${{ env.ZONE }} \
--quiet \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=" \
docker logs ${{ inputs.test_id }} --tail 200 \
sudo docker logs ${{ inputs.test_id }} --tail 200 \
")
SYNC_HEIGHT=$( \
@ -1376,3 +1360,15 @@ jobs:
else
gcloud compute instances delete "${INSTANCE}" --zone "${{ env.ZONE }}" --delete-disks all --quiet
fi
# Deletes SSH keys generated during this workflow run, as GCP has a limit of SSH keys
# that can exist at the same time in the OS Login metadata. Not deleting this keys
# could cause the following error:
# `Login profile size exceeds 32 KiB. Delete profile values to make additional space`
- name: Delete temporal SSH keys
continue-on-error: true
run: |
for i in $(gcloud compute os-login ssh-keys list --format="table[no-heading](value.fingerprint)") --impersonate-service-account=github-service-account@zealous-zebra.iam.gserviceaccount.com; do
echo "$i";
gcloud compute os-login ssh-keys remove --key "$i" --impersonate-service-account=github-service-account@zealous-zebra.iam.gserviceaccount.com || true;
done