# Source path: zebra/.github/workflows/test-full-sync.yml
#
# File viewer metadata: 294 lines, 12 KiB, YAML

# Workflow that builds a test image and runs a full chain sync on a
# Google Cloud compute instance.
name: Full sync test

# Triggers:
#   - manual dispatch, with a selectable network and checkpoint setting
#   - pull requests targeting `main` that touch the listed paths
#   - pushes to `main` that touch the same paths
on:
  workflow_dispatch:
    inputs:
      network:
        default: 'Mainnet'
        description: 'Network to deploy: Mainnet or Testnet'
        required: true
      checkpoint_sync:
        default: 'true'
        description: 'Configures `zebrad` to use as many checkpoints as possible'
        required: true
  pull_request:
    branches:
      - main
    paths:
      # code and tests (including full sync acceptance test changes)
      # TODO: ignore changes in test code that isn't used in the full sync test
      - '**/*.rs'
      # hard-coded checkpoints
      # TODO: ignore changes to proptest seed .txt files
      - '**/*.txt'
      # dependencies
      - '**/Cargo.toml'
      - '**/Cargo.lock'
      # workflow definitions
      - 'docker/**'
      - '.github/workflows/test-full-sync.yml'
  push:
    branches:
      - main
    # Keep this list in sync with the `pull_request` paths above.
    paths:
      # code and tests (including full sync acceptance test changes)
      # TODO: ignore changes in test code that isn't used in the full sync test
      - '**/*.rs'
      # hard-coded checkpoints
      # TODO: ignore changes to proptest seed .txt files
      - '**/*.txt'
      # dependencies
      - '**/Cargo.toml'
      - '**/Cargo.lock'
      # workflow definitions
      - 'docker/**'
      - '.github/workflows/test-full-sync.yml'
# Workflow-level environment variables, referenced by the jobs as `env.*`.
env:
  CARGO_INCREMENTAL: '1'
  ZEBRA_SKIP_IPV6_TESTS: '1'
  # Backtrace settings; these three are also forwarded as Docker build args.
  RUST_BACKTRACE: full
  RUST_LIB_BACKTRACE: full
  COLORBT_SHOW_HIDDEN: '1'
  # Default network, used when no `network` dispatch input is provided.
  NETWORK: Mainnet
  # Google Cloud project, used in the log query that finds the container name.
  PROJECT_ID: zealous-zebra
  # Registry base paths used to build the image names and the build cache ref.
  GAR_BASE: us-docker.pkg.dev/zealous-zebra/zebra
  GCR_BASE: gcr.io/zealous-zebra
  REGION: us-central1
  # Compute zone for the test instance and the source disk of the cached image.
  ZONE: us-central1-a
  # Machine type of the instance that runs the sync.
  MACHINE_TYPE: c2d-standard-16
  # Image name appended to the registry base paths.
  IMAGE_NAME: zebrad-test
jobs:
  # Builds the `tester` target of ./docker/Dockerfile and pushes it under the
  # tags generated by the "Docker meta" step.
  build:
    # TODO add `startsWith(github.head_ref, 'mergify/merge-queue/')` to the condition to
    # only run on Mergify head branches, and on manual dispatch:
    # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#running-your-workflow-based-on-the-head-or-base-branch-of-a-pull-request-1
    #
    # This condition excludes `pull_request` events, so the job is skipped for them.
    if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
    name: Build images
    timeout-minutes: 210
    runs-on: ubuntu-latest
    permissions:
      contents: 'read'
      # Lets the job request an OIDC token for the Google Cloud auth step.
      id-token: 'write'
    steps:
      - uses: actions/checkout@v3.0.1
        with:
          persist-credentials: false

      # Provides the GITHUB_*_SLUG_URL and GITHUB_SHA_SHORT variables used below.
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4
        with:
          short-length: 7

      # Automatic tag management and OCI Image Format Specification for labels
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v3.7.0
        with:
          # list of Docker images to use as base name for tags
          images: |
            ${{ env.GAR_BASE }}/${{ env.IMAGE_NAME }}
            ${{ env.GCR_BASE }}/${{ env.GITHUB_REPOSITORY_SLUG_URL }}/${{ env.IMAGE_NAME }}
          # generate Docker tags based on the following events/attributes
          tags: |
            type=schedule
            type=ref,event=branch
            type=ref,event=pr
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=semver,pattern={{major}}
            type=sha

      # Setup Docker Buildx to allow use of docker cache layers from GH
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v1

      # The access token from this step is the password for both registry logins.
      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@v0.7.0
        with:
          workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc'
          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
          token_format: 'access_token'

      - name: Login to Google Artifact Registry
        uses: docker/login-action@v1.14.1
        with:
          registry: us-docker.pkg.dev
          username: oauth2accesstoken
          password: ${{ steps.auth.outputs.access_token }}

      - name: Login to Google Container Registry
        uses: docker/login-action@v1.14.1
        with:
          registry: gcr.io
          username: oauth2accesstoken
          password: ${{ steps.auth.outputs.access_token }}

      # Build and push image to Google Artifact Registry
      - name: Build & push
        id: docker_build
        uses: docker/build-push-action@v2.10.0
        with:
          target: tester
          context: .
          file: ./docker/Dockerfile
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          # Dispatch inputs take precedence; the fallbacks apply to `push` runs.
          build-args: |
            NETWORK=${{ github.event.inputs.network || env.NETWORK }}
            SHORT_SHA=${{ env.GITHUB_SHA_SHORT }}
            RUST_BACKTRACE=${{ env.RUST_BACKTRACE }}
            RUST_LIB_BACKTRACE=${{ env.RUST_LIB_BACKTRACE }}
            COLORBT_SHOW_HIDDEN=${{ env.COLORBT_SHOW_HIDDEN }}
            ZEBRA_SKIP_NETWORK_TESTS="1"
            CHECKPOINT_SYNC=${{ github.event.inputs.checkpoint_sync || true }}
            RUST_LOG=debug
            SENTRY_DSN=${{ secrets.SENTRY_ENDPOINT }}
          push: true
          # Per-branch registry cache, read and written under the same ref.
          cache-from: type=registry,ref=${{ env.GAR_BASE }}/${{ env.IMAGE_NAME }}:${{ env.GITHUB_REF_SLUG_URL }}-buildcache
          cache-to: type=registry,ref=${{ env.GAR_BASE }}/${{ env.IMAGE_NAME }}:${{ env.GITHUB_REF_SLUG_URL }}-buildcache,mode=max
# Test that Zebra can run a full mainnet sync after a PR is approved
  #
  # Outline: create a container VM from the image tagged with this commit's
  # short SHA, follow the container logs until it exits, create a cached-state
  # image from the instance disk, then delete the instance.
  test-full-sync:
    name: Test full Mainnet sync
    runs-on: ubuntu-latest
    needs: [build]
    permissions:
      contents: 'read'
      # Lets the job request an OIDC token for the Google Cloud auth step.
      id-token: 'write'
    steps:
      - uses: actions/checkout@v3.0.1
        with:
          persist-credentials: false

      # Provides the GITHUB_REF_SLUG_URL and GITHUB_SHA_SHORT variables used below.
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4
        with:
          short-length: 7

      # Overwrites NETWORK with its lower-case form for later steps; it is
      # used in the cached image name.
      - name: Downcase network name for disks
        run: |
          NETWORK_CAPS=${{ github.event.inputs.network || env.NETWORK }}
          echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV

      # Setup gcloud CLI
      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@v0.7.0
        with:
          workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc'
          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
          token_format: 'access_token'

      # Check if our destination compute instance exists and delete it
      - name: Delete existing instance with same SHA
        run: |
          INSTANCE=$(gcloud compute instances list --filter=full-sync-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} --format='value(NAME)')
          if [ -z "${INSTANCE}" ]; then
            echo "No instance to delete"
          else
            gcloud compute instances delete "${INSTANCE}" --zone "${{ env.ZONE }}" --delete-disks all --quiet
          fi

      # Creates Compute Engine virtual machine instance w/ disks
      - name: Create GCP compute instance
        run: |
          gcloud compute instances create-with-container "full-sync-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" \
          --boot-disk-size 100GB \
          --boot-disk-type pd-ssd \
          --container-image ${{ env.GAR_BASE }}/${{ env.IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \
          --container-restart-policy=never \
          --container-stdin \
          --container-tty \
          --container-env=ZEBRA_SKIP_IPV6_TESTS=1,TEST_FULL_SYNC=1,ZEBRA_FORCE_USE_COLOR=1,FULL_SYNC_MAINNET_TIMEOUT_MINUTES=600 \
          --machine-type ${{ env.MACHINE_TYPE }} \
          --scopes cloud-platform \
          --metadata=google-monitoring-enabled=true,google-logging-enabled=true \
          --tags zebrad \
          --zone "${{ env.ZONE }}"

      # TODO: this approach is very messy, but getting the just created container name is very error prone and GCP doesn't have a workaround for this without requiring a TTY
      # This TODO relates to the following issues:
      # https://github.com/actions/runner/issues/241
      # https://www.googlecloudcommunity.com/gc/Infrastructure-Compute-Storage/SSH-into-Compute-Container-not-easily-possible/td-p/170915
      #
      # Deploying a zebra container might take more than 30 seconds to completely start, so we're adding a timer at the end
      # of this step before starting the following ones
      #
      # NOTE(review): the polling loop has no retry limit; it only ends when a
      # matching log entry is found or the job itself is stopped.
      - name: Get container name from logs
        run: |
          INSTANCE_ID=$(gcloud compute instances describe full-sync-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} --zone ${{ env.ZONE }} --format='value(id)')
          echo "INSTANCE_ID=$INSTANCE_ID" >> $GITHUB_ENV
          CONTAINER_NAME=""
          while [[ ${CONTAINER_NAME} != *"full-sync-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}"* ]]; do
            CONTAINER_NAME=$(gcloud logging read 'log_name=projects/${{ env.PROJECT_ID }}/logs/cos_system AND jsonPayload.MESSAGE:full-sync-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}' --format='value(jsonPayload.MESSAGE)' --limit=1 | grep -o '...-full-sync-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-....' | tr -d "'.")
            echo "Using container: ${CONTAINER_NAME} from instance: ${INSTANCE_ID}"
            sleep 10
          done
          echo "CONTAINER_NAME=$CONTAINER_NAME" >> $GITHUB_ENV
          sleep 90

      # Streams the container logs until the container stops, then uses the
      # container's exit status (from `docker wait`) as this step's result.
      - name: Full sync
        id: full-sync
        run: |
          gcloud compute ssh \
          full-sync-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
          --zone ${{ env.ZONE }} \
          --quiet \
          --ssh-flag="-o ServerAliveInterval=5" \
          --command="docker logs --follow ${{ env.CONTAINER_NAME }}"
          EXIT_CODE=$(\
          gcloud compute ssh \
          full-sync-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
          --zone ${{ env.ZONE }} \
          --quiet \
          --ssh-flag="-o ServerAliveInterval=5" \
          --command="docker wait ${{ env.CONTAINER_NAME }}")
          exit ${EXIT_CODE}

      # Takes the last number matched on a DATABASE_FORMAT_VERSION line of the
      # checked-out constants.rs and exports it as STATE_VERSION.
      - name: Get state version from constants.rs
        run: |
          STATE_VERSION=""
          LOCAL_STATE_VERSION=$(grep -oE "DATABASE_FORMAT_VERSION: .* [0-9]+" $GITHUB_WORKSPACE/zebra-state/src/constants.rs | grep -oE "[0-9]+" | tail -n1)
          echo "STATE_VERSION: $LOCAL_STATE_VERSION"
          echo "STATE_VERSION=$LOCAL_STATE_VERSION" >> $GITHUB_ENV

      # Polls the instance logs for the "finished initial sync to chain tip"
      # message and exports the height it reports as SYNC_HEIGHT.
      # `|| [[ $? == 1 ]]` accepts a "no match" exit status from the pipeline,
      # so an empty result retries instead of failing the step.
      #
      # NOTE(review): the filter matches on `jsonPayload.message` while
      # `--format` prints `jsonPayload.MESSAGE` — confirm both field names
      # resolve on these log entries. The loop also has no retry limit.
      - name: Get sync height from logs
        run: |
          SYNC_HEIGHT=""
          while [[ ${SYNC_HEIGHT} == "" ]]; do
            SYNC_HEIGHT=$(gcloud logging read --format='value(jsonPayload.MESSAGE)' --order="desc" --limit=1 '(resource.labels.instance_id="${{ env.INSTANCE_ID }}" AND jsonPayload.message=~".+finished initial sync to chain tip.+Height\([0-9]+\).+")' | grep -oE 'Height\([0-9]+\)' | grep -oE '[0-9]+' || [[ $? == 1 ]] )
            echo "SYNC_HEIGHT: $SYNC_HEIGHT"
            sleep 10
          done
          echo "SYNC_HEIGHT=$SYNC_HEIGHT" >> $GITHUB_ENV

      # Create image from disk
      # Force the image creation as the disk is still attached, even though it is not being used by the container
      - name: Create image from state disk
        run: |
          gcloud compute images create zebrad-cache-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${{ env.NETWORK }}-tip \
          --force \
          --source-disk=full-sync-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
          --source-disk-zone=${{ env.ZONE }} \
          --storage-location=us \
          --description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }}"

      - name: Delete test instance
        # Do not delete the instance if the sync times out in GitHub.
        #
        # `always()` is required here: a step `if` without a status-check
        # function is implicitly combined with `success()`, which would skip
        # this step after any failed step and make the `failure` case unreachable.
        if: ${{ always() && (steps.full-sync.outcome == 'success' || steps.full-sync.outcome == 'failure') }}
        continue-on-error: true
        run: |
          gcloud compute instances delete "full-sync-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" --zone "${{ env.ZONE }}" --delete-disks all --quiet