From 1d861b0d20b79cb9d2e9f66e54cd0f0fd5860a50 Mon Sep 17 00:00:00 2001 From: teor Date: Sun, 28 Aug 2022 05:42:20 +1000 Subject: [PATCH] fix(ci): Increase full sync timeouts for longer syncs (#4961) * Increase full sync timeout to 24 hours Expected sync time is ~21 hours as of August 2022. * Split final checkpoint job into two smaller jobs to avoid timeouts Also make regexes easier to read. * Fix a job name typo --- .github/workflows/deploy-gcp-tests.yml | 86 +++++++++++++++++++++++--- zebrad/tests/common/sync.rs | 4 +- 2 files changed, 81 insertions(+), 9 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index f21739d23..e8da122c6 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -547,7 +547,12 @@ jobs: ${{ inputs.test_id }} | \ tee --output-error=exit /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ - '(estimated progress.*network_upgrade.*=.*Sapling)|(estimated progress.*network_upgrade.*=.*Blossom)|(estimated progress.*network_upgrade.*=.*Heartwood)|(estimated progress.*network_upgrade.*=.*Canopy)|(estimated progress.*network_upgrade.*=.*Nu5)|(test result:.*finished in)' \ + -e 'estimated progress.*network_upgrade.*=.*Sapling' \ + -e 'estimated progress.*network_upgrade.*=.*Blossom' \ + -e 'estimated progress.*network_upgrade.*=.*Heartwood' \ + -e 'estimated progress.*network_upgrade.*=.*Canopy' \ + -e 'estimated progress.*network_upgrade.*=.*Nu5' \ + -e 'test result:.*finished in' \ " # follow the logs of the test we just launched, up to Canopy activation (or the test finishing) @@ -602,7 +607,9 @@ jobs: ${{ inputs.test_id }} | \ tee --output-error=exit /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ - '(estimated progress.*network_upgrade.*=.*Canopy)|(estimated progress.*network_upgrade.*=.*Nu5)|(test result:.*finished in)' \ + -e 'estimated progress.*network_upgrade.*=.*Canopy' \ + -e 'estimated progress.*network_upgrade.*=.*Nu5' \ + -e 'test result:.*finished in' \ " # follow the logs of the test we just launched, up to NU5 activation (or the test finishing) @@ -657,14 +664,14 @@ jobs: ${{ inputs.test_id }} | \ tee --output-error=exit /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ - '(estimated progress.*network_upgrade.*=.*Nu5)|(test result:.*finished in)' \ + -e 'estimated progress.*network_upgrade.*=.*Nu5' \ + -e 'test result:.*finished in' \ " # follow the logs of the test we just launched, up to block 1,740,000 or later # (or the test finishing) # # We chose this height because it was about 5 hours into the NU5 sync, at the end of July 2022. - # This is a temporary workaround until we improve sync speeds. logs-1740k: name: Log ${{ inputs.test_id }} test (1740k) needs: [ logs-canopy ] @@ -716,13 +723,77 @@ jobs: ${{ inputs.test_id }} | \ tee --output-error=exit /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ - '(estimated progress.*current_height.*=.*17[4-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks)|(estimated progress.*current_height.*=.*1[8-9][0-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks)|(estimated progress.*current_height.*=.*2[0-9][0-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks)|(test result:.*finished in)' \ + -e 'estimated progress.*current_height.*=.*17[4-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks' \ + -e 'estimated progress.*current_height.*=.*1[8-9][0-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks' \ + -e 'estimated progress.*current_height.*=.*2[0-9][0-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks' \ + -e 'test result:.*finished in' \ + " + + # follow the logs of the test we just launched, up to block 1,760,000 or later + # (or the test finishing) + # + # We chose this height because it was about 9 hours into the NU5 sync, at the end of August 2022. + logs-1760k: + name: Log ${{ inputs.test_id }} test (1760k) + needs: [ logs-1740k ] + # If the previous job fails, we still want to show the logs. + if: ${{ !cancelled() }} + runs-on: ubuntu-latest + permissions: + contents: 'read' + id-token: 'write' + steps: + - uses: actions/checkout@v3.0.2 + with: + persist-credentials: false + fetch-depth: '2' + + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4 + with: + short-length: 7 + + - name: Downcase network name for disks + run: | + NETWORK_CAPS=${{ inputs.network }} + echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV + + # Setup gcloud CLI + - name: Authenticate to Google Cloud + id: auth + uses: google-github-actions/auth@v0.8.0 + with: + retries: '3' + workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc' + service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com' + token_format: 'access_token' + + # Show recent logs, following until block 1,760,000 (or the test finishes) + - name: Show logs for ${{ inputs.test_id }} test (1760k) + run: | + gcloud compute ssh \ + ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --zone ${{ env.ZONE }} \ + --quiet \ + --ssh-flag="-o ServerAliveInterval=5" \ + --command \ + "\ + docker logs \ + --tail all \ + --follow \ + ${{ inputs.test_id }} | \ + tee --output-error=exit /dev/stderr | \ + grep --max-count=1 --extended-regexp --color=always \ + -e 'estimated progress.*current_height.*=.*17[6-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks' \ + -e 'estimated progress.*current_height.*=.*1[8-9][0-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks' \ + -e 'estimated progress.*current_height.*=.*2[0-9][0-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks' \ + -e 'test result:.*finished in' \ " # follow the logs of the test we just launched, up to the last checkpoint (or the test finishing) logs-checkpoint: name: Log ${{ inputs.test_id }} test (checkpoint) - needs: [ logs-1740k ] + needs: [ logs-1760k ] # If the previous job fails, we still want to show the logs. if: ${{ !cancelled() }} runs-on: ubuntu-latest @@ -773,7 +844,8 @@ jobs: ${{ inputs.test_id }} | \ tee --output-error=exit /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ - '(verified final checkpoint)|(test result:.*finished in)' \ + -e 'verified final checkpoint' \ + -e 'test result:.*finished in' \ " # follow the logs of the test we just launched, until it finishes diff --git a/zebrad/tests/common/sync.rs b/zebrad/tests/common/sync.rs index a9a8d283b..494f79ddc 100644 --- a/zebrad/tests/common/sync.rs +++ b/zebrad/tests/common/sync.rs @@ -363,8 +363,8 @@ pub fn create_cached_database_height( ) -> Result<()> { eprintln!("creating cached database"); - // 20 hours - let timeout = Duration::from_secs(60 * 60 * 20); + // 24 hours + let timeout = Duration::from_secs(24 * 60 * 60); // Use a persistent state, so we can handle large syncs let mut config = cached_mandatory_checkpoint_test_config()?;