From 1d861b0d20b79cb9d2e9f66e54cd0f0fd5860a50 Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Sun, 28 Aug 2022 05:42:20 +1000
Subject: [PATCH] fix(ci): Increase full sync timeouts for longer syncs (#4961)

* Increase full sync timeout to 24 hours

Expected sync time is ~21 hours as of August 2022.

* Split the final checkpoint log job into two smaller jobs (logs-1760k and logs-checkpoint) to avoid timeouts

Also make regexes easier to read.

* Fix a job name typo
---
 .github/workflows/deploy-gcp-tests.yml | 86 +++++++++++++++++++++++---
 zebrad/tests/common/sync.rs            |  4 +-
 2 files changed, 81 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index f21739d23..e8da122c6 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -547,7 +547,12 @@ jobs:
           ${{ inputs.test_id }} | \
           tee --output-error=exit /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
-          '(estimated progress.*network_upgrade.*=.*Sapling)|(estimated progress.*network_upgrade.*=.*Blossom)|(estimated progress.*network_upgrade.*=.*Heartwood)|(estimated progress.*network_upgrade.*=.*Canopy)|(estimated progress.*network_upgrade.*=.*Nu5)|(test result:.*finished in)' \
+          -e 'estimated progress.*network_upgrade.*=.*Sapling' \
+          -e 'estimated progress.*network_upgrade.*=.*Blossom' \
+          -e 'estimated progress.*network_upgrade.*=.*Heartwood' \
+          -e 'estimated progress.*network_upgrade.*=.*Canopy' \
+          -e 'estimated progress.*network_upgrade.*=.*Nu5' \
+          -e 'test result:.*finished in' \
           "
 
   # follow the logs of the test we just launched, up to Canopy activation (or the test finishing)
@@ -602,7 +607,9 @@ jobs:
           ${{ inputs.test_id }} | \
           tee --output-error=exit /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
-          '(estimated progress.*network_upgrade.*=.*Canopy)|(estimated progress.*network_upgrade.*=.*Nu5)|(test result:.*finished in)' \
+          -e 'estimated progress.*network_upgrade.*=.*Canopy' \
+          -e 'estimated progress.*network_upgrade.*=.*Nu5' \
+          -e 'test result:.*finished in' \
           "
 
   # follow the logs of the test we just launched, up to NU5 activation (or the test finishing)
@@ -657,14 +664,14 @@ jobs:
           ${{ inputs.test_id }} | \
           tee --output-error=exit /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
-          '(estimated progress.*network_upgrade.*=.*Nu5)|(test result:.*finished in)' \
+          -e 'estimated progress.*network_upgrade.*=.*Nu5' \
+          -e 'test result:.*finished in' \
           "
 
   # follow the logs of the test we just launched, up to block 1,740,000 or later
   # (or the test finishing)
   #
   # We chose this height because it was about 5 hours into the NU5 sync, at the end of July 2022.
-  # This is a temporary workaround until we improve sync speeds.
   logs-1740k:
     name: Log ${{ inputs.test_id }} test (1740k)
     needs: [ logs-canopy ]
@@ -716,13 +723,77 @@ jobs:
           ${{ inputs.test_id }} | \
           tee --output-error=exit /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
-          '(estimated progress.*current_height.*=.*17[4-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks)|(estimated progress.*current_height.*=.*1[8-9][0-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks)|(estimated progress.*current_height.*=.*2[0-9][0-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks)|(test result:.*finished in)' \
+          -e 'estimated progress.*current_height.*=.*17[4-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks' \
+          -e 'estimated progress.*current_height.*=.*1[8-9][0-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks' \
+          -e 'estimated progress.*current_height.*=.*2[0-9][0-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks' \
+          -e 'test result:.*finished in' \
+          "
+
+  # follow the logs of the test we just launched, up to block 1,760,000 or later
+  # (or the test finishing)
+  #
+  # We chose this height because it was about 9 hours into the NU5 sync, at the end of August 2022.
+  logs-1760k:
+    name: Log ${{ inputs.test_id }} test (1760k)
+    needs: [ logs-1740k ]
+    # If the previous job fails, we still want to show the logs.
+    if: ${{ !cancelled() }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: 'read'
+      id-token: 'write'
+    steps:
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: '2'
+
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4
+        with:
+          short-length: 7
+
+      - name: Downcase network name for disks
+        run: |
+          NETWORK_CAPS=${{ inputs.network }}
+          echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV
+
+      # Setup gcloud CLI
+      - name: Authenticate to Google Cloud
+        id: auth
+        uses: google-github-actions/auth@v0.8.0
+        with:
+          retries: '3'
+          workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc'
+          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
+          token_format: 'access_token'
+
+      # Show all logs so far, then follow new logs until block 1,760,000 or later (or the test finishes)
+      - name: Show logs for ${{ inputs.test_id }} test (1760k)
+        run: |
+          gcloud compute ssh \
+          ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
+          --zone ${{ env.ZONE }} \
+          --quiet \
+          --ssh-flag="-o ServerAliveInterval=5" \
+          --command \
+          "\
+          docker logs \
+          --tail all \
+          --follow \
+          ${{ inputs.test_id }} | \
+          tee --output-error=exit /dev/stderr | \
+          grep --max-count=1 --extended-regexp --color=always \
+          -e 'estimated progress.*current_height.*=.*17[6-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks' \
+          -e 'estimated progress.*current_height.*=.*1[8-9][0-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks' \
+          -e 'estimated progress.*current_height.*=.*2[0-9][0-9][0-9][0-9][0-9][0-9].*remaining_sync_blocks' \
+          -e 'test result:.*finished in' \
           "
 
   # follow the logs of the test we just launched, up to the last checkpoint (or the test finishing)
   logs-checkpoint:
     name: Log ${{ inputs.test_id }} test (checkpoint)
-    needs: [ logs-1740k ]
+    needs: [ logs-1760k ]
     # If the previous job fails, we still want to show the logs.
     if: ${{ !cancelled() }}
     runs-on: ubuntu-latest
@@ -773,7 +844,8 @@ jobs:
           ${{ inputs.test_id }} | \
           tee --output-error=exit /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
-          '(verified final checkpoint)|(test result:.*finished in)' \
+          -e 'verified final checkpoint' \
+          -e 'test result:.*finished in' \
           "
 
   # follow the logs of the test we just launched, until it finishes
diff --git a/zebrad/tests/common/sync.rs b/zebrad/tests/common/sync.rs
index a9a8d283b..494f79ddc 100644
--- a/zebrad/tests/common/sync.rs
+++ b/zebrad/tests/common/sync.rs
@@ -363,8 +363,8 @@ pub fn create_cached_database_height(
 ) -> Result<()> {
     eprintln!("creating cached database");
 
-    // 20 hours
-    let timeout = Duration::from_secs(60 * 60 * 20);
+    // 24 hours: the expected full sync time is ~21 hours as of August 2022
+    let timeout = Duration::from_secs(24 * 60 * 60);
 
     // Use a persistent state, so we can handle large syncs
     let mut config = cached_mandatory_checkpoint_test_config()?;