Merge branch 'main' into docker-refactor

2024-10-18 11:09:26 +02:00 · 2024-10-18 11:09:26 +02:00 · afeb05f50b
parent 4fa064c646 a182ffe098
commit afeb05f50b
7 changed files with 142 additions and 103 deletions
--- a/.github/workflows/cd-deploy-nodes-gcp.yml
+++ b/.github/workflows/cd-deploy-nodes-gcp.yml
@ -1,6 +1,6 @@
 # Google Cloud node deployments and tests that run when Rust code or dependencies are modified,
 # but only on PRs from the ZcashFoundation/zebra repository.
-# (External PRs are tested/deployed by mergify.) 
+# (External PRs are tested/deployed by mergify.)
 #
 # 1. `versioning`: Extracts the major version from the release semver. Useful for segregating instances based on major versions.
 # 2. `build`: Builds a Docker image named `zebrad` with the necessary tags derived from Git.
@ -30,58 +30,80 @@ on:
  workflow_dispatch:
    inputs:
      network:
-        default: 'Mainnet'
-        description: 'Network to deploy: Mainnet or Testnet'
+        default: Mainnet
+        description: "Network to deploy: Mainnet or Testnet"
        required: true
-      log_file:
-        default: ''
-        description: 'Log to a file path rather than standard output'
+        type: choice
+        options:
+          - Mainnet
+          - Testnet
+      cached_disk_type:
+        default: tip
+        description: "Type of cached disk to use"
+        required: true
+        type: choice
+        options:
+          - tip
+          - checkpoint
+      prefer_main_cached_state:
+        default: false
+        description: "Prefer cached state from the main branch"
+        required: false
+        type: boolean
+      no_cached_disk:
+        default: false
+        description: "Do not use a cached state disk"
+        required: false
+        type: boolean
      no_cache:
-        description: 'Disable the Docker cache for this build'
+        description: "Disable the Docker cache for this build"
        required: false
        type: boolean
        default: false
+      log_file:
+        default: ""
+        description: "Log to a file path rather than standard output"

  push:
-   # Skip main branch updates where Rust code and dependencies aren't modified.
-   branches:
-     - main
-   paths:
-     # code and tests
-     - '**/*.rs'
-     # hard-coded checkpoints and proptest regressions
-     - '**/*.txt'
-     # dependencies
-     - '**/Cargo.toml'
-     - '**/Cargo.lock'
-     # configuration files
-     - '.cargo/config.toml'
-     - '**/clippy.toml'
-     # workflow definitions
-     - 'docker/**'
-     - '.dockerignore'
-     - '.github/workflows/cd-deploy-nodes-gcp.yml'
-     - '.github/workflows/sub-build-docker-image.yml'
+    # Skip main branch updates where Rust code and dependencies aren't modified.
+    branches:
+      - main
+    paths:
+      # code and tests
+      - "**/*.rs"
+      # hard-coded checkpoints and proptest regressions
+      - "**/*.txt"
+      # dependencies
+      - "**/Cargo.toml"
+      - "**/Cargo.lock"
+      # configuration files
+      - ".cargo/config.toml"
+      - "**/clippy.toml"
+      # workflow definitions
+      - "docker/**"
+      - ".dockerignore"
+      - ".github/workflows/cd-deploy-nodes-gcp.yml"
+      - ".github/workflows/sub-build-docker-image.yml"

  # Only runs the Docker image tests, doesn't deploy any instances
  pull_request:
    # Skip PRs where Rust code and dependencies aren't modified.
    paths:
      # code and tests
-      - '**/*.rs'
+      - "**/*.rs"
      # hard-coded checkpoints and proptest regressions
-      - '**/*.txt'
+      - "**/*.txt"
      # dependencies
-      - '**/Cargo.toml'
-      - '**/Cargo.lock'
+      - "**/Cargo.toml"
+      - "**/Cargo.lock"
      # configuration files
-      - '.cargo/config.toml'
-      - '**/clippy.toml'
+      - ".cargo/config.toml"
+      - "**/clippy.toml"
      # workflow definitions
-      - 'docker/**'
-      - '.dockerignore'
-      - '.github/workflows/cd-deploy-nodes-gcp.yml'
-      - '.github/workflows/sub-build-docker-image.yml'
+      - "docker/**"
+      - ".dockerignore"
+      - ".github/workflows/cd-deploy-nodes-gcp.yml"
+      - ".github/workflows/sub-build-docker-image.yml"

  release:
    types:
@ -144,11 +166,11 @@ jobs:
    needs: build
    uses: ./.github/workflows/sub-test-zebra-config.yml
    with:
-      test_id: 'default-conf'
+      test_id: "default-conf"
      docker_image: ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }}
      grep_patterns: '-e "net.*=.*Main.*estimated progress to chain tip.*BeforeOverwinter"'
-      test_variables: '-e NETWORK'
-      network: 'Mainnet'
+      test_variables: "-e NETWORK"
+      network: "Mainnet"

  # Test reconfiguring the docker image for testnet.
  test-configuration-file-testnet:
@ -157,11 +179,11 @@ jobs:
    # Make sure Zebra can sync the genesis block on testnet
    uses: ./.github/workflows/sub-test-zebra-config.yml
    with:
-      test_id: 'testnet-conf'
+      test_id: "testnet-conf"
      docker_image: ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }}
      grep_patterns: '-e "net.*=.*Test.*estimated progress to chain tip.*Genesis" -e "net.*=.*Test.*estimated progress to chain tip.*BeforeOverwinter"'
-      test_variables: '-e NETWORK'
-      network: 'Testnet'
+      test_variables: "-e NETWORK"
+      network: "Testnet"

  # Finds a cached state disk for zebra (a `tip` disk from the main branch, unless overridden by the workflow inputs)
  #
@ -170,11 +192,12 @@ jobs:
  get-disk-name:
    name: Get disk name
    uses: ./.github/workflows/sub-find-cached-disks.yml
+    # Skip the disk lookup when a manual run asked for no cached disk.
+    if: ${{ !inputs.no_cached_disk }}
    with:
      network: ${{ inputs.network || vars.ZCASH_NETWORK }}
      disk_prefix: zebrad-cache
-      disk_suffix: tip
-      prefer_main_cached_state: true
+      # Use the requested disk type, falling back to `tip` when no input is provided.
+      disk_suffix: ${{ inputs.cached_disk_type || 'tip' }}
+      # Prefer `main` branch disks when explicitly requested, or on pushes to `main`.
+      prefer_main_cached_state: ${{ inputs.prefer_main_cached_state || (github.event_name == 'push' && github.ref_name == 'main') }}

  # Deploy Managed Instance Groups (MiGs) for Mainnet and Testnet,
  # with one node in the configured GCP region.
@ -196,14 +219,21 @@ jobs:
      matrix:
        network: [Mainnet, Testnet]
    name: Deploy ${{ matrix.network }} nodes
-    needs: [ build, versioning, test-configuration-file, test-zebra-conf-path, get-disk-name ]
+    needs:
+      [
+        build,
+        versioning,
+        test-configuration-file,
+        test-zebra-conf-path,
+        get-disk-name,
+      ]
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      CACHED_DISK_NAME: ${{ needs.get-disk-name.outputs.cached_disk_name }}
    permissions:
-      contents: 'read'
-      id-token: 'write'
+      contents: "read"
+      id-token: "write"
    if: ${{ !cancelled() && !failure() && ((github.event_name == 'push' && github.ref_name == 'main') || github.event_name == 'release') }}

    steps:
@ -232,20 +262,20 @@ jobs:
        id: auth
        uses: google-github-actions/auth@v2.1.6
        with:
-          workload_identity_provider: '${{ vars.GCP_WIF }}'
-          service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'
+          workload_identity_provider: "${{ vars.GCP_WIF }}"
+          service_account: "${{ vars.GCP_DEPLOYMENTS_SA }}"

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2.1.1

-      # TODO we should implement the fixes from https://github.com/ZcashFoundation/zebra/pull/5670 here
-      # but the implementation is failing as it's requiring the disk names, contrary to what is stated in the official documentation
      - name: Create instance template for ${{ matrix.network }}
        run: |
-          NAME="zebrad-cache-${{ env.GITHUB_HEAD_REF_SLUG_URL || env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-${NETWORK}"
-          DISK_PARAMS="name=${NAME},device-name=${NAME},size=400GB,type=pd-ssd"
+          DISK_NAME="zebrad-cache-${{ env.GITHUB_HEAD_REF_SLUG_URL || env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-${NETWORK}"
+          DISK_PARAMS="name=${DISK_NAME},device-name=${DISK_NAME},size=400GB,type=pd-ssd"
          if [ -n "${{ env.CACHED_DISK_NAME }}" ]; then
            DISK_PARAMS+=",image=${{ env.CACHED_DISK_NAME }}"
+          # GitHub substitutes the expression below with the text `true` or `false`
+          # (or with nothing when the input is unset) before the shell runs. A
+          # one-argument `[ ... ]` test succeeds for ANY non-empty word, including
+          # `false`, so the rendered value is compared against "true" explicitly.
+          elif [ "${{ inputs.no_cached_disk && github.event_name == 'workflow_dispatch' }}" = "true" ]; then
+            echo "No cached disk required"
          else
            echo "No cached disk found for ${{ matrix.network }} in main branch"
            exit 1
@ -258,7 +288,7 @@ jobs:
          --image-family=cos-stable \
          --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \
          --create-disk="${DISK_PARAMS}" \
-          --container-mount-disk=mount-path='/var/cache/zebrad-cache',name=${NAME},mode=rw \
+          --container-mount-disk=mount-path='/var/cache/zebrad-cache',name=${DISK_NAME},mode=rw \
          --container-stdin \
          --container-tty \
          --container-image ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} \
@ -306,15 +336,16 @@ jobs:
  # Note: these instances are not automatically replaced or deleted
  deploy-instance:
    name: Deploy single ${{ inputs.network }} instance
-    needs: [ build, test-configuration-file, test-zebra-conf-path, get-disk-name ]
+    needs: [build, test-configuration-file, test-zebra-conf-path, get-disk-name]
    runs-on: ubuntu-latest
    timeout-minutes: 30
    env:
      CACHED_DISK_NAME: ${{ needs.get-disk-name.outputs.cached_disk_name }}
    permissions:
-      contents: 'read'
-      id-token: 'write'
-    if: github.event_name == 'workflow_dispatch'
+      contents: "read"
+      id-token: "write"
+    # Run even if we don't need a cached disk, but only when triggered by a workflow_dispatch
+    if: ${{ !failure() && github.event_name == 'workflow_dispatch' }}

    steps:
      - uses: actions/checkout@v4.2.1
@ -342,8 +373,8 @@ jobs:
        id: auth
        uses: google-github-actions/auth@v2.1.6
        with:
-          workload_identity_provider: '${{ vars.GCP_WIF }}'
-          service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'
+          workload_identity_provider: "${{ vars.GCP_WIF }}"
+          service_account: "${{ vars.GCP_DEPLOYMENTS_SA }}"

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2.1.1
@ -351,10 +382,12 @@ jobs:
      # Create instance template from container image
      - name: Manual deploy of a single ${{ inputs.network }} instance running zebrad
        run: |
-          NAME="zebrad-cache-${{ env.GITHUB_HEAD_REF_SLUG_URL || env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-${NETWORK}"
-          DISK_PARAMS="name=${NAME},device-name=${NAME},size=400GB,type=pd-ssd"
+          DISK_NAME="zebrad-cache-${{ env.GITHUB_HEAD_REF_SLUG_URL || env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}-${NETWORK}"
+          DISK_PARAMS="name=${DISK_NAME},device-name=${DISK_NAME},size=400GB,type=pd-ssd"
          if [ -n "${{ env.CACHED_DISK_NAME }}" ]; then
            DISK_PARAMS+=",image=${{ env.CACHED_DISK_NAME }}"
+          # GitHub substitutes the expression below with the text `true` or `false`
+          # (or with nothing when the input is unset) before the shell runs. A
+          # one-argument `[ ... ]` test succeeds for ANY non-empty word, including
+          # `false`, so the rendered value is compared against "true" explicitly.
+          elif [ "${{ inputs.no_cached_disk && github.event_name == 'workflow_dispatch' }}" = "true" ]; then
+            echo "No cached disk required"
          else
-            echo "No cached disk found for ${{ matrix.network }} in main branch"
+            echo "No cached disk found for ${{ inputs.network }} in main branch"
            exit 1
@ -367,7 +400,7 @@ jobs:
          --image-family=cos-stable \
          --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \
          --create-disk="${DISK_PARAMS}" \
-          --container-mount-disk=mount-path='/var/cache/zebrad-cache',name=${NAME},mode=rw \
+          --container-mount-disk=mount-path='/var/cache/zebrad-cache',name=${DISK_NAME},mode=rw \
          --container-stdin \
          --container-tty \
          --container-image ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} \
@ -382,7 +415,7 @@ jobs:
  failure-issue:
    name: Open or update issues for release failures
    # When a new job is added to this workflow, add it to this list.
-    needs: [ versioning, build, deploy-nodes, deploy-instance ]
+    needs: [versioning, build, deploy-nodes, deploy-instance]
    # Only open tickets for failed or cancelled jobs that are not coming from PRs.
    # (PR statuses are already reported in the PR jobs list, and checked by Mergify.)
    if: (failure() && github.event.pull_request == null) || (cancelled() && github.event.pull_request == null)
--- a/.github/workflows/ci-lint.yml
+++ b/.github/workflows/ci-lint.yml
@ -93,7 +93,7 @@ jobs:
        run: |
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain=stable --profile=default

-      - uses: Swatinem/rust-cache@v2.7.3
+      - uses: Swatinem/rust-cache@v2.7.5
        with:
          shared-key: "clippy-cargo-lock"

@ -138,7 +138,7 @@ jobs:

      # We don't cache `fmt` outputs because the job is quick,
      # and we want to use the limited GitHub actions cache space for slower jobs.
-      #- uses: Swatinem/rust-cache@v2.7.3
+      #- uses: Swatinem/rust-cache@v2.7.5

      - run: |
          cargo fmt --all -- --check
--- a/.github/workflows/ci-unit-tests-os.yml
+++ b/.github/workflows/ci-unit-tests-os.yml
@ -112,7 +112,7 @@ jobs:
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain=${{ matrix.rust }} --profile=minimal


-      - uses: Swatinem/rust-cache@v2.7.3
+      - uses: Swatinem/rust-cache@v2.7.5
        # TODO: change Rust cache target directory on Windows,
        #       or remove this workaround once the build is more efficient (#3005).
        #with:
@ -221,7 +221,7 @@ jobs:
        run: |
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain=stable --profile=minimal

-      - uses: Swatinem/rust-cache@v2.7.3
+      - uses: Swatinem/rust-cache@v2.7.5
        with:
          shared-key: "clippy-cargo-lock"

--- a/.github/workflows/docs-deploy-firebase.yml
+++ b/.github/workflows/docs-deploy-firebase.yml
@ -155,7 +155,7 @@ jobs:
        run: |
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain=beta --profile=default

-      - uses: Swatinem/rust-cache@v2.7.3
+      - uses: Swatinem/rust-cache@v2.7.5

      - name: Build internal docs
        run: |
--- a/.github/workflows/scripts/gcp-get-cached-disks.sh
+++ b/.github/workflows/scripts/gcp-get-cached-disks.sh
@ -3,9 +3,9 @@
 # This script finds a cached Google Cloud Compute image based on specific criteria.
 #
 # If there are multiple disks:
-# - prefer images generated from the same commit, then
-# - if prefer_main_cached_state is true, prefer images from the `main` branch, then
-# - use any images from any other branch or commit.
+# - if `PREFER_MAIN_CACHED_STATE` is "true", then select an image from the `main` branch, else
+# - try to find a cached disk image from the current branch (or PR), else
+# - try to find an image from any branch.
 #
 # Within each of these categories:
 # - prefer newer images to older images
@ -20,7 +20,7 @@ echo "Extracting local state version..."
 LOCAL_STATE_VERSION=$(grep -oE "DATABASE_FORMAT_VERSION: .* [0-9]+" "${GITHUB_WORKSPACE}/zebra-state/src/constants.rs" | grep -oE "[0-9]+" | tail -n1)
 echo "STATE_VERSION: ${LOCAL_STATE_VERSION}"

-# Function to find a cached disk image based on the git pattern (commit, main, or any branch)
+# Function to find a cached disk image based on the git pattern (branch, main, or any branch)
 find_cached_disk_image() {
    local git_pattern="${1}"
    local git_source="${2}"
@ -34,40 +34,36 @@ find_cached_disk_image() {
        echo "Found ${git_source} Disk: ${disk_name}" >&2
        disk_description=$(gcloud compute images describe "${disk_name}" --format="value(DESCRIPTION)")
        echo "Description: ${disk_description}" >&2
-        echo "${disk_name}"  # This is the actual return value when a disk is found
+        echo "${disk_name}" # This is the actual return value when a disk is found
    else
-        echo "No ${git_source} disk found." >&2
+        echo "No ${git_source} disk found with '${disk_search_pattern}' pattern." >&2
    fi
 }

-# Check if both $DISK_PREFIX and $DISK_SUFFIX are set, as they are required to find a cached disk image
+# Check if both $DISK_PREFIX and $DISK_SUFFIX are set, as they are required to
+# find a cached disk image.
 if [[ -n "${DISK_PREFIX}" && -n "${DISK_SUFFIX}" ]]; then
    # Find the most suitable cached disk image
-    echo "Finding the most suitable cached disk image..."
+    echo "Finding a ${DISK_PREFIX}-${DISK_SUFFIX} disk image for ${NETWORK}..."
    CACHED_DISK_NAME=""

-    # First, try to find a cached disk image from the current commit
-    CACHED_DISK_NAME=$(find_cached_disk_image ".+-${GITHUB_SHA_SHORT}" "commit")
-
-    # If no cached disk image is found
-    if [[ -z "${CACHED_DISK_NAME}" ]]; then
-        # Check if main branch images are preferred
-        if [[ "${PREFER_MAIN_CACHED_STATE}" == "true" ]]; then
-            CACHED_DISK_NAME=$(find_cached_disk_image "main-[0-9a-f]+" "main branch")
-        # Else, try to find one from any branch
-        else
-            CACHED_DISK_NAME=$(find_cached_disk_image ".+-[0-9a-f]+" "any branch")
-        fi
+    # Try to find an image based on the `main` branch if that branch is preferred.
+    if [[ "${PREFER_MAIN_CACHED_STATE}" == "true" ]]; then
+        CACHED_DISK_NAME=$(find_cached_disk_image "main-[0-9a-f]+" "main branch")
    fi
+    # If no image was found, try to find one from the current branch (or PR).
+    CACHED_DISK_NAME=${CACHED_DISK_NAME:-$(find_cached_disk_image ".+-${GITHUB_REF}" "branch")}
+    # If we still have no image, try to find one from any branch.
+    CACHED_DISK_NAME=${CACHED_DISK_NAME:-$(find_cached_disk_image ".+-[0-9a-f]+" "any branch")}

-    # Handle case where no suitable disk image is found
+    # Handle the case where no suitable disk image is found
    if [[ -z "${CACHED_DISK_NAME}" ]]; then
-        echo "No suitable cached state disk available."
-        echo "Cached state test jobs must depend on the cached state rebuild job."
+        echo "No suitable cached state disk available. Try running the cached state rebuild job."
        exit 1
+    else
+        echo "Selected Disk: ${CACHED_DISK_NAME}"
    fi

-    echo "Selected Disk: ${CACHED_DISK_NAME}"
 else
    echo "DISK_PREFIX or DISK_SUFFIX is not set. Skipping disk image search."
 fi
@ -77,7 +73,6 @@ find_available_disk_type() {
    local base_name="${1}"
    local disk_type="${2}"
    local disk_pattern="${base_name}-cache"
-    local output_var="${base_name}_${disk_type}_disk"
    local disk_name

    disk_name=$(gcloud compute images list --filter="status=READY AND name~${disk_pattern}-.+-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${disk_type}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
@ -87,10 +82,10 @@ find_available_disk_type() {
        echo "Found ${disk_type^^} disk: ${disk_name} for ${base_name^^} on network: ${NETWORK}" >&2
        disk_description=$(gcloud compute images describe "${disk_name}" --format="value(DESCRIPTION)")
        echo "Description: ${disk_description}" >&2
-        echo "true"  # This is the actual return value when a disk is found
+        echo "true" # This is the actual return value when a disk is found
    else
        echo "No ${disk_type^^} disk found for ${base_name^^} on network: ${NETWORK}" >&2
-        echo "false"  # This is the actual return value when no disk is found
+        echo "false" # This is the actual return value when no disk is found
    fi
 }
 if [[ -n "${NETWORK}" ]]; then
--- a/.github/workflows/sub-deploy-integration-tests-gcp.yml
+++ b/.github/workflows/sub-deploy-integration-tests-gcp.yml
@ -654,6 +654,7 @@ jobs:
      # (This is unlikely, because each image created by a workflow has a different name.)
      #
      # The image name must also be 63 characters or less.
+      # More info: https://cloud.google.com/compute/docs/naming-resources#resource-name-format
      #
      # Force the image creation (--force) as the disk is still attached even though it is not being
      # used by the container.
--- a/.github/workflows/sub-find-cached-disks.yml
+++ b/.github/workflows/sub-find-cached-disks.yml
@ -74,20 +74,30 @@ jobs:
      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2.1.1

-      # Disk images in GCP are required to be in lowercase, but the blockchain network
-      # uses sentence case, so we need to downcase ${{ inputs.network }}
+      # Performs formatting on disk name components.
      #
-      # Passes a lowercase Network name to subsequent steps using $NETWORK env variable
-      - name: Downcase network name for disks
+      # Disk images in GCP are required to be in lowercase, but the blockchain network
+      # uses sentence case, so we need to downcase ${{ inputs.network }}.
+      #
+      # Disk image names in GCP are limited to 63 characters, so we need to limit
+      # branch names to 12 characters.
+      # Check `create-state-image` in `sub-deploy-integration-tests-gcp.yml` for more details on image names.
+      # More info: https://cloud.google.com/compute/docs/naming-resources#resource-name-format
+      #
+      # Passes the lowercased ${{ inputs.network }} to subsequent steps using the $NETWORK env variable.
+      # Passes the first 12 characters of ${{ env.GITHUB_REF_SLUG_URL }} to subsequent steps using the $SHORT_GITHUB_REF env variable.
+      - name: Format network name and branch name for disks
        run: |
-          NETWORK_CAPS=${{ inputs.network }}
-          echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV
+          NETWORK_CAPS="${{ inputs.network }}"
+          echo "NETWORK=${NETWORK_CAPS,,}" >> "$GITHUB_ENV"
+          LONG_GITHUB_REF="${{ env.GITHUB_REF_SLUG_URL }}"
+          echo "SHORT_GITHUB_REF=${LONG_GITHUB_REF:0:12}" >> "$GITHUB_ENV"

      # Check if there are cached state disks available for subsequent jobs to use.
      - name: Check if cached state disks exists
        id: get-available-disks
        env:
-          GITHUB_SHA_SHORT: ${{ env.GITHUB_SHA_SHORT }}
+          GITHUB_REF: ${{ env.SHORT_GITHUB_REF }}
          NETWORK: ${{ env.NETWORK }} # use lowercase version from env, not input
          DISK_PREFIX: ${{ inputs.disk_prefix }}
          DISK_SUFFIX: ${{ inputs.disk_suffix }}