From 59657415be4c5c9cac731912ab42ee8761a72f88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Legrand?= Date: Tue, 9 Jul 2024 11:26:30 +0200 Subject: [PATCH] Adding TPU limits for GKE cluster node auto-provisioning (NAP) (#2406) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adding TPU limits for GKE cluster node auto-provisioning (NAP) * rework of the cluster autoscaling configuration * updated README * fixing README * Update modules/gke-cluster-standard/README.md Co-authored-by: Wiktor Niesiobędzki * fixing indentation --------- Co-authored-by: Wiktor Niesiobędzki --- modules/gke-cluster-standard/README.md | 44 ++++++++++++++++++++++- modules/gke-cluster-standard/main.tf | 12 +++---- modules/gke-cluster-standard/variables.tf | 8 ++--- 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/modules/gke-cluster-standard/README.md b/modules/gke-cluster-standard/README.md index 40a6cdc6..17e1efb6 100644 --- a/modules/gke-cluster-standard/README.md +++ b/modules/gke-cluster-standard/README.md @@ -15,6 +15,7 @@ This module offers a way to create and manage Google Kubernetes Engine (GKE) [St - [Cloud DNS](#cloud-dns) - [Backup for GKE](#backup-for-gke) - [Automatic creation of new secondary ranges](#automatic-creation-of-new-secondary-ranges) + - [Node auto-provisioning with GPUs and TPUs](#node-auto-provisioning-with-gpus-and-tpus) - [Variables](#variables) - [Outputs](#outputs) @@ -305,6 +306,47 @@ module "cluster-1" { } # tftest modules=1 resources=1 ``` + +### Node auto-provisioning with GPUs and TPUs + +You can use the `var.cluster_autoscaling` block to configure node auto-provisioning for the GKE cluster. The example below configures limits for CPU, memory, GPUs and TPUs. 
+ +```hcl +module "cluster-1" { + source = "./fabric/modules/gke-cluster-standard" + project_id = var.project_id + name = "cluster-1" + location = "europe-west1-b" + vpc_config = { + network = var.vpc.self_link + subnetwork = var.subnet.self_link + secondary_range_blocks = { + pods = "" + services = "/20" # can be an empty string as well + } + } + cluster_autoscaling = { + cpu_limits = { + max = 48 + } + mem_limits = { + max = 182 + } + # Can be GPUs or TPUs + accelerator_resources = [ + { + resource_type = "nvidia-l4" + max = 2 + }, + { + resource_type = "tpu-v5-lite-podslice" + max = 2 + } + ] + } +} +# tftest modules=1 resources=1 +``` ## Variables @@ -315,7 +357,7 @@ module "cluster-1" { | [project_id](variables.tf#L410) | Cluster project id. | string | ✓ | | | [vpc_config](variables.tf#L421) | VPC-level configuration. | object({…}) | ✓ | | | [backup_configs](variables.tf#L17) | Configuration for Backup for GKE. | object({…}) | | {} | -| [cluster_autoscaling](variables.tf#L39) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. | object({…}) | | null | +| [cluster_autoscaling](variables.tf#L39) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. | object({…}) | | null | | [default_nodepool](variables.tf#L118) | Enable default nodepool. | object({…}) | | {} | | [deletion_protection](variables.tf#L136) | Whether or not to allow Terraform to destroy the cluster. Unless this field is set to false in Terraform state, a terraform destroy or terraform apply that would delete the cluster will fail. | bool | | true | | [description](variables.tf#L143) | Cluster description. 
| string | | null | diff --git a/modules/gke-cluster-standard/main.tf b/modules/gke-cluster-standard/main.tf index 8cea6ceb..af836c54 100644 --- a/modules/gke-cluster-standard/main.tf +++ b/modules/gke-cluster-standard/main.tf @@ -222,15 +222,15 @@ resource "google_container_cluster" "cluster" { } dynamic "resource_limits" { for_each = ( - try(local.cas.gpu_resources, null) == null + try(local.cas.accelerator_resources, null) == null ? [] - : local.cas.gpu_resources + : local.cas.accelerator_resources ) - iterator = gpu_resources + iterator = accelerator_resources content { - resource_type = gpu_resources.value.resource_type - minimum = gpu_resources.value.min - maximum = gpu_resources.value.max + resource_type = accelerator_resources.value.resource_type + minimum = accelerator_resources.value.min + maximum = accelerator_resources.value.max } } } diff --git a/modules/gke-cluster-standard/variables.tf b/modules/gke-cluster-standard/variables.tf index 63e16df1..8dbf8101 100644 --- a/modules/gke-cluster-standard/variables.tf +++ b/modules/gke-cluster-standard/variables.tf @@ -73,16 +73,16 @@ variable "cluster_autoscaling" { # add validation rule to ensure only one is present if upgrade settings is defined })) cpu_limits = optional(object({ - min = number + min = optional(number, 0) max = number })) mem_limits = optional(object({ - min = number + min = optional(number, 0) max = number })) - gpu_resources = optional(list(object({ + accelerator_resources = optional(list(object({ resource_type = string - min = number + min = optional(number, 0) max = number }))) })