From 0f446e89d49a9a801dc5ff932295c6ba476119b3 Mon Sep 17 00:00:00 2001 From: Tone Date: Fri, 10 Nov 2023 12:39:50 +0100 Subject: [PATCH] Extend `cluster_autoscaling` fields in gke-cluster-standard (#1845) * feat(gke-cluster-standard): Add feature to setup `cluster_autoscaling` * feat(gke-cluster-standard): Add GPUs setup feature for `cluster_autoscaling` * feat(gke-cluster-standard): Add validation for `autoscaling_profile` and `disk_type` to ensure only valid values are specified * feat(gke-cluster-standard): Fix validation condition for `cluster_autoscaling` --- modules/gke-cluster-standard/README.md | 42 +++++++++++------------ modules/gke-cluster-standard/main.tf | 32 ++++++++++++++++- modules/gke-cluster-standard/variables.tf | 24 +++++++++++++ 3 files changed, 76 insertions(+), 22 deletions(-) diff --git a/modules/gke-cluster-standard/README.md b/modules/gke-cluster-standard/README.md index 53b57e8f..e895b481 100644 --- a/modules/gke-cluster-standard/README.md +++ b/modules/gke-cluster-standard/README.md @@ -310,28 +310,28 @@ module "cluster-1" { | name | description | type | required | default | |---|---|:---:|:---:|:---:| -| [location](variables.tf#L154) | Cluster zone or region. | string | ✓ | | -| [name](variables.tf#L265) | Cluster name. | string | ✓ | | -| [project_id](variables.tf#L291) | Cluster project id. | string | ✓ | | -| [vpc_config](variables.tf#L314) | VPC-level configuration. | object({…}) | ✓ | | +| [location](variables.tf#L178) | Cluster zone or region. | string | ✓ | | +| [name](variables.tf#L289) | Cluster name. | string | ✓ | | +| [project_id](variables.tf#L315) | Cluster project id. | string | ✓ | | +| [vpc_config](variables.tf#L338) | VPC-level configuration. | object({…}) | ✓ | | | [backup_configs](variables.tf#L17) | Configuration for Backup for GKE. | object({…}) | | {} | -| [cluster_autoscaling](variables.tf#L38) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. 
| object({…}) | | null | -| [deletion_protection](variables.tf#L59) | Whether or not to allow Terraform to destroy the cluster. Unless this field is set to false in Terraform state, a terraform destroy or terraform apply that would delete the cluster will fail. | bool | | true | -| [description](variables.tf#L66) | Cluster description. | string | | null | -| [enable_addons](variables.tf#L72) | Addons enabled in the cluster (true means enabled). | object({…}) | | {…} | -| [enable_features](variables.tf#L96) | Enable cluster-level features. Certain features allow configuration. | object({…}) | | {…} | -| [issue_client_certificate](variables.tf#L142) | Enable issuing client certificate. | bool | | false | -| [labels](variables.tf#L148) | Cluster resource labels. | map(string) | | null | -| [logging_config](variables.tf#L159) | Logging configuration. | object({…}) | | {} | -| [maintenance_config](variables.tf#L180) | Maintenance window configuration. | object({…}) | | {…} | -| [max_pods_per_node](variables.tf#L203) | Maximum number of pods per node in this cluster. | number | | 110 | -| [min_master_version](variables.tf#L209) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null | -| [monitoring_config](variables.tf#L215) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | -| [node_locations](variables.tf#L270) | Zones in which the cluster's nodes are located. | list(string) | | [] | -| [private_cluster_config](variables.tf#L277) | Private cluster configuration. | object({…}) | | null | -| [release_channel](variables.tf#L296) | Release channel for GKE upgrades. | string | | null | -| [service_account](variables.tf#L302) | Service account used for the default node pool, only useful if the default GCE service account has been disabled. | string | | null | -| [tags](variables.tf#L308) | Network tags applied to nodes. 
| list(string) | | null | +| [cluster_autoscaling](variables.tf#L38) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. | object({…}) | | null | +| [deletion_protection](variables.tf#L83) | Whether or not to allow Terraform to destroy the cluster. Unless this field is set to false in Terraform state, a terraform destroy or terraform apply that would delete the cluster will fail. | bool | | true | +| [description](variables.tf#L90) | Cluster description. | string | | null | +| [enable_addons](variables.tf#L96) | Addons enabled in the cluster (true means enabled). | object({…}) | | {…} | +| [enable_features](variables.tf#L120) | Enable cluster-level features. Certain features allow configuration. | object({…}) | | {…} | +| [issue_client_certificate](variables.tf#L166) | Enable issuing client certificate. | bool | | false | +| [labels](variables.tf#L172) | Cluster resource labels. | map(string) | | null | +| [logging_config](variables.tf#L183) | Logging configuration. | object({…}) | | {} | +| [maintenance_config](variables.tf#L204) | Maintenance window configuration. | object({…}) | | {…} | +| [max_pods_per_node](variables.tf#L227) | Maximum number of pods per node in this cluster. | number | | 110 | +| [min_master_version](variables.tf#L233) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null | +| [monitoring_config](variables.tf#L239) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | +| [node_locations](variables.tf#L294) | Zones in which the cluster's nodes are located. | list(string) | | [] | +| [private_cluster_config](variables.tf#L301) | Private cluster configuration. | object({…}) | | null | +| [release_channel](variables.tf#L320) | Release channel for GKE upgrades. 
| string | | null | +| [service_account](variables.tf#L326) | Service account used for the default node pool, only useful if the default GCE service account has been disabled. | string | | null | +| [tags](variables.tf#L332) | Network tags applied to nodes. | list(string) | | null | ## Outputs diff --git a/modules/gke-cluster-standard/main.tf b/modules/gke-cluster-standard/main.tf index f5d8fe75..42f115e2 100644 --- a/modules/gke-cluster-standard/main.tf +++ b/modules/gke-cluster-standard/main.tf @@ -123,13 +123,31 @@ resource "google_container_cluster" "cluster" { content { enabled = true + autoscaling_profile = var.cluster_autoscaling.autoscaling_profile + dynamic "auto_provisioning_defaults" { for_each = var.cluster_autoscaling.auto_provisioning_defaults != null ? [""] : [] content { boot_disk_kms_key = var.cluster_autoscaling.auto_provisioning_defaults.boot_disk_kms_key + disk_size = var.cluster_autoscaling.auto_provisioning_defaults.disk_size + disk_type = var.cluster_autoscaling.auto_provisioning_defaults.disk_type image_type = var.cluster_autoscaling.auto_provisioning_defaults.image_type oauth_scopes = var.cluster_autoscaling.auto_provisioning_defaults.oauth_scopes service_account = var.cluster_autoscaling.auto_provisioning_defaults.service_account + dynamic "management" { + for_each = var.cluster_autoscaling.auto_provisioning_defaults.management != null ? [""] : [] + content { + auto_repair = var.cluster_autoscaling.auto_provisioning_defaults.management.auto_repair + auto_upgrade = var.cluster_autoscaling.auto_provisioning_defaults.management.auto_upgrade + } + } + dynamic "shielded_instance_config" { + for_each = var.cluster_autoscaling.auto_provisioning_defaults.shielded_instance_config != null ? 
[""] : [] + content { + enable_integrity_monitoring = var.cluster_autoscaling.auto_provisioning_defaults.shielded_instance_config.integrity_monitoring + enable_secure_boot = var.cluster_autoscaling.auto_provisioning_defaults.shielded_instance_config.secure_boot + } + } } } dynamic "resource_limits" { @@ -148,7 +166,19 @@ resource "google_container_cluster" "cluster" { maximum = var.cluster_autoscaling.mem_limits.max } } - // TODO: support GPUs too + dynamic "resource_limits" { + for_each = ( + try(var.cluster_autoscaling.gpu_resources, null) == null + ? [] + : var.cluster_autoscaling.gpu_resources + ) + iterator = gpu_resources + content { + resource_type = gpu_resources.value.resource_type + minimum = gpu_resources.value.min + maximum = gpu_resources.value.max + } + } } } diff --git a/modules/gke-cluster-standard/variables.tf b/modules/gke-cluster-standard/variables.tf index 221f6b8a..eebd595a 100644 --- a/modules/gke-cluster-standard/variables.tf +++ b/modules/gke-cluster-standard/variables.tf @@ -38,11 +38,22 @@ variable "backup_configs" { variable "cluster_autoscaling" { description = "Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler." 
type = object({ + autoscaling_profile = optional(string, "BALANCED") auto_provisioning_defaults = optional(object({ boot_disk_kms_key = optional(string) + disk_size = optional(number) + disk_type = optional(string, "pd-standard") image_type = optional(string) oauth_scopes = optional(list(string)) service_account = optional(string) + management = optional(object({ + auto_repair = optional(bool, true) + auto_upgrade = optional(bool, true) + })) + shielded_instance_config = optional(object({ + integrity_monitoring = optional(bool, true) + secure_boot = optional(bool, false) + })) })) cpu_limits = optional(object({ min = number @@ -52,8 +63,21 @@ variable "cluster_autoscaling" { min = number max = number })) + gpu_resources = optional(list(object({ + resource_type = string + min = number + max = number + }))) }) default = null + validation { + condition = (var.cluster_autoscaling == null ? true : contains(["BALANCED", "OPTIMIZE_UTILIZATION"], var.cluster_autoscaling.autoscaling_profile)) + error_message = "Invalid autoscaling_profile." + } + validation { + condition = (var.cluster_autoscaling == null ? true : contains(["pd-standard", "pd-ssd", "pd-balanced"], try(var.cluster_autoscaling.auto_provisioning_defaults.disk_type, "pd-standard"))) + error_message = "Invalid disk_type." + } } variable "deletion_protection" {