From b3dc91b5cd6375d9e589f149d98985a895fffbc2 Mon Sep 17 00:00:00 2001 From: Oliver Frolovs Date: Thu, 14 Sep 2023 23:25:57 +0100 Subject: [PATCH] Upgrades to `monitoring_config` in `gke-cluster-*`, docs update, and cosmetics fixes to GKE cluster modules (#1680) * gke-cluster-standard: upgrade `monitoring_config` to use object style. Add tests. * gke-cluster-standard: update docs * gke-cluster-autopilot: move gateway_api_config block (cosmetic change) * gke-cluster-autopilot: update docs and fix typos * Update blueprints due to `monitoring_config` changes in `gke-cluster-standard`. * Update FAST due to `monitoring_config` changes in `gke-cluster-standard`. * Update docs for affected blueprints and FAST stages --- blueprints/gke/autopilot/cluster.tf | 5 +- blueprints/gke/multitenant-fleet/README.md | 30 ++--- blueprints/gke/multitenant-fleet/variables.tf | 18 ++- fast/stages/3-gke-multitenant/dev/README.md | 30 ++--- .../stages/3-gke-multitenant/dev/variables.tf | 17 ++- modules/gke-cluster-autopilot/README.md | 32 +++--- modules/gke-cluster-autopilot/main.tf | 16 +-- modules/gke-cluster-autopilot/outputs.tf | 2 +- modules/gke-cluster-autopilot/variables.tf | 4 +- modules/gke-cluster-standard/README.md | 108 +++++++++++++++--- modules/gke-cluster-standard/main.tf | 43 ++++--- modules/gke-cluster-standard/variables.tf | 26 ++++- .../monitoring-config-control-plane.yaml | 27 +++++ .../monitoring-config-disable-all.yaml | 23 ++++ 14 files changed, 270 insertions(+), 111 deletions(-) create mode 100644 tests/modules/gke_cluster_standard/examples/monitoring-config-control-plane.yaml create mode 100644 tests/modules/gke_cluster_standard/examples/monitoring-config-disable-all.yaml diff --git a/blueprints/gke/autopilot/cluster.tf b/blueprints/gke/autopilot/cluster.tf index a823a894..db4f29cc 100644 --- a/blueprints/gke/autopilot/cluster.tf +++ b/blueprints/gke/autopilot/cluster.tf @@ -30,8 +30,9 @@ module "cluster" { # autopilot = true # } # monitoring_config = { - # 
enenable_components = ["SYSTEM_COMPONENTS"] - # managed_prometheus = true + # enable_api_server_metrics = true + # enable_controller_manager_metrics = true + # enable_scheduler_metrics = true # } # cluster_autoscaling = { # auto_provisioning_defaults = { diff --git a/blueprints/gke/multitenant-fleet/README.md b/blueprints/gke/multitenant-fleet/README.md index baaf288f..8af50bc3 100644 --- a/blueprints/gke/multitenant-fleet/README.md +++ b/blueprints/gke/multitenant-fleet/README.md @@ -244,21 +244,21 @@ module "gke" { | name | description | type | required | default | |---|---|:---:|:---:|:---:| -| [billing_account_id](variables.tf#L17) | Billing account id. | string | ✓ | | -| [folder_id](variables.tf#L138) | Folder used for the GKE project in folders/nnnnnnnnnnn format. | string | ✓ | | -| [prefix](variables.tf#L189) | Prefix used for resource names. | string | ✓ | | -| [project_id](variables.tf#L198) | ID of the project that will contain all the clusters. | string | ✓ | | -| [vpc_config](variables.tf#L210) | Shared VPC project and VPC details. | object({…}) | ✓ | | -| [clusters](variables.tf#L22) | Clusters configuration. Refer to the gke-cluster module for type details. | map(object({…})) | | {} | -| [fleet_configmanagement_clusters](variables.tf#L76) | Config management features enabled on specific sets of member clusters, in config name => [cluster name] format. | map(list(string)) | | {} | -| [fleet_configmanagement_templates](variables.tf#L83) | Sets of config management configurations that can be applied to member clusters, in config name => {options} format. | map(object({…})) | | {} | -| [fleet_features](variables.tf#L118) | Enable and configure fleet features. Set to null to disable GKE Hub if fleet workload identity is not used. | object({…}) | | null | -| [fleet_workload_identity](variables.tf#L131) | Use Fleet Workload Identity for clusters. Enables GKE Hub if set to true. 
| bool | | false | -| [group_iam](variables.tf#L143) | Project-level IAM bindings for groups. Use group emails as keys, list of roles as values. | map(list(string)) | | {} | -| [iam](variables.tf#L150) | Project-level authoritative IAM bindings for users and service accounts in {ROLE => [MEMBERS]} format. | map(list(string)) | | {} | -| [labels](variables.tf#L157) | Project-level labels. | map(string) | | {} | -| [nodepools](variables.tf#L163) | Nodepools configuration. Refer to the gke-nodepool module for type details. | map(map(object({…}))) | | {} | -| [project_services](variables.tf#L203) | Additional project services to enable. | list(string) | | [] | +| [billing_account_id](variables.tf#L17) | Billing account ID. | string | ✓ | | +| [folder_id](variables.tf#L148) | Folder used for the GKE project in folders/nnnnnnnnnnn format. | string | ✓ | | +| [prefix](variables.tf#L199) | Prefix used for resource names. | string | ✓ | | +| [project_id](variables.tf#L208) | ID of the project that will contain all the clusters. | string | ✓ | | +| [vpc_config](variables.tf#L220) | Shared VPC project and VPC details. | object({…}) | ✓ | | +| [clusters](variables.tf#L22) | Clusters configuration. Refer to the gke-cluster module for type details. | map(object({…})) | | {} | +| [fleet_configmanagement_clusters](variables.tf#L86) | Config management features enabled on specific sets of member clusters, in config name => [cluster name] format. | map(list(string)) | | {} | +| [fleet_configmanagement_templates](variables.tf#L93) | Sets of config management configurations that can be applied to member clusters, in config name => {options} format. | map(object({…})) | | {} | +| [fleet_features](variables.tf#L128) | Enable and configure fleet features. Set to null to disable GKE Hub if fleet workload identity is not used. | object({…}) | | null | +| [fleet_workload_identity](variables.tf#L141) | Use Fleet Workload Identity for clusters. Enables GKE Hub if set to true. 
| bool | | false | +| [group_iam](variables.tf#L153) | Project-level IAM bindings for groups. Use group emails as keys, list of roles as values. | map(list(string)) | | {} | +| [iam](variables.tf#L160) | Project-level authoritative IAM bindings for users and service accounts in {ROLE => [MEMBERS]} format. | map(list(string)) | | {} | +| [labels](variables.tf#L167) | Project-level labels. | map(string) | | {} | +| [nodepools](variables.tf#L173) | Nodepools configuration. Refer to the gke-nodepool module for type details. | map(map(object({…}))) | | {} | +| [project_services](variables.tf#L213) | Additional project services to enable. | list(string) | | [] | ## Outputs diff --git a/blueprints/gke/multitenant-fleet/variables.tf b/blueprints/gke/multitenant-fleet/variables.tf index 2461ea8a..6f100802 100644 --- a/blueprints/gke/multitenant-fleet/variables.tf +++ b/blueprints/gke/multitenant-fleet/variables.tf @@ -15,7 +15,7 @@ */ variable "billing_account_id" { - description = "Billing account id." + description = "Billing account ID." 
type = string } @@ -48,9 +48,19 @@ variable "clusters" { max_pods_per_node = optional(number, 110) min_master_version = optional(string) monitoring_config = optional(object({ - enable_components = optional(list(string), ["SYSTEM_COMPONENTS"]) - managed_prometheus = optional(bool) - })) + enable_system_metrics = optional(bool, true) + + # Control plane metrics + enable_api_server_metrics = optional(bool, false) + enable_controller_manager_metrics = optional(bool, false) + enable_scheduler_metrics = optional(bool, false) + + # TODO add kube state metrics + + # Google Cloud Managed Service for Prometheus + enable_managed_prometheus = optional(bool, true) + }), {}) + node_locations = optional(list(string)) private_cluster_config = optional(any) release_channel = optional(string) diff --git a/fast/stages/3-gke-multitenant/dev/README.md b/fast/stages/3-gke-multitenant/dev/README.md index 411c4bc4..105add99 100644 --- a/fast/stages/3-gke-multitenant/dev/README.md +++ b/fast/stages/3-gke-multitenant/dev/README.md @@ -163,21 +163,21 @@ Leave all these variables unset (or set to `null`) to disable fleet management. |---|---|:---:|:---:|:---:|:---:| | [automation](variables.tf#L21) | Automation resources created by the bootstrap stage. | object({…}) | ✓ | | 0-bootstrap | | [billing_account](variables.tf#L29) | Billing account id. If billing account is not part of the same org set `is_org_level` to false. | object({…}) | ✓ | | 0-bootstrap | -| [folder_ids](variables.tf#L159) | Folders to be used for the networking resources in folders/nnnnnnnnnnn format. If null, folder will be created. | object({…}) | ✓ | | 1-resman | -| [host_project_ids](variables.tf#L174) | Host project for the shared VPC. | object({…}) | ✓ | | 2-networking | -| [prefix](variables.tf#L227) | Prefix used for resources that need unique names. | string | ✓ | | | -| [vpc_self_links](variables.tf#L243) | Self link for the shared VPC. 
| object({…}) | ✓ | | 2-networking | -| [clusters](variables.tf#L42) | Clusters configuration. Refer to the gke-cluster module for type details. | map(object({…})) | | {} | | -| [fleet_configmanagement_clusters](variables.tf#L96) | Config management features enabled on specific sets of member clusters, in config name => [cluster name] format. | map(list(string)) | | {} | | -| [fleet_configmanagement_templates](variables.tf#L104) | Sets of config management configurations that can be applied to member clusters, in config name => {options} format. | map(object({…})) | | {} | | -| [fleet_features](variables.tf#L139) | Enable and configure fleet features. Set to null to disable GKE Hub if fleet workload identity is not used. | object({…}) | | null | | -| [fleet_workload_identity](variables.tf#L152) | Use Fleet Workload Identity for clusters. Enables GKE Hub if set to true. | bool | | false | | -| [group_iam](variables.tf#L167) | Project-level authoritative IAM bindings for groups in {GROUP_EMAIL => [ROLES]} format. Use group emails as keys, list of roles as values. | map(list(string)) | | {} | | -| [iam](variables.tf#L182) | Project-level authoritative IAM bindings for users and service accounts in {ROLE => [MEMBERS]} format. | map(list(string)) | | {} | | -| [labels](variables.tf#L189) | Project-level labels. | map(string) | | {} | | -| [nodepools](variables.tf#L195) | Nodepools configuration. Refer to the gke-nodepool module for type details. | map(map(object({…}))) | | {} | | -| [outputs_location](variables.tf#L221) | Path where providers, tfvars files, and lists for the following stages are written. Leave empty to disable. | string | | null | | -| [project_services](variables.tf#L236) | Additional project services to enable. | list(string) | | [] | | +| [folder_ids](variables.tf#L168) | Folders to be used for the networking resources in folders/nnnnnnnnnnn format. If null, folder will be created. 
| object({…}) | ✓ | | 1-resman | +| [host_project_ids](variables.tf#L183) | Host project for the shared VPC. | object({…}) | ✓ | | 2-networking | +| [prefix](variables.tf#L236) | Prefix used for resources that need unique names. | string | ✓ | | | +| [vpc_self_links](variables.tf#L252) | Self link for the shared VPC. | object({…}) | ✓ | | 2-networking | +| [clusters](variables.tf#L42) | Clusters configuration. Refer to the gke-cluster-standard module for type details. | map(object({…})) | | {} | | +| [fleet_configmanagement_clusters](variables.tf#L105) | Config management features enabled on specific sets of member clusters, in config name => [cluster name] format. | map(list(string)) | | {} | | +| [fleet_configmanagement_templates](variables.tf#L113) | Sets of config management configurations that can be applied to member clusters, in config name => {options} format. | map(object({…})) | | {} | | +| [fleet_features](variables.tf#L148) | Enable and configure fleet features. Set to null to disable GKE Hub if fleet workload identity is not used. | object({…}) | | null | | +| [fleet_workload_identity](variables.tf#L161) | Use Fleet Workload Identity for clusters. Enables GKE Hub if set to true. | bool | | false | | +| [group_iam](variables.tf#L176) | Project-level authoritative IAM bindings for groups in {GROUP_EMAIL => [ROLES]} format. Use group emails as keys, list of roles as values. | map(list(string)) | | {} | | +| [iam](variables.tf#L191) | Project-level authoritative IAM bindings for users and service accounts in {ROLE => [MEMBERS]} format. | map(list(string)) | | {} | | +| [labels](variables.tf#L198) | Project-level labels. | map(string) | | {} | | +| [nodepools](variables.tf#L204) | Nodepools configuration. Refer to the gke-nodepool module for type details. | map(map(object({…}))) | | {} | | +| [outputs_location](variables.tf#L230) | Path where providers, tfvars files, and lists for the following stages are written. Leave empty to disable. 
| string | | null | | +| [project_services](variables.tf#L245) | Additional project services to enable. | list(string) | | [] | | ## Outputs diff --git a/fast/stages/3-gke-multitenant/dev/variables.tf b/fast/stages/3-gke-multitenant/dev/variables.tf index dffca0bf..4d177938 100644 --- a/fast/stages/3-gke-multitenant/dev/variables.tf +++ b/fast/stages/3-gke-multitenant/dev/variables.tf @@ -40,7 +40,7 @@ variable "billing_account" { } variable "clusters" { - description = "Clusters configuration. Refer to the gke-cluster module for type details." + description = "Clusters configuration. Refer to the gke-cluster-standard module for type details." type = map(object({ cluster_autoscaling = optional(any) description = optional(string) @@ -68,9 +68,18 @@ variable "clusters" { max_pods_per_node = optional(number, 110) min_master_version = optional(string) monitoring_config = optional(object({ - enable_components = optional(list(string), ["SYSTEM_COMPONENTS"]) - managed_prometheus = optional(bool) - })) + enable_system_metrics = optional(bool, true) + + # Control plane metrics + enable_api_server_metrics = optional(bool, false) + enable_controller_manager_metrics = optional(bool, false) + enable_scheduler_metrics = optional(bool, false) + + # TODO add kube state metrics + + # Google Cloud Managed Service for Prometheus + enable_managed_prometheus = optional(bool, true) + }), {}) node_locations = optional(list(string)) private_cluster_config = optional(any) release_channel = optional(string) diff --git a/modules/gke-cluster-autopilot/README.md b/modules/gke-cluster-autopilot/README.md index 2c008310..e5677a58 100644 --- a/modules/gke-cluster-autopilot/README.md +++ b/modules/gke-cluster-autopilot/README.md @@ -50,11 +50,11 @@ module "cluster-1" { ### Cloud DNS -This example shows how to [use Cloud DNS as a Kubernetes DNS provider](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-dns). 
- -> **Warning** +> [!WARNING] > [Cloud DNS is the only DNS provider for Autopilot clusters](https://cloud.google.com/kubernetes-engine/docs/concepts/service-discovery#cloud_dns) running version `1.25.9-gke.400` and later, and version `1.26.4-gke.500` and later. It is [pre-configured](https://cloud.google.com/kubernetes-engine/docs/resources/autopilot-standard-feature-comparison#feature-comparison) for those clusters. The following example *only* applies to Autopilot clusters running *earlier* versions. +This example shows how to [use Cloud DNS as a Kubernetes DNS provider](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-dns). + ```hcl module "cluster-1" { source = "./fabric/modules/gke-cluster-autopilot" @@ -79,11 +79,11 @@ module "cluster-1" { ### Logging configuration -This example shows how to [collect logs for the Kubernetes control plane components](https://cloud.google.com/stackdriver/docs/solutions/gke/installing). The logs for these components are not collected by default. - -> **Note** +> [!NOTE] > System and workload logs collection is pre-configured for Autopilot clusters and cannot be disabled. +This example shows how to [collect logs for the Kubernetes control plane components](https://cloud.google.com/stackdriver/docs/solutions/gke/installing). The logs for these components are not collected by default. + ```hcl module "cluster-1" { source = "./fabric/modules/gke-cluster-autopilot" @@ -106,14 +106,14 @@ module "cluster-1" { ### Monitoring configuration -This example shows how to [configure collection of Kubernetes control plane metrics](https://cloud.google.com/stackdriver/docs/solutions/gke/managing-metrics#enable-control-plane-metrics). The metrics for these components are not collected by default. - -> **Note** +> [!NOTE] > System metrics collection is pre-configured for Autopilot clusters and cannot be disabled. -> **Warning** +> [!WARNING] > GKE **workload metrics** is deprecated and removed in GKE 1.24 and later. 
Workload metrics is replaced by [Google Cloud Managed Service for Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus), which is Google's recommended way to monitor Kubernetes applications by using Cloud Monitoring. +This example shows how to [configure collection of Kubernetes control plane metrics](https://cloud.google.com/stackdriver/docs/solutions/gke/managing-metrics#enable-control-plane-metrics). The metrics for these components are not collected by default. + ```hcl module "cluster-1" { source = "./fabric/modules/gke-cluster-autopilot" @@ -136,14 +136,14 @@ module "cluster-1" { ### Backup for GKE +> [!NOTE] +> Although Backup for GKE can be enabled as an add-on when configuring your GKE clusters, it is a separate service from GKE. + [Backup for GKE](https://cloud.google.com/kubernetes-engine/docs/add-on/backup-for-gke/concepts/backup-for-gke) is a service for backing up and restoring workloads in GKE clusters. It has two components: * A [Google Cloud API](https://cloud.google.com/kubernetes-engine/docs/add-on/backup-for-gke/reference/rest) that serves as the control plane for the service. * A GKE add-on (the [Backup for GKE agent](https://cloud.google.com/kubernetes-engine/docs/add-on/backup-for-gke/concepts/backup-for-gke#agent_overview)) that must be enabled in each cluster for which you wish to perform backup and restore operations. -> **Note** -> Although Backup for GKE can be enabled as an add-on when configuring your GKE clusters, it is a separate service from GKE. - Backup for GKE is supported in GKE Autopilot clusters with [some restrictions](https://cloud.google.com/kubernetes-engine/docs/add-on/backup-for-gke/concepts/about-autopilot). 
This example shows how to [enable Backup for GKE on a new Autopilot cluster](https://cloud.google.com/kubernetes-engine/docs/add-on/backup-for-gke/how-to/install#enable_on_a_new_cluster_optional) and [plan a set of backups](https://cloud.google.com/kubernetes-engine/docs/add-on/backup-for-gke/how-to/backup-plan). @@ -176,9 +176,9 @@ module "cluster-1" { | name | description | type | required | default | |---|---|:---:|:---:|:---:| -| [location](variables.tf#L110) | Autopilot cluster are always regional. | string | ✓ | | +| [location](variables.tf#L110) | Autopilot clusters are always regional. | string | ✓ | | | [name](variables.tf#L170) | Cluster name. | string | ✓ | | -| [project_id](variables.tf#L196) | Cluster project id. | string | ✓ | | +| [project_id](variables.tf#L196) | Cluster project ID. | string | ✓ | | | [vpc_config](variables.tf#L225) | VPC-level configuration. | object({…}) | ✓ | | | [backup_configs](variables.tf#L17) | Configuration for Backup for GKE. | object({…}) | | {} | | [description](variables.tf#L37) | Cluster description. | string | | null | @@ -203,7 +203,7 @@ module "cluster-1" { | [ca_certificate](outputs.tf#L17) | Public certificate of the cluster (base64-encoded). | ✓ | | [cluster](outputs.tf#L23) | Cluster resource. | ✓ | | [endpoint](outputs.tf#L29) | Cluster endpoint. | | -| [id](outputs.tf#L34) | Fully qualified cluster id. | | +| [id](outputs.tf#L34) | Fully qualified cluster ID. | | | [location](outputs.tf#L39) | Cluster location. | | | [master_version](outputs.tf#L44) | Master version. | | | [name](outputs.tf#L49) | Cluster name. | | diff --git a/modules/gke-cluster-autopilot/main.tf b/modules/gke-cluster-autopilot/main.tf index 9af26acd..7948fd58 100644 --- a/modules/gke-cluster-autopilot/main.tf +++ b/modules/gke-cluster-autopilot/main.tf @@ -103,6 +103,13 @@ resource "google_container_cluster" "cluster" { } } + dynamic "gateway_api_config" { + for_each = var.enable_features.gateway_api ? 
[""] : [] + content { + channel = "CHANNEL_STANDARD" + } + } + dynamic "ip_allocation_policy" { for_each = var.vpc_config.secondary_range_blocks != null ? [""] : [] content { @@ -131,13 +138,6 @@ resource "google_container_cluster" "cluster" { ])) } - dynamic "gateway_api_config" { - for_each = var.enable_features.gateway_api ? [""] : [] - content { - channel = "CHANNEL_STANDARD" - } - } - maintenance_policy { dynamic "daily_maintenance_window" { for_each = ( @@ -207,7 +207,7 @@ resource "google_container_cluster" "cluster" { enable_components = toset(compact([ # System metrics collection cannot be disabled for Autopilot clusters. "SYSTEM_COMPONENTS", - # Control plane metrics. + # Control plane metrics: var.monitoring_config.enable_api_server_metrics ? "APISERVER" : null, var.monitoring_config.enable_controller_manager_metrics ? "CONTROLLER_MANAGER" : null, var.monitoring_config.enable_scheduler_metrics ? "SCHEDULER" : null, diff --git a/modules/gke-cluster-autopilot/outputs.tf b/modules/gke-cluster-autopilot/outputs.tf index 029ab06a..7978e55b 100644 --- a/modules/gke-cluster-autopilot/outputs.tf +++ b/modules/gke-cluster-autopilot/outputs.tf @@ -32,7 +32,7 @@ output "endpoint" { } output "id" { - description = "Fully qualified cluster id." + description = "Fully qualified cluster ID." value = google_container_cluster.cluster.id } diff --git a/modules/gke-cluster-autopilot/variables.tf b/modules/gke-cluster-autopilot/variables.tf index 37c054e3..bf410222 100644 --- a/modules/gke-cluster-autopilot/variables.tf +++ b/modules/gke-cluster-autopilot/variables.tf @@ -108,7 +108,7 @@ variable "labels" { } variable "location" { - description = "Autopilot cluster are always regional." + description = "Autopilot clusters are always regional." type = string } @@ -194,7 +194,7 @@ variable "private_cluster_config" { } variable "project_id" { - description = "Cluster project id." + description = "Cluster project ID." 
type = string } diff --git a/modules/gke-cluster-standard/README.md b/modules/gke-cluster-standard/README.md index 0dfd6636..da5a8897 100644 --- a/modules/gke-cluster-standard/README.md +++ b/modules/gke-cluster-standard/README.md @@ -1,10 +1,29 @@ -# GKE cluster Standard module +# GKE Standard cluster module -This module allows simplified creation and management of GKE Standard clusters and should be used together with the GKE nodepool module, as the default nodepool is turned off here and cannot be re-enabled. Some sensible defaults are set initially, in order to allow less verbose usage for most use cases. +This module offers a way to create and manage Google Kubernetes Engine (GKE) [Standard clusters](https://cloud.google.com/kubernetes-engine/docs/concepts/choose-cluster-mode#why-standard). With its sensible default settings based on best practices and the authors' experience as Google Cloud practitioners, the module accommodates many common use cases out of the box, without having to rely on verbose configuration. + +> [!IMPORTANT] +> This module should be used together with the [`gke-nodepool`](../gke-nodepool/) module because the default node pool is deleted upon cluster creation and cannot be re-created. + + +- [Example](#example) + - [GKE Standard cluster](#gke-standard-cluster) + - [Enable Dataplane V2](#enable-dataplane-v2) + - [Managing GKE logs](#managing-gke-logs) + - [Monitoring configuration](#monitoring-configuration) + - [Disable GKE logs or metrics collection](#disable-gke-logs-or-metrics-collection) + - [Cloud DNS](#cloud-dns) + - [Backup for GKE](#backup-for-gke) + - [Automatic creation of new secondary ranges](#automatic-creation-of-new-secondary-ranges) +- [Variables](#variables) +- [Outputs](#outputs) + ## Example -### GKE Cluster +### GKE Standard cluster + +This example shows how to [create a zonal GKE cluster in Standard mode](https://cloud.google.com/kubernetes-engine/docs/how-to/creating-a-zonal-cluster).
```hcl module "cluster-1" { @@ -36,7 +55,9 @@ module "cluster-1" { # tftest modules=1 resources=1 inventory=basic.yaml ``` -### GKE Cluster with Dataplane V2 enabled +### Enable Dataplane V2 + +This example shows how to [create a zonal GKE Cluster with Dataplane V2 enabled](https://cloud.google.com/kubernetes-engine/docs/how-to/dataplane-v2). ```hcl module "cluster-1" { @@ -95,15 +116,40 @@ module "cluster-1" { # tftest modules=1 resources=1 inventory=logging-config-enable-all.yaml ``` -### Disable GKE logs collection +### Monitoring configuration -This example shows how to fully disable logs collection on a GKE Standard cluster. This is not recommended. +This example shows how to [configure collection of Kubernetes control plane metrics](https://cloud.google.com/stackdriver/docs/solutions/gke/managing-metrics#enable-control-plane-metrics). The metrics for these components are not collected by default. -> **Warning** +```hcl +module "cluster-1" { + source = "./fabric/modules/gke-cluster-standard" + project_id = "myproject" + name = "cluster-1" + location = "europe-west1-b" + vpc_config = { + network = var.vpc.self_link + subnetwork = var.subnet.self_link + secondary_range_names = {} + } + monitoring_config = { + enable_api_server_metrics = true + enable_controller_manager_metrics = true + enable_scheduler_metrics = true + } +} +# tftest modules=1 resources=1 inventory=monitoring-config-control-plane.yaml +``` + + +### Disable GKE logs or metrics collection + +> [!WARNING] > If you've disabled Cloud Logging or Cloud Monitoring, GKE customer support > is offered on a best-effort basis and might require additional effort > from your engineering team. +This example shows how to fully disable logs collection on a zonal GKE Standard cluster. This is not recommended. 
+ ```hcl module "cluster-1" { source = "./fabric/modules/gke-cluster-standard" @@ -122,6 +168,27 @@ module "cluster-1" { # tftest modules=1 resources=1 inventory=logging-config-disable-all.yaml ``` +This example shows how to fully disable metrics collection on a zonal GKE Standard cluster. This is not recommended. + +```hcl +module "cluster-1" { + source = "./fabric/modules/gke-cluster-standard" + project_id = "myproject" + name = "cluster-1" + location = "europe-west1-b" + vpc_config = { + network = var.vpc.self_link + subnetwork = var.subnet.self_link + secondary_range_names = {} + } + monitoring_config = { + enable_system_metrics = false + enable_managed_prometheus = false + } +} +# tftest modules=1 resources=1 inventory=monitoring-config-disable-all.yaml +``` + ### Cloud DNS This example shows how to [use Cloud DNS as a Kubernetes DNS provider](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-dns) for GKE Standard clusters. @@ -150,7 +217,15 @@ module "cluster-1" { ### Backup for GKE -This example shows how to [enable the Backup for GKE agent and configure a Backup Plan](https://cloud.google.com/kubernetes-engine/docs/add-on/backup-for-gke/concepts/backup-for-gke) for GKE Standard clusters. +> [!NOTE] +> Although Backup for GKE can be enabled as an add-on when configuring your GKE clusters, it is a separate service from GKE. + +[Backup for GKE](https://cloud.google.com/kubernetes-engine/docs/add-on/backup-for-gke/concepts/backup-for-gke) is a service for backing up and restoring workloads in GKE clusters. It has two components: + +* A [Google Cloud API](https://cloud.google.com/kubernetes-engine/docs/add-on/backup-for-gke/reference/rest) that serves as the control plane for the service. +* A GKE add-on (the [Backup for GKE agent](https://cloud.google.com/kubernetes-engine/docs/add-on/backup-for-gke/concepts/backup-for-gke#agent_overview)) that must be enabled in each cluster for which you wish to perform backup and restore operations. 
+ +This example shows how to [enable Backup for GKE on a new zonal GKE Standard cluster](https://cloud.google.com/kubernetes-engine/docs/add-on/backup-for-gke/how-to/install#enable_on_a_new_cluster_optional) and [plan a set of backups](https://cloud.google.com/kubernetes-engine/docs/add-on/backup-for-gke/how-to/backup-plan). ```hcl module "cluster-1" { @@ -197,16 +272,15 @@ module "cluster-1" { } # tftest modules=1 resources=1 ``` - ## Variables | name | description | type | required | default | |---|---|:---:|:---:|:---:| | [location](variables.tf#L138) | Cluster zone or region. | string | ✓ | | -| [name](variables.tf#L210) | Cluster name. | string | ✓ | | -| [project_id](variables.tf#L236) | Cluster project id. | string | ✓ | | -| [vpc_config](variables.tf#L253) | VPC-level configuration. | object({…}) | ✓ | | +| [name](variables.tf#L226) | Cluster name. | string | ✓ | | +| [project_id](variables.tf#L252) | Cluster project id. | string | ✓ | | +| [vpc_config](variables.tf#L269) | VPC-level configuration. | object({…}) | ✓ | | | [backup_configs](variables.tf#L17) | Configuration for Backup for GKE. | object({…}) | | {} | | [cluster_autoscaling](variables.tf#L37) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. | object({…}) | | null | | [description](variables.tf#L58) | Cluster description. | string | | null | @@ -218,11 +292,11 @@ module "cluster-1" { | [maintenance_config](variables.tf#L164) | Maintenance window configuration. | object({…}) | | {…} | | [max_pods_per_node](variables.tf#L187) | Maximum number of pods per node in this cluster. | number | | 110 | | [min_master_version](variables.tf#L193) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null | -| [monitoring_config](variables.tf#L199) | Monitoring components. | object({…}) | | {…} | -| [node_locations](variables.tf#L215) | Zones in which the cluster's nodes are located. 
| list(string) | | [] | -| [private_cluster_config](variables.tf#L222) | Private cluster configuration. | object({…}) | | null | -| [release_channel](variables.tf#L241) | Release channel for GKE upgrades. | string | | null | -| [tags](variables.tf#L247) | Network tags applied to nodes. | list(string) | | null | +| [monitoring_config](variables.tf#L199) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | +| [node_locations](variables.tf#L231) | Zones in which the cluster's nodes are located. | list(string) | | [] | +| [private_cluster_config](variables.tf#L238) | Private cluster configuration. | object({…}) | | null | +| [release_channel](variables.tf#L257) | Release channel for GKE upgrades. | string | | null | +| [tags](variables.tf#L263) | Network tags applied to nodes. | list(string) | | null | ## Outputs diff --git a/modules/gke-cluster-standard/main.tf b/modules/gke-cluster-standard/main.tf index 57d5454f..666de53f 100644 --- a/modules/gke-cluster-standard/main.tf +++ b/modules/gke-cluster-standard/main.tf @@ -40,8 +40,8 @@ resource "google_container_cluster" "cluster" { : "DATAPATH_PROVIDER_UNSPECIFIED" ) - # the default nodepool is deleted here, use the gke-nodepool module instead - # default nodepool configuration based on a shielded_nodes variable + # the default node pool is deleted here, use the gke-nodepool module instead. + # the default node pool configuration is based on a shielded_nodes variable. node_config { dynamic "shielded_instance_config" { for_each = var.enable_features.shielded_nodes ? [""] : [] @@ -164,6 +164,13 @@ resource "google_container_cluster" "cluster" { } } + dynamic "gateway_api_config" { + for_each = var.enable_features.gateway_api ? [""] : [] + content { + channel = "CHANNEL_STANDARD" + } + } + dynamic "ip_allocation_policy" { for_each = var.vpc_config.secondary_range_blocks != null ? 
[""] : [] content { @@ -205,13 +212,6 @@ resource "google_container_cluster" "cluster" { } } - dynamic "gateway_api_config" { - for_each = var.enable_features.gateway_api ? [""] : [] - content { - channel = "CHANNEL_STANDARD" - } - } - maintenance_policy { dynamic "daily_maintenance_window" { for_each = ( @@ -277,22 +277,21 @@ resource "google_container_cluster" "cluster" { } } - dynamic "monitoring_config" { - for_each = var.monitoring_config != null ? [""] : [] - content { - enable_components = var.monitoring_config.enable_components - dynamic "managed_prometheus" { - for_each = ( - try(var.monitoring_config.managed_prometheus, null) == true ? [""] : [] - ) - content { - enabled = true - } - } + monitoring_config { + enable_components = toset(compact([ + # System metrics is the minimum requirement if any other metrics are enabled. This is checked by input var validation. + var.monitoring_config.enable_system_metrics ? "SYSTEM_COMPONENTS" : null, + # Control plane metrics: + var.monitoring_config.enable_api_server_metrics ? "APISERVER" : null, + var.monitoring_config.enable_controller_manager_metrics ? "CONTROLLER_MANAGER" : null, + var.monitoring_config.enable_scheduler_metrics ? "SCHEDULER" : null, + ])) + managed_prometheus { + enabled = var.monitoring_config.enable_managed_prometheus } } - # dataplane v2 has built-in network policies + # Dataplane V2 has built-in network policies dynamic "network_policy" { for_each = ( var.enable_addons.network_policy && !var.enable_features.dataplane_v2 diff --git a/modules/gke-cluster-standard/variables.tf b/modules/gke-cluster-standard/variables.tf index cc1cb63f..53875961 100644 --- a/modules/gke-cluster-standard/variables.tf +++ b/modules/gke-cluster-standard/variables.tf @@ -197,13 +197,29 @@ variable "min_master_version" { } variable "monitoring_config" { - description = "Monitoring components." + description = "Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default." 
type = object({ - enable_components = optional(list(string)) - managed_prometheus = optional(bool) + enable_system_metrics = optional(bool, true) + + # Control plane metrics + enable_api_server_metrics = optional(bool, false) + enable_controller_manager_metrics = optional(bool, false) + enable_scheduler_metrics = optional(bool, false) + + # TODO add kube state metrics and validation + + # Google Cloud Managed Service for Prometheus + enable_managed_prometheus = optional(bool, true) }) - default = { - enable_components = ["SYSTEM_COMPONENTS"] + default = {} + nullable = false + validation { + condition = anytrue([ + var.monitoring_config.enable_api_server_metrics, + var.monitoring_config.enable_controller_manager_metrics, + var.monitoring_config.enable_scheduler_metrics, + ]) ? var.monitoring_config.enable_system_metrics : true + error_message = "System metrics are the minimum required component for enabling metrics collection." } } diff --git a/tests/modules/gke_cluster_standard/examples/monitoring-config-control-plane.yaml b/tests/modules/gke_cluster_standard/examples/monitoring-config-control-plane.yaml new file mode 100644 index 00000000..b3108770 --- /dev/null +++ b/tests/modules/gke_cluster_standard/examples/monitoring-config-control-plane.yaml @@ -0,0 +1,27 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +values: + module.cluster-1.google_container_cluster.cluster: + monitoring_config: + - enable_components: + - APISERVER + - CONTROLLER_MANAGER + - SCHEDULER + - SYSTEM_COMPONENTS + managed_prometheus: + - enabled: true + +counts: + google_container_cluster: 1 diff --git a/tests/modules/gke_cluster_standard/examples/monitoring-config-disable-all.yaml b/tests/modules/gke_cluster_standard/examples/monitoring-config-disable-all.yaml new file mode 100644 index 00000000..1b5576a4 --- /dev/null +++ b/tests/modules/gke_cluster_standard/examples/monitoring-config-disable-all.yaml @@ -0,0 +1,23 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +values: + module.cluster-1.google_container_cluster.cluster: + monitoring_config: + - enable_components: [] + managed_prometheus: + - enabled: false + +counts: + google_container_cluster: 1