Adding placement_policy for GKE nodepools (e.g. GPU compact placement or TPU topology) (#2405)

* Adding placement policy to gke-nodepool module

* Adding placement policy for GKE nodepool

* updated README

* variables for placement_policy

* formatting

* Updated README

* fixing typo

* removing useless `try()` calls

---------

Co-authored-by: Aurélien Legrand <legranda@google.com>
This commit is contained in:
Aurélien Legrand 2024-07-03 12:21:30 +02:00 committed by GitHub
parent 287fee275c
commit 1f07cb72f2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 22 additions and 10 deletions

View File

@ -143,7 +143,7 @@ module "cluster-1-nodepool-gpu-1" {
|---|---|:---:|:---:|:---:| |---|---|:---:|:---:|:---:|
| [cluster_name](variables.tf#L23) | Cluster name. | <code>string</code> | ✓ | | | [cluster_name](variables.tf#L23) | Cluster name. | <code>string</code> | ✓ | |
| [location](variables.tf#L48) | Cluster location. | <code>string</code> | ✓ | | | [location](variables.tf#L48) | Cluster location. | <code>string</code> | ✓ | |
| [project_id](variables.tf#L177) | Cluster project id. | <code>string</code> | ✓ | | | [project_id](variables.tf#L181) | Cluster project id. | <code>string</code> | ✓ | |
| [cluster_id](variables.tf#L17) | Cluster id. Optional, but providing cluster_id is recommended to prevent cluster misconfiguration in some of the edge cases. | <code>string</code> | | <code>null</code> | | [cluster_id](variables.tf#L17) | Cluster id. Optional, but providing cluster_id is recommended to prevent cluster misconfiguration in some of the edge cases. | <code>string</code> | | <code>null</code> |
| [gke_version](variables.tf#L28) | Kubernetes nodes version. Ignored if auto_upgrade is set in management_config. | <code>string</code> | | <code>null</code> | | [gke_version](variables.tf#L28) | Kubernetes nodes version. Ignored if auto_upgrade is set in management_config. | <code>string</code> | | <code>null</code> |
| [k8s_labels](variables.tf#L34) | Kubernetes labels applied to each node. | <code>map&#40;string&#41;</code> | | <code>&#123;&#125;</code> | | [k8s_labels](variables.tf#L34) | Kubernetes labels applied to each node. | <code>map&#40;string&#41;</code> | | <code>&#123;&#125;</code> |
@ -153,13 +153,13 @@ module "cluster-1-nodepool-gpu-1" {
| [node_config](variables.tf#L65) | Node-level configuration. | <code title="object&#40;&#123;&#10; boot_disk_kms_key &#61; optional&#40;string&#41;&#10; disk_size_gb &#61; optional&#40;number&#41;&#10; disk_type &#61; optional&#40;string&#41;&#10; ephemeral_ssd_count &#61; optional&#40;number&#41;&#10; gcfs &#61; optional&#40;bool, false&#41;&#10; guest_accelerator &#61; optional&#40;object&#40;&#123;&#10; count &#61; number&#10; type &#61; string&#10; gpu_driver &#61; optional&#40;object&#40;&#123;&#10; version &#61; string&#10; partition_size &#61; optional&#40;string&#41;&#10; max_shared_clients_per_gpu &#61; optional&#40;number&#41;&#10; &#125;&#41;&#41;&#10; &#125;&#41;&#41;&#10; local_nvme_ssd_count &#61; optional&#40;number&#41;&#10; gvnic &#61; optional&#40;bool, false&#41;&#10; image_type &#61; optional&#40;string&#41;&#10; kubelet_config &#61; optional&#40;object&#40;&#123;&#10; cpu_manager_policy &#61; string&#10; cpu_cfs_quota &#61; optional&#40;bool&#41;&#10; cpu_cfs_quota_period &#61; optional&#40;string&#41;&#10; pod_pids_limit &#61; optional&#40;number&#41;&#10; &#125;&#41;&#41;&#10; linux_node_config &#61; optional&#40;object&#40;&#123;&#10; sysctls &#61; optional&#40;map&#40;string&#41;&#41;&#10; cgroup_mode &#61; optional&#40;string&#41;&#10; &#125;&#41;&#41;&#10; local_ssd_count &#61; optional&#40;number&#41;&#10; machine_type &#61; optional&#40;string&#41;&#10; metadata &#61; optional&#40;map&#40;string&#41;&#41;&#10; min_cpu_platform &#61; optional&#40;string&#41;&#10; preemptible &#61; optional&#40;bool&#41;&#10; sandbox_config_gvisor &#61; optional&#40;bool&#41;&#10; shielded_instance_config &#61; optional&#40;object&#40;&#123;&#10; enable_integrity_monitoring &#61; optional&#40;bool&#41;&#10; enable_secure_boot &#61; optional&#40;bool&#41;&#10; &#125;&#41;&#41;&#10; spot &#61; optional&#40;bool&#41;&#10; workload_metadata_config_mode &#61; optional&#40;string&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code 
title="&#123;&#10; disk_type &#61; &#34;pd-balanced&#34;&#10;&#125;">&#123;&#8230;&#125;</code> | | [node_config](variables.tf#L65) | Node-level configuration. | <code title="object&#40;&#123;&#10; boot_disk_kms_key &#61; optional&#40;string&#41;&#10; disk_size_gb &#61; optional&#40;number&#41;&#10; disk_type &#61; optional&#40;string&#41;&#10; ephemeral_ssd_count &#61; optional&#40;number&#41;&#10; gcfs &#61; optional&#40;bool, false&#41;&#10; guest_accelerator &#61; optional&#40;object&#40;&#123;&#10; count &#61; number&#10; type &#61; string&#10; gpu_driver &#61; optional&#40;object&#40;&#123;&#10; version &#61; string&#10; partition_size &#61; optional&#40;string&#41;&#10; max_shared_clients_per_gpu &#61; optional&#40;number&#41;&#10; &#125;&#41;&#41;&#10; &#125;&#41;&#41;&#10; local_nvme_ssd_count &#61; optional&#40;number&#41;&#10; gvnic &#61; optional&#40;bool, false&#41;&#10; image_type &#61; optional&#40;string&#41;&#10; kubelet_config &#61; optional&#40;object&#40;&#123;&#10; cpu_manager_policy &#61; string&#10; cpu_cfs_quota &#61; optional&#40;bool&#41;&#10; cpu_cfs_quota_period &#61; optional&#40;string&#41;&#10; pod_pids_limit &#61; optional&#40;number&#41;&#10; &#125;&#41;&#41;&#10; linux_node_config &#61; optional&#40;object&#40;&#123;&#10; sysctls &#61; optional&#40;map&#40;string&#41;&#41;&#10; cgroup_mode &#61; optional&#40;string&#41;&#10; &#125;&#41;&#41;&#10; local_ssd_count &#61; optional&#40;number&#41;&#10; machine_type &#61; optional&#40;string&#41;&#10; metadata &#61; optional&#40;map&#40;string&#41;&#41;&#10; min_cpu_platform &#61; optional&#40;string&#41;&#10; preemptible &#61; optional&#40;bool&#41;&#10; sandbox_config_gvisor &#61; optional&#40;bool&#41;&#10; shielded_instance_config &#61; optional&#40;object&#40;&#123;&#10; enable_integrity_monitoring &#61; optional&#40;bool&#41;&#10; enable_secure_boot &#61; optional&#40;bool&#41;&#10; &#125;&#41;&#41;&#10; spot &#61; optional&#40;bool&#41;&#10; workload_metadata_config_mode &#61; 
optional&#40;string&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code title="&#123;&#10; disk_type &#61; &#34;pd-balanced&#34;&#10;&#125;">&#123;&#8230;&#125;</code> |
| [node_count](variables.tf#L124) | Number of nodes per instance group. Initial value can only be changed by recreation, current is ignored when autoscaling is used. | <code title="object&#40;&#123;&#10; current &#61; optional&#40;number&#41;&#10; initial &#61; number&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code title="&#123;&#10; initial &#61; 1&#10;&#125;">&#123;&#8230;&#125;</code> | | [node_count](variables.tf#L124) | Number of nodes per instance group. Initial value can only be changed by recreation, current is ignored when autoscaling is used. | <code title="object&#40;&#123;&#10; current &#61; optional&#40;number&#41;&#10; initial &#61; number&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code title="&#123;&#10; initial &#61; 1&#10;&#125;">&#123;&#8230;&#125;</code> |
| [node_locations](variables.tf#L136) | Node locations. | <code>list&#40;string&#41;</code> | | <code>null</code> | | [node_locations](variables.tf#L136) | Node locations. | <code>list&#40;string&#41;</code> | | <code>null</code> |
| [nodepool_config](variables.tf#L142) | Nodepool-level configuration. | <code title="object&#40;&#123;&#10; autoscaling &#61; optional&#40;object&#40;&#123;&#10; location_policy &#61; optional&#40;string&#41;&#10; max_node_count &#61; optional&#40;number&#41;&#10; min_node_count &#61; optional&#40;number&#41;&#10; use_total_nodes &#61; optional&#40;bool, false&#41;&#10; &#125;&#41;&#41;&#10; management &#61; optional&#40;object&#40;&#123;&#10; auto_repair &#61; optional&#40;bool&#41;&#10; auto_upgrade &#61; optional&#40;bool&#41;&#10; &#125;&#41;&#41;&#10; upgrade_settings &#61; optional&#40;object&#40;&#123;&#10; max_surge &#61; number&#10; max_unavailable &#61; number&#10; &#125;&#41;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>null</code> | | [nodepool_config](variables.tf#L142) | Nodepool-level configuration. | <code title="object&#40;&#123;&#10; autoscaling &#61; optional&#40;object&#40;&#123;&#10; location_policy &#61; optional&#40;string&#41;&#10; max_node_count &#61; optional&#40;number&#41;&#10; min_node_count &#61; optional&#40;number&#41;&#10; use_total_nodes &#61; optional&#40;bool, false&#41;&#10; &#125;&#41;&#41;&#10; management &#61; optional&#40;object&#40;&#123;&#10; auto_repair &#61; optional&#40;bool&#41;&#10; auto_upgrade &#61; optional&#40;bool&#41;&#10; &#125;&#41;&#41;&#10; placement_policy &#61; optional&#40;object&#40;&#123;&#10; type &#61; string&#10; policy_name &#61; optional&#40;string&#41;&#10; tpu_topology &#61; optional&#40;string&#41;&#10; &#125;&#41;&#41;&#10; upgrade_settings &#61; optional&#40;object&#40;&#123;&#10; max_surge &#61; number&#10; max_unavailable &#61; number&#10; &#125;&#41;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>null</code> |
| [pod_range](variables.tf#L164) | Pod secondary range configuration. | <code title="object&#40;&#123;&#10; secondary_pod_range &#61; object&#40;&#123;&#10; name &#61; string&#10; cidr &#61; optional&#40;string&#41;&#10; create &#61; optional&#40;bool&#41;&#10; enable_private_nodes &#61; optional&#40;bool&#41;&#10; &#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>null</code> | | [pod_range](variables.tf#L168) | Pod secondary range configuration. | <code title="object&#40;&#123;&#10; secondary_pod_range &#61; object&#40;&#123;&#10; name &#61; string&#10; cidr &#61; optional&#40;string&#41;&#10; create &#61; optional&#40;bool&#41;&#10; enable_private_nodes &#61; optional&#40;bool&#41;&#10; &#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>null</code> |
| [reservation_affinity](variables.tf#L182) | Configuration of the desired reservation which instances could take capacity from. | <code title="object&#40;&#123;&#10; consume_reservation_type &#61; string&#10; key &#61; optional&#40;string&#41;&#10; values &#61; optional&#40;list&#40;string&#41;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>null</code> | | [reservation_affinity](variables.tf#L186) | Configuration of the desired reservation which instances could take capacity from. | <code title="object&#40;&#123;&#10; consume_reservation_type &#61; string&#10; key &#61; optional&#40;string&#41;&#10; values &#61; optional&#40;list&#40;string&#41;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>null</code> |
| [service_account](variables.tf#L192) | Nodepool service account. If this variable is set to null, the default GCE service account will be used. If set and email is null, a service account will be created. If scopes are null a default will be used. | <code title="object&#40;&#123;&#10; create &#61; optional&#40;bool, false&#41;&#10; email &#61; optional&#40;string&#41;&#10; oauth_scopes &#61; optional&#40;list&#40;string&#41;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> | | [service_account](variables.tf#L196) | Nodepool service account. If this variable is set to null, the default GCE service account will be used. If set and email is null, a service account will be created. If scopes are null a default will be used. | <code title="object&#40;&#123;&#10; create &#61; optional&#40;bool, false&#41;&#10; email &#61; optional&#40;string&#41;&#10; oauth_scopes &#61; optional&#40;list&#40;string&#41;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [sole_tenant_nodegroup](variables.tf#L203) | Sole tenant node group. | <code>string</code> | | <code>null</code> | | [sole_tenant_nodegroup](variables.tf#L207) | Sole tenant node group. | <code>string</code> | | <code>null</code> |
| [tags](variables.tf#L209) | Network tags applied to nodes. | <code>list&#40;string&#41;</code> | | <code>null</code> | | [tags](variables.tf#L213) | Network tags applied to nodes. | <code>list&#40;string&#41;</code> | | <code>null</code> |
| [taints](variables.tf#L215) | Kubernetes taints applied to all nodes. | <code title="map&#40;object&#40;&#123;&#10; value &#61; string&#10; effect &#61; string&#10;&#125;&#41;&#41;">map&#40;object&#40;&#123;&#8230;&#125;&#41;&#41;</code> | | <code>&#123;&#125;</code> | | [taints](variables.tf#L219) | Kubernetes taints applied to all nodes. | <code title="map&#40;object&#40;&#123;&#10; value &#61; string&#10; effect &#61; string&#10;&#125;&#41;&#41;">map&#40;object&#40;&#123;&#8230;&#125;&#41;&#41;</code> | | <code>&#123;&#125;</code> |
## Outputs ## Outputs

View File

@ -77,7 +77,6 @@ resource "google_container_node_pool" "nodepool" {
initial_node_count = var.node_count.initial initial_node_count = var.node_count.initial
node_count = var.node_count.current node_count = var.node_count.current
node_locations = var.node_locations node_locations = var.node_locations
# placement_policy = var.nodepool_config.placement_policy
dynamic "autoscaling" { dynamic "autoscaling" {
for_each = ( for_each = (
@ -129,6 +128,15 @@ resource "google_container_node_pool" "nodepool" {
} }
} }
# Emit a placement_policy block only when nodepool_config.placement_policy is
# set. try() is kept so that a null nodepool_config (the variable's documented
# default) evaluates to null here instead of raising an attribute error.
dynamic "placement_policy" {
# [""] is a single placeholder element; the iterator value itself is unused.
for_each = try(var.nodepool_config.placement_policy, null) != null ? [""] : []
content {
# type is the only required attribute of the object type; policy_name and
# tpu_topology are declared optional(string), so they are null when omitted.
type = var.nodepool_config.placement_policy.type
policy_name = var.nodepool_config.placement_policy.policy_name
tpu_topology = var.nodepool_config.placement_policy.tpu_topology
}
}
node_config { node_config {
boot_disk_kms_key = var.node_config.boot_disk_kms_key boot_disk_kms_key = var.node_config.boot_disk_kms_key
disk_size_gb = var.node_config.disk_size_gb disk_size_gb = var.node_config.disk_size_gb

View File

@ -152,7 +152,11 @@ variable "nodepool_config" {
auto_repair = optional(bool) auto_repair = optional(bool)
auto_upgrade = optional(bool) auto_upgrade = optional(bool)
})) }))
# placement_policy = optional(bool) placement_policy = optional(object({
type = string
policy_name = optional(string)
tpu_topology = optional(string)
}))
upgrade_settings = optional(object({ upgrade_settings = optional(object({
max_surge = number max_surge = number
max_unavailable = number max_unavailable = number