# Google Cloud Dataproc This module manages a Google Cloud [Dataproc](https://cloud.google.com/dataproc) cluster resource, including IAM. - [TODO](#todo) - [Examples](#examples) - [Simple](#simple) - [Cluster configuration on GCE](#cluster-configuration-on-gce) - [Cluster configuration on GCE with CMEK encryption](#cluster-configuration-on-gce-with-cmek-encryption) - [Cluster configuration on GKE](#cluster-configuration-on-gke) - [IAM](#iam) - [Authoritative IAM](#authoritative-iam) - [Additive IAM](#additive-iam) - [Variables](#variables) - [Outputs](#outputs) - [Fixtures](#fixtures) ## TODO - [ ] Add support for Cloud Dataproc [autoscaling policy](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/dataproc_autoscaling_policy_iam). ## Examples ### Simple ```hcl module "dataproc-cluster" { source = "./fabric/modules/dataproc" project_id = var.project_id name = "my-cluster" region = var.region } # tftest modules=1 resources=1 ``` ### Cluster configuration on GCE To set the cluster configuration, use the `dataproc_config.cluster_config` variable. If you don't want to use a dedicated service account, remember to grant `roles/dataproc.worker` to the Compute Default Service Account. ```hcl module "dataproc-service-account" { source = "./fabric/modules/iam-service-account" project_id = var.project_id name = "dataproc-worker" iam_project_roles = { (var.project_id) = ["roles/dataproc.worker"] } } module "firewall" { source = "./fabric/modules/net-vpc-firewall" project_id = var.project_id network = var.vpc.name ingress_rules = { allow-ingress-dataproc = { description = "Allow all traffic between Dataproc nodes." 
targets = ["dataproc"] sources = ["dataproc"] } } } module "processing-dp-cluster" { source = "./fabric/modules/dataproc" project_id = var.project_id name = "my-cluster" region = var.region dataproc_config = { cluster_config = { gce_cluster_config = { internal_ip_only = true service_account = module.dataproc-service-account.email service_account_scopes = ["cloud-platform"] subnetwork = var.subnet.self_link tags = ["dataproc"] zone = "${var.region}-b" } } } depends_on = [ module.dataproc-service-account, # ensure all grants are done before creating the cluster ] } # tftest modules=3 resources=7 e2e ``` ### Cluster configuration on GCE with CMEK encryption To configure the cluster with a Customer Managed Encryption Key, set the `dataproc_config.encryption_config` variable. The Compute Engine service agent and the Cloud Storage service agent need to have the `CryptoKey Encrypter/Decrypter` role on the configured KMS key ([Documentation](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/customer-managed-encryption)). ```hcl module "dataproc-service-account" { source = "./fabric/modules/iam-service-account" project_id = var.project_id name = "dataproc-worker" iam_project_roles = { (var.project_id) = ["roles/dataproc.worker", "roles/cloudkms.cryptoKeyEncrypterDecrypter"] } } module "firewall" { source = "./fabric/modules/net-vpc-firewall" project_id = var.project_id network = var.vpc.name ingress_rules = { allow-ingress-dataproc = { description = "Allow all traffic between Dataproc nodes." 
targets = ["dataproc"] sources = ["dataproc"] } } } module "processing-dp-cluster" { source = "./fabric/modules/dataproc" project_id = var.project_id name = "my-cluster" region = var.region dataproc_config = { cluster_config = { gce_cluster_config = { internal_ip_only = true service_account = module.dataproc-service-account.email service_account_scopes = ["cloud-platform"] subnetwork = var.subnet.self_link tags = ["dataproc"] zone = "${var.region}-b" } } encryption_config = { kms_key_name = var.kms_key.id } } depends_on = [ module.dataproc-service-account, # ensure all grants are done before creating the cluster ] } # tftest modules=3 resources=8 e2e ``` ### Cluster configuration on GKE To set the cluster configuration on GKE, use the `dataproc_config.virtual_cluster_config` variable. This example shows the usage of a [dedicated Service Account](https://cloud.google.com/dataproc/docs/guides/dpgke/dataproc-gke-iam#custom_iam_configuration). ```hcl locals { dataproc_namespace = "foobar" } module "dataproc-service-account" { source = "./fabric/modules/iam-service-account" project_id = var.project_id name = "dataproc-worker" iam = { "roles/iam.workloadIdentityUser" = [ "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/agent]", "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/spark-driver]", "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/spark-executor]" ] } iam_project_roles = { (var.project_id) = ["roles/dataproc.worker"] } depends_on = [ module.gke-cluster-standard, # granting workloadIdentityUser requires cluster/pool to be created first ] } module "processing-dp-cluster" { source = "./fabric/modules/dataproc" project_id = var.project_id name = "my-dataproc-cluster" region = var.region dataproc_config = { virtual_cluster_config = { kubernetes_cluster_config = { kubernetes_namespace = local.dataproc_namespace kubernetes_software_config = { component_version = { "SPARK" : "3.1-dataproc-14" } properties = { 
"dataproc:dataproc.gke.agent.google-service-account" = module.dataproc-service-account.email "dataproc:dataproc.gke.spark.driver.google-service-account" = module.dataproc-service-account.email "dataproc:dataproc.gke.spark.executor.google-service-account" = module.dataproc-service-account.email } } gke_cluster_config = { gke_cluster_target = module.gke-cluster-standard.id node_pool_target = { node_pool = "node-pool-name" roles = ["DEFAULT"] } } } } } } # tftest modules=4 resources=6 fixtures=fixtures/gke-cluster-standard.tf e2e ``` ## IAM IAM is managed via several variables that implement different features and levels of control: - `iam` and `iam_by_principals` configure authoritative bindings that manage individual roles exclusively, and are internally merged - `iam_bindings` configure authoritative bindings with optional support for conditions, and are not internally merged with the previous two variables - `iam_bindings_additive` configure additive bindings via individual role/member pairs with optional support for conditions The authoritative and additive approaches can be used together, provided different roles are managed by each. Some care must also be taken with the `iam_by_principals` variable to ensure that variable keys are static values, so that Terraform is able to compute the dependency graph. Refer to the [project module](../project/README.md#iam) for examples of the IAM interface. 
### Authoritative IAM ```hcl module "processing-dp-cluster" { source = "./fabric/modules/dataproc" project_id = var.project_id name = "my-cluster" region = var.region iam_by_principals = { "group:gcp-data-engineers@example.net" = [ "roles/dataproc.viewer" ] } iam = { "roles/dataproc.viewer" = [ "serviceAccount:service-account@PROJECT_ID.iam.gserviceaccount.com" ] } } # tftest modules=1 resources=2 ``` ### Additive IAM ```hcl module "processing-dp-cluster" { source = "./fabric/modules/dataproc" project_id = var.project_id name = "my-cluster" region = var.region iam_bindings_additive = { am1-viewer = { member = "user:am1@example.com" role = "roles/dataproc.viewer" } } } # tftest modules=1 resources=2 ``` ## Variables | name | description | type | required | default | |---|---|:---:|:---:|:---:| | [name](variables.tf#L191) | Cluster name. | string | ✓ | | | [project_id](variables.tf#L196) | Project ID. | string | ✓ | | | [region](variables.tf#L201) | Dataproc region. | string | ✓ | | | [dataproc_config](variables.tf#L17) | Dataproc cluster config. | object({…}) | | {} | | [iam](variables-iam.tf#L24) | IAM bindings in {ROLE => [MEMBERS]} format. | map(list(string)) | | {} | | [iam_bindings](variables-iam.tf#L31) | Authoritative IAM bindings in {KEY => {role = ROLE, members = [], condition = {}}}. Keys are arbitrary. | map(object({…})) | | {} | | [iam_bindings_additive](variables-iam.tf#L46) | Individual additive IAM bindings. Keys are arbitrary. | map(object({…})) | | {} | | [iam_by_principals](variables-iam.tf#L17) | Authoritative IAM binding in {PRINCIPAL => [ROLES]} format. Principals need to be statically defined to avoid cycle errors. Merged internally with the `iam` variable. | map(list(string)) | | {} | | [labels](variables.tf#L185) | The resource labels for instance to use to annotate any related underlying resources, such as Compute Engine VMs. 
| map(string) | | {} | ## Outputs | name | description | sensitive | |---|---|:---:| | [id](outputs.tf#L30) | Fully qualified cluster id. | | | [name](outputs.tf#L45) | The name of the cluster. | | ## Fixtures - [gke-cluster-standard.tf](../../tests/fixtures/gke-cluster-standard.tf)