diff --git a/examples/data-solutions/data-platform-foundations/03-composer.tf b/examples/data-solutions/data-platform-foundations/03-composer.tf
index 231d0cc5..fac47ec5 100644
--- a/examples/data-solutions/data-platform-foundations/03-composer.tf
+++ b/examples/data-solutions/data-platform-foundations/03-composer.tf
@@ -67,6 +67,7 @@ resource "google_composer_environment" "orch-cmp-0" {
       env_variables = merge(
         var.composer_config.env_variables, {
           BQ_LOCATION       = var.location
+          DATA_CAT_TAGS     = try(jsonencode(module.common-datacatalog.tags), "{}")
           DF_KMS_KEY        = try(var.service_encryption_keys.dataflow, "")
           DTL_L0_PRJ        = module.lake-0-project.project_id
           DTL_L0_BQ_DATASET = module.lake-0-bq-0.dataset_id
diff --git a/examples/data-solutions/data-platform-foundations/05-datalake.tf b/examples/data-solutions/data-platform-foundations/05-datalake.tf
index 64ec1b24..b163f9e5 100644
--- a/examples/data-solutions/data-platform-foundations/05-datalake.tf
+++ b/examples/data-solutions/data-platform-foundations/05-datalake.tf
@@ -23,6 +23,7 @@ locals {
     (local.groups.data-analysts) = [
       "roles/bigquery.dataViewer",
       "roles/bigquery.jobUser",
+      "roles/bigquery.metadataViewer",
       "roles/bigquery.user",
       "roles/datacatalog.viewer",
       "roles/datacatalog.tagTemplateViewer",
@@ -37,6 +38,7 @@ locals {
     (local.groups.data-analysts) = [
       "roles/bigquery.dataEditor",
       "roles/bigquery.jobUser",
+      "roles/bigquery.metadataViewer",
       "roles/bigquery.user",
       "roles/datacatalog.viewer",
       "roles/datacatalog.tagTemplateViewer",
@@ -44,7 +46,7 @@ locals {
     ]
   }
   lake_0_iam = {
-    "roles/bigquery.dataEditor" = [
+    "roles/bigquery.dataOwner" = [
       module.load-sa-df-0.iam_email,
       module.transf-sa-df-0.iam_email,
       module.transf-sa-bq-0.iam_email,
@@ -52,18 +54,24 @@
     "roles/bigquery.jobUser" = [
       module.load-sa-df-0.iam_email,
     ]
+    "roles/datacatalog.categoryAdmin" = [
+      module.transf-sa-bq-0.iam_email
+    ]
     "roles/storage.objectCreator" = [
       module.load-sa-df-0.iam_email,
     ]
   }
   lake_iam = {
-    "roles/bigquery.dataEditor" = [
+    "roles/bigquery.dataOwner" = [
       module.transf-sa-df-0.iam_email,
       module.transf-sa-bq-0.iam_email,
     ]
     "roles/bigquery.jobUser" = [
       module.transf-sa-bq-0.iam_email,
     ]
+    "roles/datacatalog.categoryAdmin" = [
+      module.load-sa-df-0.iam_email
+    ]
     "roles/storage.objectCreator" = [
       module.transf-sa-df-0.iam_email,
     ]
diff --git a/examples/data-solutions/data-platform-foundations/06-common.tf b/examples/data-solutions/data-platform-foundations/06-common.tf
index cc18a46f..80451500 100644
--- a/examples/data-solutions/data-platform-foundations/06-common.tf
+++ b/examples/data-solutions/data-platform-foundations/06-common.tf
@@ -21,6 +21,9 @@ module "common-project" {
   prefix = var.prefix
   name   = "cmn${local.project_suffix}"
   group_iam = {
+    (local.groups.data-analysts) = [
+      "roles/datacatalog.viewer",
+    ]
     (local.groups.data-engineers) = [
       "roles/dlp.reader",
       "roles/dlp.user",
@@ -28,6 +31,7 @@ module "common-project" {
     ]
     (local.groups.data-security) = [
       "roles/dlp.admin",
+      "roles/datacatalog.admin"
     ]
   }
   iam = {
@@ -35,6 +39,17 @@
       module.load-sa-df-0.iam_email,
       module.transf-sa-df-0.iam_email
     ]
+    "roles/datacatalog.viewer" = [
+      module.load-sa-df-0.iam_email,
+      module.transf-sa-df-0.iam_email,
+      module.transf-sa-bq-0.iam_email
+    ]
+    "roles/datacatalog.categoryFineGrainedReader" = [
+      module.transf-sa-df-0.iam_email,
+      module.transf-sa-bq-0.iam_email,
+      # Uncomment to grant the `data-analysts` group access to all tagged columns.
+      # local.groups_iam.data-analysts
+    ]
   }
   services = concat(var.project_services, [
     "datacatalog.googleapis.com",
   ])
 }

+# Data Catalog policy tags
+
+module "common-datacatalog" {
+  source     = "../../../modules/data-catalog-policy-tag"
+  project_id = module.common-project.project_id
+  name       = "${var.prefix}-datacatalog-policy-tags"
+  location   = var.location
+  tags       = var.data_catalog_tags
+}
+
 # To create KMS keys in the common project: uncomment this section and assign key links accordingly in the local.service_encryption_keys variable

 # module "cmn-kms-0" {
diff --git a/examples/data-solutions/data-platform-foundations/IAM.md b/examples/data-solutions/data-platform-foundations/IAM.md
index aed1c405..d6ccbecb 100644
--- a/examples/data-solutions/data-platform-foundations/IAM.md
+++ b/examples/data-solutions/data-platform-foundations/IAM.md
@@ -6,45 +6,53 @@ Legend: + additive, conditional.
| members | roles |
|---|---|
+|gcp-data-analysts<br>
group|[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) | |gcp-data-engineers
group|[roles/dlp.estimatesAdmin](https://cloud.google.com/iam/docs/understanding-roles#dlp.estimatesAdmin)
[roles/dlp.reader](https://cloud.google.com/iam/docs/understanding-roles#dlp.reader)
[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) | -|gcp-data-security
group|[roles/dlp.admin](https://cloud.google.com/iam/docs/understanding-roles#dlp.admin) | -|load-df-0
serviceAccount|[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) | -|trf-df-0
serviceAccount|[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) | +|gcp-data-security
group|[roles/datacatalog.admin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.admin)
[roles/dlp.admin](https://cloud.google.com/iam/docs/understanding-roles#dlp.admin) | +|load-df-0
serviceAccount|[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) | +|trf-bq-0
serviceAccount|[roles/datacatalog.categoryFineGrainedReader](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryFineGrainedReader)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) | +|trf-df-0
serviceAccount|[roles/datacatalog.categoryFineGrainedReader](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryFineGrainedReader)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) | ## Project dtl-0 | members | roles | |---|---| -|gcp-data-analysts
group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)
[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | +|gcp-data-analysts
group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/bigquery.metadataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.metadataViewer)
[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)
[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | |gcp-data-engineers
group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) | -|load-df-0
serviceAccount|[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) | -|trf-bq-0
serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) | -|trf-df-0
serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) | +|SERVICE_IDENTITY_service-networking
serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) |
+|load-df-0<br>
serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) | +|trf-bq-0
serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)
[roles/datacatalog.categoryAdmin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryAdmin) | +|trf-df-0
serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) | ## Project dtl-1 | members | roles | |---|---| -|gcp-data-analysts
group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)
[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | +|gcp-data-analysts
group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/bigquery.metadataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.metadataViewer)
[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)
[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | |gcp-data-engineers
group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) | -|trf-bq-0
serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) | -|trf-df-0
serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | +|SERVICE_IDENTITY_service-networking
serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) |
+|load-df-0<br>
serviceAccount|[roles/datacatalog.categoryAdmin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryAdmin) | +|trf-bq-0
serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) | +|trf-df-0
serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)
[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | ## Project dtl-2 | members | roles | |---|---| -|gcp-data-analysts
group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)
[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | +|gcp-data-analysts
group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/bigquery.metadataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.metadataViewer)
[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)
[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | |gcp-data-engineers
group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) | -|trf-bq-0
serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) | -|trf-df-0
serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | +|SERVICE_IDENTITY_service-networking
serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) |
+|load-df-0<br>
serviceAccount|[roles/datacatalog.categoryAdmin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryAdmin) | +|trf-bq-0
serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) | +|trf-df-0
serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)
[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | ## Project dtl-plg | members | roles | |---|---| -|gcp-data-analysts
group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)
[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) | +|gcp-data-analysts
group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/bigquery.metadataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.metadataViewer)
[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)
[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) | |gcp-data-engineers
group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) | +|SERVICE_IDENTITY_service-networking
serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) |

## Project lnd

@@ -62,6 +70,8 @@ Legend: + additive, conditional.
| members | roles |
|---|---|
|gcp-data-engineers<br>
group|[roles/compute.viewer](https://cloud.google.com/iam/docs/understanding-roles#compute.viewer)
[roles/dataflow.admin](https://cloud.google.com/iam/docs/understanding-roles#dataflow.admin)
[roles/dataflow.developer](https://cloud.google.com/iam/docs/understanding-roles#dataflow.developer)
[roles/viewer](https://cloud.google.com/iam/docs/understanding-roles#viewer) | +|SERVICE_IDENTITY_dataflow-service-producer-prod
serviceAccount|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) | +|SERVICE_IDENTITY_service-networking
serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) |
 |load-df-0<br>
serviceAccount|[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/dataflow.admin](https://cloud.google.com/iam/docs/understanding-roles#dataflow.admin)
[roles/dataflow.worker](https://cloud.google.com/iam/docs/understanding-roles#dataflow.worker)
[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) | |orc-cmp-0
serviceAccount|[roles/dataflow.admin](https://cloud.google.com/iam/docs/understanding-roles#dataflow.admin) | @@ -69,7 +79,9 @@ Legend: + additive, conditional. | members | roles | |---|---| -|gcp-data-engineers
group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/cloudbuild.builds.editor](https://cloud.google.com/iam/docs/understanding-roles#cloudbuild.builds.editor)
[roles/composer.admin](https://cloud.google.com/iam/docs/understanding-roles#composer.admin)
[roles/composer.environmentAndStorageObjectAdmin](https://cloud.google.com/iam/docs/understanding-roles#composer.environmentAndStorageObjectAdmin)
[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser)
[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser)
[roles/iap.httpsResourceAccessor](https://cloud.google.com/iam/docs/understanding-roles#iap.httpsResourceAccessor)
[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin)
[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) | +|gcp-data-engineers
group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/cloudbuild.builds.editor](https://cloud.google.com/iam/docs/understanding-roles#cloudbuild.builds.editor)
[roles/composer.admin](https://cloud.google.com/iam/docs/understanding-roles#composer.admin)
[roles/composer.environmentAndStorageObjectAdmin](https://cloud.google.com/iam/docs/understanding-roles#composer.environmentAndStorageObjectAdmin)
[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser)
[roles/iap.httpsResourceAccessor](https://cloud.google.com/iam/docs/understanding-roles#iap.httpsResourceAccessor)
[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin)
[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) | +|SERVICE_IDENTITY_cloudcomposer-accounts
serviceAccount|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) | +|SERVICE_IDENTITY_service-networking
serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) |
 |load-df-0<br>
serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | |orc-cmp-0
serviceAccount|[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/composer.worker](https://cloud.google.com/iam/docs/understanding-roles#composer.worker)
[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser)
[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) | |trf-df-0
serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) | @@ -79,6 +91,8 @@ Legend: + additive, conditional. | members | roles | |---|---| |gcp-data-engineers
group|[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/dataflow.admin](https://cloud.google.com/iam/docs/understanding-roles#dataflow.admin) | +|SERVICE_IDENTITY_dataflow-service-producer-prod
serviceAccount|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) | +|SERVICE_IDENTITY_service-networking
serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) |
 |orc-cmp-0<br>
serviceAccount|[roles/dataflow.admin](https://cloud.google.com/iam/docs/understanding-roles#dataflow.admin) | |trf-bq-0
serviceAccount|[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) | |trf-df-0
serviceAccount|[roles/dataflow.worker](https://cloud.google.com/iam/docs/understanding-roles#dataflow.worker)
[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
diff --git a/examples/data-solutions/data-platform-foundations/README.md b/examples/data-solutions/data-platform-foundations/README.md
index 8b243251..09f8e63a 100644
--- a/examples/data-solutions/data-platform-foundations/README.md
+++ b/examples/data-solutions/data-platform-foundations/README.md
@@ -154,6 +154,19 @@ Cloud Data Loss Prevention resources and templates should be stored in the secur

 You can find more details and best practices on using DLP to De-identification and re-identification of PII in large-scale datasets in the [GCP documentation](https://cloud.google.com/architecture/de-identification-re-identification-pii-using-cloud-dlp).

+## Data Catalog
+
+[Data Catalog](https://cloud.google.com/data-catalog) helps you document your data entries at scale. Data Catalog relies on [tags](https://cloud.google.com/data-catalog/docs/tags-and-tag-templates#tags) and [tag templates](https://cloud.google.com/data-catalog/docs/tags-and-tag-templates#tag-templates) to manage metadata for all data entries in a unified and centralized service. To implement [column-level security](https://cloud.google.com/bigquery/docs/column-level-security-intro) on BigQuery, we suggest using `Tags` and `Tag templates`.
+
+The default configuration will implement 3 tags:
+ - `3_Confidential`: policy tag for columns that include very sensitive information, such as credit card numbers.
+ - `2_Private`: policy tag for columns that include sensitive personally identifiable information (PII), such as a person's first name.
+ - `1_Sensitive`: policy tag for columns that include data that cannot be made public, such as the credit limit.
+
+Anything that is not tagged is available to all users who have access to the data warehouse.
+
+For the purpose of the example, no group has access to tagged data. You can configure tags and their associated roles with the `data_catalog_tags` variable. We suggest using the "[Best practices for using policy tags in BigQuery](https://cloud.google.com/bigquery/docs/best-practices-policy-tags)" article as a guide to designing your tag structure and access pattern.
+
 ## How to run this script

 To deploy this example on your GCP organization, you will need
@@ -207,17 +220,10 @@ To do this, you need to remove IAM binging at project-level for the `data-analys

 ## Demo pipeline

-The application layer is out of scope of this script, but as a demo, it is provided with a Cloud Composer DAG to mode data from the `landing` area to the `DataLake L2` dataset.
+The application layer is out of scope of this script. For demo purposes only, several Cloud Composer DAGs are provided. The demos import data from the `landing` area to the `DataLake L2` dataset using different features.

-Just follow the commands you find in the `demo_commands` Terraform output, go in the Cloud Composer UI and run the `data_pipeline_dag`.
+You can find examples in the [demo](./demo) folder.

-Description of commands:
-
-- 01: copy sample data to a `landing` Cloud Storage bucket impersonating the `load` service account.
-- 02: copy sample data structure definition in the `orchestration` Cloud Storage bucket impersonating the `orchestration` service account.
-- 03: copy the Cloud Composer DAG to the Cloud Composer Storage bucket impersonating the `orchestration` service account.
-- 04: Open the Cloud Composer Airflow UI and run the imported DAG.
-- 05: Run the BigQuery query to see results.
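The following minimal sketch (an editorial illustration, not part of the diff) shows how the demo DAGs consume the `DATA_CAT_TAGS` environment variable that `03-composer.tf` exports above. The taxonomy resource name is a made-up placeholder; the schema-field pattern mirrors `datapipeline_dc_tags.py` further down.

```python
import json
import os

# DATA_CAT_TAGS is a JSON map of policy tag display name -> resource name,
# produced by jsonencode(module.common-datacatalog.tags). The value below is
# a hypothetical example for illustration only.
os.environ.setdefault(
    "DATA_CAT_TAGS",
    '{"2_Private": "projects/example-cmn/locations/eu/taxonomies/1/policyTags/2"}')

DATA_CAT_TAGS = json.loads(os.environ.get("DATA_CAT_TAGS", "{}"))

# Attaching a policy tag to a BigQuery schema field enforces column-level
# security: only principals with datacatalog.categoryFineGrainedReader on the
# tag (or its taxonomy) can read the column's data.
name_field = {
    "mode": "REQUIRED", "name": "name", "type": "STRING", "description": "Name",
    "policyTags": {"names": [DATA_CAT_TAGS.get("2_Private")]},
}
print(name_field)
```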
## Variables

@@ -225,17 +231,18 @@
| name | description | type | required | default |
|---|---|:---:|:---:|:---:|
| [billing_account_id](variables.tf#L17) | Billing account id. | string | ✓ |  |
-| [folder_id](variables.tf#L42) | Folder to be used for the networking resources in folders/nnnn format. | string | ✓ |  |
-| [organization_domain](variables.tf#L87) | Organization domain. | string | ✓ |  |
-| [prefix](variables.tf#L92) | Unique prefix used for resource names. | string | ✓ |  |
+| [folder_id](variables.tf#L53) | Folder to be used for the networking resources in folders/nnnn format. | string | ✓ |  |
+| [organization_domain](variables.tf#L98) | Organization domain. | string | ✓ |  |
+| [prefix](variables.tf#L103) | Unique prefix used for resource names. | string | ✓ |  |
| [composer_config](variables.tf#L22) | Cloud Composer config. | object({…}) |  | {…} |
-| [data_force_destroy](variables.tf#L36) | Flag to set 'force_destroy' on data services like BiguQery or Cloud Storage. | bool |  | false |
-| [groups](variables.tf#L53) | User groups. | map(string) |  | {…} |
-| [location](variables.tf#L47) | Location used for multi-regional resources. | string |  | "eu" |
-| [network_config](variables.tf#L63) | Shared VPC network configurations to use. If null networks will be created in projects with preconfigured values. | object({…}) |  | null |
-| [project_services](variables.tf#L97) | List of core services enabled on all projects. | list(string) |  | […] |
-| [project_suffix](variables.tf#L108) | Suffix used only for project ids. | string |  | null |
-| [region](variables.tf#L114) | Region used for regional resources. | string |  | "europe-west1" |
+| [data_catalog_tags](variables.tf#L36) | List of Data Catalog Policy tags to be created with optional IAM binding configuration in {tag => {ROLE => [MEMBERS]}} format. | map(map(list(string))) |  | {…} |
+| [data_force_destroy](variables.tf#L47) | Flag to set 'force_destroy' on data services like BigQuery or Cloud Storage. | bool |  | false |
+| [groups](variables.tf#L64) | User groups. | map(string) |  | {…} |
+| [location](variables.tf#L58) | Location used for multi-regional resources. | string |  | "eu" |
+| [network_config](variables.tf#L74) | Shared VPC network configurations to use. If null networks will be created in projects with preconfigured values. | object({…}) |  | null |
+| [project_services](variables.tf#L108) | List of core services enabled on all projects. | list(string) |  | […] |
+| [project_suffix](variables.tf#L119) | Suffix used only for project ids. | string |  | null |
+| [region](variables.tf#L125) | Region used for regional resources. | string |  | "europe-west1" |
## Outputs

@@ -254,13 +261,6 @@
 Features to add in future releases:

-- Add support for Column level access on BigQuery
-- Add example templates for Data Catalog
 - Add example on how to use Cloud Data Loss Prevention
 - Add solution to handle Tables, Views, and Authorized Views lifecycle
 - Add solution to handle Metadata lifecycle
-
-## To Test/Fix
-
-- Composer require "Require OS Login" not enforced
-- External Shared-VPC
diff --git a/examples/data-solutions/data-platform-foundations/demo/README.md b/examples/data-solutions/data-platform-foundations/demo/README.md
index 78297f7a..5347b2cf 100644
--- a/examples/data-solutions/data-platform-foundations/demo/README.md
+++ b/examples/data-solutions/data-platform-foundations/demo/README.md
@@ -1,3 +1,32 @@
 # Data ingestion Demo

-In this folder you can find an example to ingest data on the `data platfoem` instantiated in [here](../). See details in the [README.m](../#demo-pipeline) to run the demo.
\ No newline at end of file
+In this folder, you can find an example of how to ingest data into the `data platform` instantiated [here](../).
+
+The example is not intended to be production-ready code.
+
+## Demo use case
+The demo imports purchase data generated by a store.
+
+## Input files
+Data are uploaded to the `landing` GCS bucket. File structure (sample rows are sketched after this diff):
+ - `customers.csv`: comma-separated values file with customer information in the following format: Customer ID, Name, Surname, Registration Timestamp
+ - `purchases.csv`: comma-separated values file with purchase information in the following format: Item ID, Customer ID, Item, Item price, Purchase Timestamp
+
+## Data processing pipelines
+Different data pipelines are provided to highlight different features and patterns. For the purpose of the example, a single pipeline handles the whole data lifecycle. When adapting them to your real use case, you may want to evaluate the option of handling each functional step in a separate pipeline or with a dedicated tool. For example, you may want to use `Dataform` to handle the data schema lifecycle.
+
+Below you can find a description of each example:
+ - Simple data import: [`datapipeline.py`](./datapipeline.py) is a simple pipeline that imports the provided data from the `landing` Google Cloud Storage bucket to the Data Hub L2 layer, joining the `customers` and `purchases` tables into the `customer_purchase` table.
+ - Data import with policy tags: [`datapipeline_dc_tags.py`](./datapipeline_dc_tags.py) imports the provided data from the `landing` bucket to the Data Hub L2 layer, protecting sensitive data with Data Catalog policy tags.
+ - Delete tables: [`delete_table.py`](./delete_table.py) deletes the BigQuery tables created by the import pipelines.
+
+## Running the demo
+To run the demo examples, follow these steps:
+
+- 01: copy sample data to the `landing` Cloud Storage bucket impersonating the `load` service account.
+- 02: copy the sample data structure definition to the `orchestration` Cloud Storage bucket impersonating the `orchestration` service account.
+- 03: copy the Cloud Composer DAGs to the Cloud Composer Storage bucket impersonating the `orchestration` service account.
+- 04: Open the Cloud Composer Airflow UI and run the imported DAG.
+- 05: Run the BigQuery query to see results.
+
+You can find pre-computed commands in the `demo_commands` output variable of the deployed Terraform [data platform](../).
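To make the documented file layouts concrete, here is a small illustrative sketch; the rows are invented sample values that follow the formats above, not the demo's actual data files.

```python
import csv
import sys

# customers.csv: Customer ID, Name, Surname, Registration Timestamp
customers = [
    [1, "Mary", "Major", "2022-01-01T00:00:00Z"],
    [2, "John", "Doe", "2022-01-02T00:00:00Z"],
]

# purchases.csv: Item ID, Customer ID, Item, Item price, Purchase Timestamp
purchases = [
    [1, 1, "tv", 499.99, "2022-01-03T10:00:00Z"],
    [2, 2, "radio", 25.50, "2022-01-03T11:00:00Z"],
]

# Write both datasets in the comma-separated layout the import pipelines expect.
csv.writer(sys.stdout).writerows(customers)
csv.writer(sys.stdout).writerows(purchases)
```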
diff --git a/examples/data-solutions/data-platform-foundations/demo/datapipeline.py b/examples/data-solutions/data-platform-foundations/demo/datapipeline.py
index fd633ebd..1f748c08 100644
--- a/examples/data-solutions/data-platform-foundations/demo/datapipeline.py
+++ b/examples/data-solutions/data-platform-foundations/demo/datapipeline.py
@@ -19,18 +19,21 @@
 import csv
 import datetime
 import io
+import json
 import logging
 import os

 from airflow import models
-from airflow.contrib.operators.dataflow_operator import DataflowTemplateOperator
+from airflow.providers.google.cloud.operators.dataflow import DataflowTemplatedJobStartOperator
 from airflow.operators import dummy
-from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
+from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator, BigQueryUpsertTableOperator, BigQueryUpdateTableSchemaOperator
+from airflow.utils.task_group import TaskGroup

 # --------------------------------------------------------------------------------
-# Set variables
-# ------------------------------------------------------------
+# Set variables - Needed for the DEMO
+# --------------------------------------------------------------------------------
 BQ_LOCATION = os.environ.get("BQ_LOCATION")
+DATA_CAT_TAGS = json.loads(os.environ.get("DATA_CAT_TAGS"))
 DTL_L0_PRJ = os.environ.get("DTL_L0_PRJ")
 DTL_L0_BQ_DATASET = os.environ.get("DTL_L0_BQ_DATASET")
 DTL_L0_GCS = os.environ.get("DTL_L0_GCS")
@@ -84,7 +87,6 @@ default_args = {
     'retries': 1,
     'retry_delay': datetime.timedelta(minutes=5),
     'dataflow_default_options': {
-        'project': LOD_PRJ,
         'location': DF_REGION,
         'zone': DF_ZONE,
         'stagingLocation': LOD_GCS_STAGING,
@@ -114,9 +116,13 @@ with models.DAG(
         trigger_rule='all_success'
     )

-    customers_import = DataflowTemplateOperator(
-        task_id="dataflow_customer_import",
+    # BigQuery tables are created automatically for demo purposes.
+    # Consider a dedicated pipeline or tool for a real-life scenario.
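+    # Note: DataflowTemplatedJobStartOperator from the google provider package
+    # replaces the deprecated airflow.contrib DataflowTemplateOperator; it takes
+    # project_id and location explicitly, which is why 'project' was removed
+    # from dataflow_default_options above.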
+ customers_import = DataflowTemplatedJobStartOperator( + task_id="dataflow_customers_import", template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery", + project_id=LOD_PRJ, + location=DF_REGION, parameters={ "javascriptTextTransformFunctionName": "transform", "JSONPath": ORC_GCS + "/customers_schema.json", @@ -127,9 +133,11 @@ with models.DAG( }, ) - purchases_import = DataflowTemplateOperator( + purchases_import = DataflowTemplatedJobStartOperator( task_id="dataflow_purchases_import", template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery", + project_id=LOD_PRJ, + location=DF_REGION, parameters={ "javascriptTextTransformFunctionName": "transform", "JSONPath": ORC_GCS + "/purchases_schema.json", @@ -180,13 +188,13 @@ with models.DAG( 'jobType':'QUERY', 'query':{ 'query':"""SELECT - customer_id, - purchase_id, - name, - surname, - item, - price, - timestamp + customer_id, + purchase_id, + name, + surname, + item, + price, + timestamp FROM `{dtl_1_prj}.{dtl_1_dataset}.customer_purchase` """.format(dtl_1_prj=DTL_L1_PRJ, dtl_1_dataset=DTL_L1_BQ_DATASET, ), 'destinationTable':{ @@ -201,4 +209,4 @@ with models.DAG( impersonation_chain=[TRF_SA_BQ] ) - start >> [customers_import, purchases_import] >> join_customer_purchase >> l2_customer_purchase >> end + start >> [customers_import, purchases_import] >> join_customer_purchase >> l2_customer_purchase >> end \ No newline at end of file diff --git a/examples/data-solutions/data-platform-foundations/demo/datapipeline_dc_tags.py b/examples/data-solutions/data-platform-foundations/demo/datapipeline_dc_tags.py new file mode 100644 index 00000000..2fb88c9e --- /dev/null +++ b/examples/data-solutions/data-platform-foundations/demo/datapipeline_dc_tags.py @@ -0,0 +1,322 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# -------------------------------------------------------------------------------- +# Load The Dependencies +# -------------------------------------------------------------------------------- + +import csv +import datetime +import io +import json +import logging +import os + +from airflow import models +from airflow.providers.google.cloud.operators.dataflow import DataflowTemplatedJobStartOperator +from airflow.operators import dummy +from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator, BigQueryUpsertTableOperator, BigQueryUpdateTableSchemaOperator +from airflow.utils.task_group import TaskGroup + +# -------------------------------------------------------------------------------- +# Set variables - Needed for the DEMO +# -------------------------------------------------------------------------------- +BQ_LOCATION = os.environ.get("BQ_LOCATION") +DATA_CAT_TAGS = json.loads(os.environ.get("DATA_CAT_TAGS")) +DTL_L0_PRJ = os.environ.get("DTL_L0_PRJ") +DTL_L0_BQ_DATASET = os.environ.get("DTL_L0_BQ_DATASET") +DTL_L0_GCS = os.environ.get("DTL_L0_GCS") +DTL_L1_PRJ = os.environ.get("DTL_L1_PRJ") +DTL_L1_BQ_DATASET = os.environ.get("DTL_L1_BQ_DATASET") +DTL_L1_GCS = os.environ.get("DTL_L1_GCS") +DTL_L2_PRJ = os.environ.get("DTL_L2_PRJ") +DTL_L2_BQ_DATASET = os.environ.get("DTL_L2_BQ_DATASET") +DTL_L2_GCS = os.environ.get("DTL_L2_GCS") +DTL_PLG_PRJ = os.environ.get("DTL_PLG_PRJ") +DTL_PLG_BQ_DATASET = os.environ.get("DTL_PLG_BQ_DATASET") +DTL_PLG_GCS = os.environ.get("DTL_PLG_GCS") +GCP_REGION = os.environ.get("GCP_REGION") +LND_PRJ = os.environ.get("LND_PRJ") +LND_BQ = os.environ.get("LND_BQ") +LND_GCS = os.environ.get("LND_GCS") +LND_PS = os.environ.get("LND_PS") +LOD_PRJ = os.environ.get("LOD_PRJ") +LOD_GCS_STAGING = os.environ.get("LOD_GCS_STAGING") +LOD_NET_VPC = os.environ.get("LOD_NET_VPC") +LOD_NET_SUBNET = os.environ.get("LOD_NET_SUBNET") +LOD_SA_DF = os.environ.get("LOD_SA_DF") +ORC_PRJ = os.environ.get("ORC_PRJ") +ORC_GCS = os.environ.get("ORC_GCS") +TRF_PRJ = os.environ.get("TRF_PRJ") +TRF_GCS_STAGING = os.environ.get("TRF_GCS_STAGING") +TRF_NET_VPC = os.environ.get("TRF_NET_VPC") +TRF_NET_SUBNET = os.environ.get("TRF_NET_SUBNET") +TRF_SA_DF = os.environ.get("TRF_SA_DF") +TRF_SA_BQ = os.environ.get("TRF_SA_BQ") +DF_KMS_KEY = os.environ.get("DF_KMS_KEY", "") +DF_REGION = os.environ.get("GCP_REGION") +DF_ZONE = os.environ.get("GCP_REGION") + "-b" + +# -------------------------------------------------------------------------------- +# Set default arguments +# -------------------------------------------------------------------------------- + +# If you are running Airflow in more than one time zone +# see https://airflow.apache.org/docs/apache-airflow/stable/timezone.html +# for best practices +yesterday = datetime.datetime.now() - datetime.timedelta(days=1) + +default_args = { + 'owner': 'airflow', + 'start_date': yesterday, + 'depends_on_past': False, + 'email': [''], + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 1, + 'retry_delay': datetime.timedelta(minutes=5), + 'dataflow_default_options': { + 'location': DF_REGION, + 'zone': DF_ZONE, + 'stagingLocation': LOD_GCS_STAGING, + 'tempLocation': LOD_GCS_STAGING + "/tmp", + 'serviceAccountEmail': LOD_SA_DF, + 'subnetwork': LOD_NET_SUBNET, + 'ipConfiguration': "WORKER_IP_PRIVATE", + 'kmsKeyName' : DF_KMS_KEY + }, +} + +# -------------------------------------------------------------------------------- +# Main DAG +# 
--------------------------------------------------------------------------------
+
+with models.DAG(
+    'data_pipeline_dc_tags_dag',
+    default_args=default_args,
+    schedule_interval=None) as dag:
+  start = dummy.DummyOperator(
+    task_id='start',
+    trigger_rule='all_success'
+  )
+
+  end = dummy.DummyOperator(
+    task_id='end',
+    trigger_rule='all_success'
+  )
+
+  # BigQuery tables created here for demo purposes.
+  # Consider a dedicated pipeline or tool for a real-life scenario.
+  with TaskGroup('upsert_table') as upsert_table:
+    upsert_table_customers = BigQueryUpsertTableOperator(
+      task_id="upsert_table_customers",
+      project_id=DTL_L0_PRJ,
+      dataset_id=DTL_L0_BQ_DATASET,
+      impersonation_chain=[TRF_SA_DF],
+      table_resource={
+        "tableReference": {"tableId": "customers"},
+      },
+    )
+
+    upsert_table_purchases = BigQueryUpsertTableOperator(
+      task_id="upsert_table_purchases",
+      project_id=DTL_L0_PRJ,
+      dataset_id=DTL_L0_BQ_DATASET,
+      impersonation_chain=[TRF_SA_BQ],
+      table_resource={
+        "tableReference": {"tableId": "purchases"}
+      },
+    )
+
+    upsert_table_customer_purchase_l1 = BigQueryUpsertTableOperator(
+      task_id="upsert_table_customer_purchase_l1",
+      project_id=DTL_L1_PRJ,
+      dataset_id=DTL_L1_BQ_DATASET,
+      impersonation_chain=[TRF_SA_BQ],
+      table_resource={
+        "tableReference": {"tableId": "customer_purchase"}
+      },
+    )
+
+    upsert_table_customer_purchase_l2 = BigQueryUpsertTableOperator(
+      task_id="upsert_table_customer_purchase_l2",
+      project_id=DTL_L2_PRJ,
+      dataset_id=DTL_L2_BQ_DATASET,
+      impersonation_chain=[TRF_SA_BQ],
+      table_resource={
+        "tableReference": {"tableId": "customer_purchase"}
+      },
+    )
+
+  # BigQuery table schemas defined here for demo purposes.
+  # Consider a dedicated pipeline or tool for a real-life scenario.
+  with TaskGroup('update_schema_table') as update_schema_table:
+    update_table_schema_customers = BigQueryUpdateTableSchemaOperator(
+      task_id="update_table_schema_customers",
+      project_id=DTL_L0_PRJ,
+      dataset_id=DTL_L0_BQ_DATASET,
+      table_id="customers",
+      impersonation_chain=[TRF_SA_BQ],
+      include_policy_tags=True,
+      schema_fields_updates=[
+        { "mode": "REQUIRED", "name": "id", "type": "INTEGER", "description": "ID" },
+        { "mode": "REQUIRED", "name": "name", "type": "STRING", "description": "Name", "policyTags": { "names": [DATA_CAT_TAGS.get('2_Private', None)]}},
+        { "mode": "REQUIRED", "name": "surname", "type": "STRING", "description": "Surname", "policyTags": { "names": [DATA_CAT_TAGS.get('2_Private', None)]} },
+        { "mode": "REQUIRED", "name": "timestamp", "type": "TIMESTAMP", "description": "Timestamp" }
+      ]
+    )
+
+    update_table_schema_purchases = BigQueryUpdateTableSchemaOperator(
+      task_id="update_table_schema_purchases",
+      project_id=DTL_L0_PRJ,
+      dataset_id=DTL_L0_BQ_DATASET,
+      table_id="purchases",
+      impersonation_chain=[TRF_SA_BQ],
+      include_policy_tags=True,
+      schema_fields_updates=[
+        { "mode": "REQUIRED", "name": "id", "type": "INTEGER", "description": "ID" },
+        { "mode": "REQUIRED", "name": "customer_id", "type": "INTEGER", "description": "ID" },
+        { "mode": "REQUIRED", "name": "item", "type": "STRING", "description": "Item Name" },
+        { "mode": "REQUIRED", "name": "price", "type": "FLOAT", "description": "Item Price" },
+        { "mode": "REQUIRED", "name": "timestamp", "type": "TIMESTAMP", "description": "Timestamp" }
+      ]
+    )
+
+    update_table_schema_customer_purchase_l1 = BigQueryUpdateTableSchemaOperator(
+      task_id="update_table_schema_customer_purchase_l1",
+      project_id=DTL_L1_PRJ,
+      dataset_id=DTL_L1_BQ_DATASET,
+      table_id="customer_purchase",
+
impersonation_chain=[TRF_SA_BQ], + include_policy_tags=True, + schema_fields_updates=[ + { "mode": "REQUIRED", "name": "customer_id", "type": "INTEGER", "description": "ID" }, + { "mode": "REQUIRED", "name": "purchase_id", "type": "INTEGER", "description": "ID" }, + { "mode": "REQUIRED", "name": "name", "type": "STRING", "description": "Name", "policyTags": { "names": [DATA_CAT_TAGS.get('2_Private', None)]}}, + { "mode": "REQUIRED", "name": "surname", "type": "STRING", "description": "Surname", "policyTags": { "names": [DATA_CAT_TAGS.get('2_Private', None)]} }, + { "mode": "REQUIRED", "name": "item", "type": "STRING", "description": "Item Name" }, + { "mode": "REQUIRED", "name": "price", "type": "FLOAT", "description": "Item Price" }, + { "mode": "REQUIRED", "name": "timestamp", "type": "TIMESTAMP", "description": "Timestamp" } + ] + ) + + update_table_schema_customer_purchase_l2 = BigQueryUpdateTableSchemaOperator( + task_id="update_table_schema_customer_purchase_l2", + project_id=DTL_L2_PRJ, + dataset_id=DTL_L2_BQ_DATASET, + table_id="customer_purchase", + impersonation_chain=[TRF_SA_BQ], + include_policy_tags=True, + schema_fields_updates=[ + { "mode": "REQUIRED", "name": "customer_id", "type": "INTEGER", "description": "ID" }, + { "mode": "REQUIRED", "name": "purchase_id", "type": "INTEGER", "description": "ID" }, + { "mode": "REQUIRED", "name": "name", "type": "STRING", "description": "Name", "policyTags": { "names": [DATA_CAT_TAGS.get('2_Private', None)]}}, + { "mode": "REQUIRED", "name": "surname", "type": "STRING", "description": "Surname", "policyTags": { "names": [DATA_CAT_TAGS.get('2_Private', None)]} }, + { "mode": "REQUIRED", "name": "item", "type": "STRING", "description": "Item Name" }, + { "mode": "REQUIRED", "name": "price", "type": "FLOAT", "description": "Item Price" }, + { "mode": "REQUIRED", "name": "timestamp", "type": "TIMESTAMP", "description": "Timestamp" } + ] + ) + + customers_import = DataflowTemplatedJobStartOperator( + task_id="dataflow_customers_import", + template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery", + project_id=LOD_PRJ, + location=DF_REGION, + parameters={ + "javascriptTextTransformFunctionName": "transform", + "JSONPath": ORC_GCS + "/customers_schema.json", + "javascriptTextTransformGcsPath": ORC_GCS + "/customers_udf.js", + "inputFilePattern": LND_GCS + "/customers.csv", + "outputTable": DTL_L0_PRJ + ":"+DTL_L0_BQ_DATASET+".customers", + "bigQueryLoadingTemporaryDirectory": LOD_GCS_STAGING + "/tmp/bq/", + }, + ) + + purchases_import = DataflowTemplatedJobStartOperator( + task_id="dataflow_purchases_import", + template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery", + project_id=LOD_PRJ, + location=DF_REGION, + parameters={ + "javascriptTextTransformFunctionName": "transform", + "JSONPath": ORC_GCS + "/purchases_schema.json", + "javascriptTextTransformGcsPath": ORC_GCS + "/purchases_udf.js", + "inputFilePattern": LND_GCS + "/purchases.csv", + "outputTable": DTL_L0_PRJ + ":"+DTL_L0_BQ_DATASET+".purchases", + "bigQueryLoadingTemporaryDirectory": LOD_GCS_STAGING + "/tmp/bq/", + }, + ) + + join_customer_purchase = BigQueryInsertJobOperator( + task_id='bq_join_customer_purchase', + gcp_conn_id='bigquery_default', + project_id=TRF_PRJ, + location=BQ_LOCATION, + configuration={ + 'jobType':'QUERY', + 'query':{ + 'query':"""SELECT + c.id as customer_id, + p.id as purchase_id, + c.name as name, + c.surname as surname, + p.item as item, + p.price as price, + p.timestamp as timestamp + FROM `{dtl_0_prj}.{dtl_0_dataset}.customers` c + JOIN 
`{dtl_0_prj}.{dtl_0_dataset}.purchases` p ON c.id = p.customer_id + """.format(dtl_0_prj=DTL_L0_PRJ, dtl_0_dataset=DTL_L0_BQ_DATASET, ), + 'destinationTable':{ + 'projectId': DTL_L1_PRJ, + 'datasetId': DTL_L1_BQ_DATASET, + 'tableId': 'customer_purchase' + }, + 'writeDisposition':'WRITE_TRUNCATE', + "useLegacySql": False + } + }, + impersonation_chain=[TRF_SA_BQ] + ) + + l2_customer_purchase = BigQueryInsertJobOperator( + task_id='bq_l2_customer_purchase', + gcp_conn_id='bigquery_default', + project_id=TRF_PRJ, + location=BQ_LOCATION, + configuration={ + 'jobType':'QUERY', + 'query':{ + 'query':"""SELECT + customer_id, + purchase_id, + name, + surname, + item, + price, + timestamp + FROM `{dtl_1_prj}.{dtl_1_dataset}.customer_purchase` + """.format(dtl_1_prj=DTL_L1_PRJ, dtl_1_dataset=DTL_L1_BQ_DATASET, ), + 'destinationTable':{ + 'projectId': DTL_L2_PRJ, + 'datasetId': DTL_L2_BQ_DATASET, + 'tableId': 'customer_purchase' + }, + 'writeDisposition':'WRITE_TRUNCATE', + "useLegacySql": False + } + }, + impersonation_chain=[TRF_SA_BQ] + ) + start >> upsert_table >> update_schema_table >> [customers_import, purchases_import] >> join_customer_purchase >> l2_customer_purchase >> end diff --git a/examples/data-solutions/data-platform-foundations/demo/delete_table.py b/examples/data-solutions/data-platform-foundations/demo/delete_table.py new file mode 100644 index 00000000..a2585a68 --- /dev/null +++ b/examples/data-solutions/data-platform-foundations/demo/delete_table.py @@ -0,0 +1,146 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# -------------------------------------------------------------------------------- +# Load The Dependencies +# -------------------------------------------------------------------------------- + +import csv +import datetime +import io +import json +import logging +import os + +from airflow import models +from airflow.providers.google.cloud.operators.dataflow import DataflowTemplatedJobStartOperator +from airflow.operators import dummy +from airflow.providers.google.cloud.operators.bigquery import BigQueryDeleteTableOperator +from airflow.utils.task_group import TaskGroup + +# -------------------------------------------------------------------------------- +# Set variables - Needed for the DEMO +# -------------------------------------------------------------------------------- +BQ_LOCATION = os.environ.get("BQ_LOCATION") +DATA_CAT_TAGS = json.loads(os.environ.get("DATA_CAT_TAGS")) +DTL_L0_PRJ = os.environ.get("DTL_L0_PRJ") +DTL_L0_BQ_DATASET = os.environ.get("DTL_L0_BQ_DATASET") +DTL_L0_GCS = os.environ.get("DTL_L0_GCS") +DTL_L1_PRJ = os.environ.get("DTL_L1_PRJ") +DTL_L1_BQ_DATASET = os.environ.get("DTL_L1_BQ_DATASET") +DTL_L1_GCS = os.environ.get("DTL_L1_GCS") +DTL_L2_PRJ = os.environ.get("DTL_L2_PRJ") +DTL_L2_BQ_DATASET = os.environ.get("DTL_L2_BQ_DATASET") +DTL_L2_GCS = os.environ.get("DTL_L2_GCS") +DTL_PLG_PRJ = os.environ.get("DTL_PLG_PRJ") +DTL_PLG_BQ_DATASET = os.environ.get("DTL_PLG_BQ_DATASET") +DTL_PLG_GCS = os.environ.get("DTL_PLG_GCS") +GCP_REGION = os.environ.get("GCP_REGION") +LND_PRJ = os.environ.get("LND_PRJ") +LND_BQ = os.environ.get("LND_BQ") +LND_GCS = os.environ.get("LND_GCS") +LND_PS = os.environ.get("LND_PS") +LOD_PRJ = os.environ.get("LOD_PRJ") +LOD_GCS_STAGING = os.environ.get("LOD_GCS_STAGING") +LOD_NET_VPC = os.environ.get("LOD_NET_VPC") +LOD_NET_SUBNET = os.environ.get("LOD_NET_SUBNET") +LOD_SA_DF = os.environ.get("LOD_SA_DF") +ORC_PRJ = os.environ.get("ORC_PRJ") +ORC_GCS = os.environ.get("ORC_GCS") +TRF_PRJ = os.environ.get("TRF_PRJ") +TRF_GCS_STAGING = os.environ.get("TRF_GCS_STAGING") +TRF_NET_VPC = os.environ.get("TRF_NET_VPC") +TRF_NET_SUBNET = os.environ.get("TRF_NET_SUBNET") +TRF_SA_DF = os.environ.get("TRF_SA_DF") +TRF_SA_BQ = os.environ.get("TRF_SA_BQ") +DF_KMS_KEY = os.environ.get("DF_KMS_KEY", "") +DF_REGION = os.environ.get("GCP_REGION") +DF_ZONE = os.environ.get("GCP_REGION") + "-b" + +# -------------------------------------------------------------------------------- +# Set default arguments +# -------------------------------------------------------------------------------- + +# If you are running Airflow in more than one time zone +# see https://airflow.apache.org/docs/apache-airflow/stable/timezone.html +# for best practices +yesterday = datetime.datetime.now() - datetime.timedelta(days=1) + +default_args = { + 'owner': 'airflow', + 'start_date': yesterday, + 'depends_on_past': False, + 'email': [''], + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 1, + 'retry_delay': datetime.timedelta(minutes=5), + 'dataflow_default_options': { + 'location': DF_REGION, + 'zone': DF_ZONE, + 'stagingLocation': LOD_GCS_STAGING, + 'tempLocation': LOD_GCS_STAGING + "/tmp", + 'serviceAccountEmail': LOD_SA_DF, + 'subnetwork': LOD_NET_SUBNET, + 'ipConfiguration': "WORKER_IP_PRIVATE", + 'kmsKeyName' : DF_KMS_KEY + }, +} + +# -------------------------------------------------------------------------------- +# Main DAG +# -------------------------------------------------------------------------------- + +with models.DAG( + 'delete_tables_dag', + 
default_args=default_args,
+    schedule_interval=None) as dag:
+  start = dummy.DummyOperator(
+    task_id='start',
+    trigger_rule='all_success'
+  )
+
+  end = dummy.DummyOperator(
+    task_id='end',
+    trigger_rule='all_success'
+  )
+
+  # BigQuery tables deleted here for demo purposes.
+  # Consider a dedicated pipeline or tool for a real-life scenario.
+  with TaskGroup('delete_table') as delete_table:
+    delete_table_customers = BigQueryDeleteTableOperator(
+      task_id="delete_table_customers",
+      deletion_dataset_table=DTL_L0_PRJ+"."+DTL_L0_BQ_DATASET+".customers",
+      impersonation_chain=[TRF_SA_DF]
+    )
+
+    delete_table_purchases = BigQueryDeleteTableOperator(
+      task_id="delete_table_purchases",
+      deletion_dataset_table=DTL_L0_PRJ+"."+DTL_L0_BQ_DATASET+".purchases",
+      impersonation_chain=[TRF_SA_DF]
+    )
+
+    delete_table_customer_purchase_l1 = BigQueryDeleteTableOperator(
+      task_id="delete_table_customer_purchase_l1",
+      deletion_dataset_table=DTL_L1_PRJ+"."+DTL_L1_BQ_DATASET+".customer_purchase",
+      impersonation_chain=[TRF_SA_DF]
+    )
+
+    delete_table_customer_purchase_l2 = BigQueryDeleteTableOperator(
+      task_id="delete_table_customer_purchase_l2",
+      deletion_dataset_table=DTL_L2_PRJ+"."+DTL_L2_BQ_DATASET+".customer_purchase",
+      impersonation_chain=[TRF_SA_DF]
+    )
+
+  start >> delete_table >> end
diff --git a/examples/data-solutions/data-platform-foundations/outputs.tf b/examples/data-solutions/data-platform-foundations/outputs.tf
index e5a2de3e..32e98fc6 100644
--- a/examples/data-solutions/data-platform-foundations/outputs.tf
+++ b/examples/data-solutions/data-platform-foundations/outputs.tf
@@ -98,7 +98,7 @@ output "demo_commands" {
    03 = "gsutil -i ${module.orch-sa-cmp-0.email} cp demo/*.py ${google_composer_environment.orch-cmp-0.config[0].dag_gcs_prefix}/"
    04 = "Open ${google_composer_environment.orch-cmp-0.config.0.airflow_uri} and run uploaded DAG."
    05 = <+ additive, conditional.
| members | roles |
|---|---|
+|gcp-data-analysts<br>

## Project dev-data-cmn-0

| members | roles |
|---|---|
+|gcp-data-analysts<br>group|[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) |
|gcp-data-engineers<br>group|[roles/dlp.estimatesAdmin](https://cloud.google.com/iam/docs/understanding-roles#dlp.estimatesAdmin)<br>[roles/dlp.reader](https://cloud.google.com/iam/docs/understanding-roles#dlp.reader)<br>[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) |
-|gcp-data-security<br>group|[roles/dlp.admin](https://cloud.google.com/iam/docs/understanding-roles#dlp.admin) |
-|dev-data-load-df-0<br>serviceAccount|[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) |
-|dev-data-trf-df-0<br>serviceAccount|[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) |
+|gcp-data-security<br>group|[roles/datacatalog.admin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.admin)<br>[roles/dlp.admin](https://cloud.google.com/iam/docs/understanding-roles#dlp.admin) |
+|dev-data-load-df-0<br>serviceAccount|[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)<br>[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) |
+|dev-data-trf-bq-0<br>serviceAccount|[roles/datacatalog.categoryFineGrainedReader](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryFineGrainedReader)<br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) |
+|dev-data-trf-df-0<br>serviceAccount|[roles/datacatalog.categoryFineGrainedReader](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryFineGrainedReader)<br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)<br>[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) |

## Project dev-data-dtl-0-0

| members | roles |
|---|---|
-|gcp-data-analysts<br>group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)<br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)<br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)<br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
+|gcp-data-analysts<br>group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/bigquery.metadataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.metadataViewer)<br>[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)<br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)<br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)<br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
|gcp-data-engineers<br>group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
-|dev-data-load-df-0<br>serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) |
-|dev-data-trf-bq-0<br>serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) |
-|dev-data-trf-df-0<br>serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) |
+|SERVICE_IDENTITY_service-networking<br>serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) +|
+|dev-data-load-df-0<br>serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) |
+|dev-data-trf-bq-0<br>serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)<br>[roles/datacatalog.categoryAdmin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryAdmin) |
+|dev-data-trf-df-0<br>serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) |

## Project dev-data-dtl-1-0

| members | roles |
|---|---|
-|gcp-data-analysts<br>group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)<br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)<br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)<br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
+|gcp-data-analysts<br>group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/bigquery.metadataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.metadataViewer)<br>[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)<br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)<br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)<br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
|gcp-data-engineers<br>group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
-|dev-data-trf-bq-0<br>serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) |
-|dev-data-trf-df-0<br>serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator)<br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
+|SERVICE_IDENTITY_service-networking<br>serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) +|
+|dev-data-load-df-0<br>serviceAccount|[roles/datacatalog.categoryAdmin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryAdmin) |
+|dev-data-trf-bq-0<br>serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) |
+|dev-data-trf-df-0<br>serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)<br>[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator)<br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |

## Project dev-data-dtl-2-0

| members | roles |
|---|---|
-|gcp-data-analysts<br>group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)<br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)<br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)<br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
+|gcp-data-analysts<br>group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/bigquery.metadataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.metadataViewer)<br>[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)<br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)<br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)<br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
|gcp-data-engineers<br>group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
-|dev-data-trf-bq-0<br>serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) |
-|dev-data-trf-df-0<br>serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator)<br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
+|SERVICE_IDENTITY_service-networking<br>serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) +|
+|dev-data-load-df-0<br>serviceAccount|[roles/datacatalog.categoryAdmin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryAdmin) |
+|dev-data-trf-bq-0<br>serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) |
+|dev-data-trf-df-0<br>serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)<br>[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator)<br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |

## Project dev-data-dtl-plg-0

| members | roles |
|---|---|
-|gcp-data-analysts<br>group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)<br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)<br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)<br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
+|gcp-data-analysts<br>group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/bigquery.metadataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.metadataViewer)<br>[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user)<br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)<br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)<br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
|gcp-data-engineers<br>group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
+|SERVICE_IDENTITY_service-networking<br>serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) +|

## Project dev-data-lnd-0

@@ -62,37 +70,40 @@
| members | roles |
|---|---|
|gcp-data-engineers<br>group|[roles/compute.viewer](https://cloud.google.com/iam/docs/understanding-roles#compute.viewer)<br>[roles/dataflow.admin](https://cloud.google.com/iam/docs/understanding-roles#dataflow.admin)<br>[roles/dataflow.developer](https://cloud.google.com/iam/docs/understanding-roles#dataflow.developer)<br>[roles/viewer](https://cloud.google.com/iam/docs/understanding-roles#viewer) |
+|SERVICE_IDENTITY_dataflow-service-producer-prod<br>serviceAccount|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
+|SERVICE_IDENTITY_service-networking<br>serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) +|
|dev-data-load-df-0<br>serviceAccount|[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/dataflow.admin](https://cloud.google.com/iam/docs/understanding-roles#dataflow.admin)<br>[roles/dataflow.worker](https://cloud.google.com/iam/docs/understanding-roles#dataflow.worker)<br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
|dev-data-orc-cmp-0<br>serviceAccount|[roles/dataflow.admin](https://cloud.google.com/iam/docs/understanding-roles#dataflow.admin) |
-|service-426128559612<br>serviceAccount|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |

## Project dev-data-orc-0

| members | roles |
|---|---|
|gcp-data-engineers<br>group|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/cloudbuild.builds.editor](https://cloud.google.com/iam/docs/understanding-roles#cloudbuild.builds.editor)<br>[roles/composer.admin](https://cloud.google.com/iam/docs/understanding-roles#composer.admin)<br>[roles/composer.environmentAndStorageObjectAdmin](https://cloud.google.com/iam/docs/understanding-roles#composer.environmentAndStorageObjectAdmin)<br>[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser)<br>[roles/iap.httpsResourceAccessor](https://cloud.google.com/iam/docs/understanding-roles#iap.httpsResourceAccessor)<br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin)<br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
+|SERVICE_IDENTITY_cloudcomposer-accounts<br>serviceAccount|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
+|SERVICE_IDENTITY_service-networking<br>serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) +|
|dev-data-load-df-0<br>serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor)<br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
|dev-data-orc-cmp-0<br>serviceAccount|[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/composer.worker](https://cloud.google.com/iam/docs/understanding-roles#composer.worker)<br>[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser)<br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
|dev-data-trf-df-0<br>serviceAccount|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) |
-|service-36960036774<br>serviceAccount|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |

## Project dev-data-trf-0

| members | roles |
|---|---|
|gcp-data-engineers<br>group|[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)<br>[roles/dataflow.admin](https://cloud.google.com/iam/docs/understanding-roles#dataflow.admin) |
+|SERVICE_IDENTITY_dataflow-service-producer-prod<br>serviceAccount|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
+|SERVICE_IDENTITY_service-networking<br>serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) +|
|dev-data-orc-cmp-0<br>serviceAccount|[roles/dataflow.admin](https://cloud.google.com/iam/docs/understanding-roles#dataflow.admin) |
|dev-data-trf-bq-0<br>serviceAccount|[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) |
|dev-data-trf-df-0<br>serviceAccount|[roles/dataflow.worker](https://cloud.google.com/iam/docs/understanding-roles#dataflow.worker)<br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
-|service-883871192228<br>serviceAccount|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |

## Project dev-net-spoke-0

| members | roles |
|---|---|
-|36960036774<br>serviceAccount|[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser) +|
+|PROJECT_CLOUD_SERVICES<br>serviceAccount|[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser) +|
+|SERVICE_IDENTITY_cloudcomposer-accounts<br>serviceAccount|[roles/composer.sharedVpcAgent](https://cloud.google.com/iam/docs/understanding-roles#composer.sharedVpcAgent) +|
+|SERVICE_IDENTITY_container-engine-robot<br>serviceAccount|[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser) +<br>[roles/container.hostServiceAgentUser](https://cloud.google.com/iam/docs/understanding-roles#container.hostServiceAgentUser) +|
+|SERVICE_IDENTITY_dataflow-service-producer-prod<br>serviceAccount|[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser) +<br>[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser) +<br>[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser) +<br>[roles/container.hostServiceAgentUser](https://cloud.google.com/iam/docs/understanding-roles#container.hostServiceAgentUser) +|
|dev-data-load-df-0<br>serviceAccount|[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser) +|
|dev-data-trf-df-0<br>serviceAccount|[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser) +|
-|service-36960036774<br>serviceAccount|[roles/composer.sharedVpcAgent](https://cloud.google.com/iam/docs/understanding-roles#composer.sharedVpcAgent) +<br>[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser) +<br>[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser) +<br>[roles/container.hostServiceAgentUser](https://cloud.google.com/iam/docs/understanding-roles#container.hostServiceAgentUser) +<br>[roles/container.hostServiceAgentUser](https://cloud.google.com/iam/docs/understanding-roles#container.hostServiceAgentUser) +|
-|service-426128559612<br>serviceAccount|[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser) +|
-|service-883871192228<br>serviceAccount|[roles/compute.networkUser](https://cloud.google.com/iam/docs/understanding-roles#compute.networkUser) +|

diff --git a/fast/stages/03-data-platform/dev/README.md b/fast/stages/03-data-platform/dev/README.md
index 19adb068..5ecc2ad5 100644
--- a/fast/stages/03-data-platform/dev/README.md
+++ b/fast/stages/03-data-platform/dev/README.md
@@ -50,6 +50,19 @@

 Cloud KMS crypto keys can be configured from the [FAST security stage](../../02-security). To configure the use of Cloud KMS on resources, you have to specify the key id in the `service_encryption_keys` variable. Key locations should match resource locations.

+## Data Catalog
+
+[Data Catalog](https://cloud.google.com/data-catalog) helps you document your data entries at scale. Data Catalog relies on [tags](https://cloud.google.com/data-catalog/docs/tags-and-tag-templates#tags) and [tag templates](https://cloud.google.com/data-catalog/docs/tags-and-tag-templates#tag-templates) to manage metadata for all data entries in a unified and centralized service. To implement [column-level security](https://cloud.google.com/bigquery/docs/column-level-security-intro) on BigQuery, we suggest using `Tags` and `Tag templates`.
+
+The default configuration implements 3 tags:
+ - `3_Confidential`: policy tag for columns that include very sensitive information, such as credit card numbers.
+ - `2_Private`: policy tag for columns that include sensitive personally identifiable information (PII), such as a person's first name.
+ - `1_Sensitive`: policy tag for columns that include data that cannot be made public, such as the credit limit.
+
+Anything that is not tagged is available to all users who have access to the data warehouse.
+
+You can configure tags and the roles associated with them via the `data_catalog_tags` variable. We suggest using the "[Best practices for using policy tags in BigQuery](https://cloud.google.com/bigquery/docs/best-practices-policy-tags)" article as a guide to design your tag structure and access pattern. By default, no group has access to tagged data.
+
 ### VPC-SC

 As is often the case in real-world configurations, [VPC-SC](https://cloud.google.com/vpc-service-controls) is needed to mitigate data exfiltration. VPC-SC can be configured from the [FAST security stage](../../02-security). This step is optional, but highly recommended, and depends on customer policies and security best practices.

@@ -116,6 +129,12 @@
 terraform init
 terraform apply
 ```
+
+## Demo pipeline
+
+The application layer is out of scope for this stage. For demo purposes only, several Cloud Composer DAGs are provided. The demos import data from the `landing` area into the `DataLake L2` dataset using different features.
+
+You can find examples in the [demo](../../../../examples/data-solutions/data-platform-foundations/demo) folder.
+
@@ -132,20 +151,21 @@ terraform apply

| name | description | type | required | default | producer |
|---|---|:---:|:---:|:---:|:---:|
| [billing_account](variables.tf#L17) | Billing account id and organization id ('nnnnnnnn' or null). | object({…}) | ✓ | | 00-globals |
-| [folder_ids](variables.tf#L45) | Folder to be used for the networking resources in folders/nnnn format. | object({…}) | ✓ | | 01-resman |
-| [host_project_ids](variables.tf#L63) | Shared VPC project ids. | object({…}) | ✓ | | 02-networking |
-| [organization](variables.tf#L89) | Organization details. | object({…}) | ✓ | | 00-globals |
-| [prefix](variables.tf#L105) | Unique prefix used for resource names. <br>Not used for projects if 'project_create' is null. | string | ✓ | | 00-globals |
+| [folder_ids](variables.tf#L56) | Folder to be used for the networking resources in folders/nnnn format. | object({…}) | ✓ | | 01-resman |
+| [host_project_ids](variables.tf#L74) | Shared VPC project ids. | object({…}) | ✓ | | 02-networking |
+| [organization](variables.tf#L100) | Organization details. | object({…}) | ✓ | | 00-globals |
+| [prefix](variables.tf#L116) | Unique prefix used for resource names. Not used for projects if 'project_create' is null. | string | ✓ | | 00-globals |
 | [composer_config](variables.tf#L26) | | object({…}) | | {…} | |
-| [data_force_destroy](variables.tf#L39) | Flag to set 'force_destroy' on data services like BigQuery or Cloud Storage. | bool | | false | |
-| [groups](variables.tf#L53) | Groups. | map(string) | | {…} | |
-| [network_config_composer](variables.tf#L71) | Network configurations to use for Composer. | object({…}) | | {…} | |
-| [outputs_location](variables.tf#L99) | Path where providers, tfvars files, and lists for the following stages are written. Leave empty to disable. | string | | null | |
-| [project_services](variables.tf#L111) | List of core services enabled on all projects. | list(string) | | […] | |
-| [region](variables.tf#L122) | Region used for regional resources. | string | | "europe-west1" | |
-| [service_encryption_keys](variables.tf#L128) | Cloud KMS to use to encrypt different services. Key location should match service region. | object({…}) | | null | |
-| [subnet_self_links](variables.tf#L140) | Shared VPC subnet self links. | object({…}) | | null | 02-networking |
-| [vpc_self_links](variables.tf#L149) | Shared VPC self links. | object({…}) | | null | 02-networking |
+| [data_catalog_tags](variables.tf#L39) | List of Data Catalog Policy tags to be created with optional IAM binding configuration in {tag => {ROLE => [MEMBERS]}} format. | map(map(list(string))) | | {…} | |
+| [data_force_destroy](variables.tf#L50) | Flag to set 'force_destroy' on data services like BigQuery or Cloud Storage. | bool | | false | |
+| [groups](variables.tf#L64) | Groups. | map(string) | | {…} | |
+| [network_config_composer](variables.tf#L82) | Network configurations to use for Composer. | object({…}) | | {…} | |
+| [outputs_location](variables.tf#L110) | Path where providers, tfvars files, and lists for the following stages are written. Leave empty to disable. | string | | null | |
+| [project_services](variables.tf#L122) | List of core services enabled on all projects. | list(string) | | […] | |
+| [region](variables.tf#L133) | Region used for regional resources. | string | | "europe-west1" | |
+| [service_encryption_keys](variables.tf#L139) | Cloud KMS to use to encrypt different services. Key location should match service region. | object({…}) | | null | |
+| [subnet_self_links](variables.tf#L151) | Shared VPC subnet self links. | object({…}) | | null | 02-networking |
+| [vpc_self_links](variables.tf#L160) | Shared VPC self links. | object({…}) | | null | 02-networking |

## Outputs
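To show what the policy tags configured through `data_catalog_tags` are ultimately used for: a principal holding `datacatalog.categoryAdmin` can attach a tag to a BigQuery column, after which only `categoryFineGrainedReader` principals can read that column. A hedged sketch using the BigQuery Python client; the project, table, column, and taxonomy ids below are placeholders, not values from this stage:

```python
from google.cloud import bigquery

# Placeholder ids, for illustration only.
TABLE_ID = "dev-data-dtl-0-0.dtl_0.customers"
POLICY_TAG = ("projects/dev-data-cmn-0/locations/europe-west1/"
              "taxonomies/1234567890/policyTags/0987654321")

client = bigquery.Client()
table = client.get_table(TABLE_ID)

# Rebuild the schema, attaching a policy tag (e.g. `2_Private`) to one column.
schema = []
for field in table.schema:
    if field.name == "first_name":
        field = bigquery.SchemaField(
            field.name,
            field.field_type,
            mode=field.mode,
            policy_tags=bigquery.PolicyTagList(names=[POLICY_TAG]),
        )
    schema.append(field)

table.schema = schema
client.update_table(table, ["schema"])  # only the schema is patched
```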

diff --git a/fast/stages/03-data-platform/dev/main.tf b/fast/stages/03-data-platform/dev/main.tf
index c10380da..536e1873 100644
--- a/fast/stages/03-data-platform/dev/main.tf
+++ b/fast/stages/03-data-platform/dev/main.tf
@@ -21,6 +21,7 @@ module "data-platform" {
   billing_account_id = var.billing_account.id
   composer_config    = var.composer_config
   data_force_destroy = var.data_force_destroy
+  data_catalog_tags  = var.data_catalog_tags
   folder_id          = var.folder_ids.data-platform
   groups             = var.groups
   network_config = {

diff --git a/fast/stages/03-data-platform/dev/variables.tf b/fast/stages/03-data-platform/dev/variables.tf
index 1f65cf77..3b1645e4 100644
--- a/fast/stages/03-data-platform/dev/variables.tf
+++ b/fast/stages/03-data-platform/dev/variables.tf
@@ -36,6 +36,17 @@ variable "composer_config" {
   }
 }

+variable "data_catalog_tags" {
+  description = "List of Data Catalog Policy tags to be created with optional IAM binding configuration in {tag => {ROLE => [MEMBERS]}} format."
+  type        = map(map(list(string)))
+  nullable    = false
+  default = {
+    "3_Confidential" = null
+    "2_Private"      = null
+    "1_Sensitive"    = null
+  }
+}
+
 variable "data_force_destroy" {
   description = "Flag to set 'force_destroy' on data services like BigQuery or Cloud Storage."
   type        = bool

diff --git a/tests/examples/data_solutions/data_platform_foundations/test_plan.py b/tests/examples/data_solutions/data_platform_foundations/test_plan.py
index 4857bf9f..e5db6ffc 100644
--- a/tests/examples/data_solutions/data_platform_foundations/test_plan.py
+++ b/tests/examples/data_solutions/data_platform_foundations/test_plan.py
@@ -23,5 +23,5 @@ FIXTURES_DIR = os.path.join(os.path.dirname(__file__), 'fixture')
 def test_resources(e2e_plan_runner):
   "Test that plan works and the number of resources is as expected."
   modules, resources = e2e_plan_runner(FIXTURES_DIR)
-  assert len(modules) == 40
-  assert len(resources) == 296
+  assert len(modules) == 41
+  assert len(resources) == 313

diff --git a/tools/state_iam.py b/tools/state_iam.py
index 7817a7ee..42f9f76e 100755
--- a/tools/state_iam.py
+++ b/tools/state_iam.py
@@ -65,10 +65,22 @@ def get_bindings(resources, prefix=None, folders=None):
     member_type, _, member_id = member.partition(':')
     if member_type == 'user':
       continue
-    member_id = member_id.rpartition('@')[0]
+    member_id, member_domain = member_id.split('@', 1)
+    # Handle the Cloud Services service account
+    if member_domain == 'cloudservices.gserviceaccount.com':
+      member_id = "PROJECT_CLOUD_SERVICES"
+    # Handle Cloud Service Identity service accounts
+    if re.match(r"^service-\d{8}", member_id):
+      member_id = "SERVICE_IDENTITY_" + member_domain.split(".", 1)[0]
+    # Handle the BigQuery Cloud Service Identity service account
+    if re.match(r"^bq-\d{8}", member_id):
+      member_id = "IDENTITY_" + member_domain.split(".", 1)[0]
+      resource_type_output = "Service Identity - " + resource_type
+    else:
+      resource_type_output = resource_type
     if prefix and member_id.startswith(prefix):
       member_id = member_id[len(prefix) + 1:]
-    yield Binding(authoritative, resource_type, resource_id, role,
+    yield Binding(authoritative, resource_type_output, resource_id, role,
                   member_type, member_id, conditions)
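As a quick sanity check, the member normalization added above can be restated standalone; this is an illustrative sketch rather than code from the patch, but its outputs line up with the placeholders now appearing in the regenerated IAM.md tables:

```python
import re

def normalize_member(email):
    """Illustrative restatement of the normalization in get_bindings."""
    member_id, member_domain = email.split('@', 1)
    if member_domain == 'cloudservices.gserviceaccount.com':
        return 'PROJECT_CLOUD_SERVICES'
    if re.match(r'^service-\d{8}', member_id):
        return 'SERVICE_IDENTITY_' + member_domain.split('.', 1)[0]
    if re.match(r'^bq-\d{8}', member_id):
        return 'IDENTITY_' + member_domain.split('.', 1)[0]
    return member_id

# Opaque numeric ids become stable, readable placeholders:
assert normalize_member(
    '36960036774@cloudservices.gserviceaccount.com') == 'PROJECT_CLOUD_SERVICES'
assert normalize_member(
    'service-36960036774@cloudcomposer-accounts.iam.gserviceaccount.com'
) == 'SERVICE_IDENTITY_cloudcomposer-accounts'
```

This is also why the per-environment numeric accounts such as `service-36960036774` disappear from IAM.md: the generated docs no longer change when project numbers do.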