Merge pull request #637 from GoogleCloudPlatform/lcaggio/dp-updatenaming

Data Platform: Update Naming Convention
lcaggio 2022-04-22 11:24:39 +02:00 committed by GitHub
commit f8b675b65d
16 changed files with 12471 additions and 323 deletions

View File

@@ -12,20 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# tfdoc:file:description land project and resources.
+# tfdoc:file:description drop off project and resources.
 locals {
-  land_orch_service_accounts = [
+  drop_orch_service_accounts = [
     module.load-sa-df-0.iam_email, module.orch-sa-cmp-0.iam_email
   ]
 }
-module "land-project" {
+module "drop-project" {
   source          = "../../../modules/project"
   parent          = var.folder_id
   billing_account = var.billing_account_id
   prefix          = var.prefix
-  name            = "lnd${local.project_suffix}"
+  name            = "drp${local.project_suffix}"
   group_iam = {
     (local.groups.data-engineers) = [
       "roles/bigquery.dataEditor",
@@ -34,14 +34,14 @@ module "land-project" {
     ]
   }
   iam = {
-    "roles/bigquery.dataEditor" = [module.land-sa-bq-0.iam_email]
+    "roles/bigquery.dataEditor" = [module.drop-sa-bq-0.iam_email]
     "roles/bigquery.user" = [module.load-sa-df-0.iam_email]
-    "roles/pubsub.publisher" = [module.land-sa-ps-0.iam_email]
+    "roles/pubsub.publisher" = [module.drop-sa-ps-0.iam_email]
     "roles/pubsub.subscriber" = concat(
-      local.land_orch_service_accounts, [module.load-sa-df-0.iam_email]
+      local.drop_orch_service_accounts, [module.load-sa-df-0.iam_email]
     )
     "roles/storage.objectAdmin" = [module.load-sa-df-0.iam_email]
-    "roles/storage.objectCreator" = [module.land-sa-cs-0.iam_email]
+    "roles/storage.objectCreator" = [module.drop-sa-cs-0.iam_email]
     "roles/storage.objectViewer" = [module.orch-sa-cmp-0.iam_email]
     "roles/storage.admin" = [module.load-sa-df-0.iam_email]
   }
@@ -63,12 +63,12 @@ module "land-project" {
 # Cloud Storage
-module "land-sa-cs-0" {
+module "drop-sa-cs-0" {
   source = "../../../modules/iam-service-account"
-  project_id = module.land-project.project_id
+  project_id = module.drop-project.project_id
   prefix = var.prefix
-  name = "lnd-cs-0"
-  display_name = "Data platform GCS landing service account."
+  name = "drp-cs-0"
+  display_name = "Data platform GCS drop off service account."
   iam = {
     "roles/iam.serviceAccountTokenCreator" = [
       local.groups_iam.data-engineers
@@ -76,11 +76,11 @@ module "land-sa-cs-0" {
   }
 }
-module "land-cs-0" {
+module "drop-cs-0" {
   source = "../../../modules/gcs"
-  project_id = module.land-project.project_id
+  project_id = module.drop-project.project_id
   prefix = var.prefix
-  name = "lnd-cs-0"
+  name = "drp-cs-0"
   location = var.location
   storage_class = "MULTI_REGIONAL"
   encryption_key = try(local.service_encryption_keys.storage, null)
@@ -93,12 +93,12 @@ module "land-cs-0" {
 # PubSub
-module "land-sa-ps-0" {
+module "drop-sa-ps-0" {
   source = "../../../modules/iam-service-account"
-  project_id = module.land-project.project_id
+  project_id = module.drop-project.project_id
   prefix = var.prefix
-  name = "lnd-ps-0"
-  display_name = "Data platform PubSub landing service account"
+  name = "drp-ps-0"
+  display_name = "Data platform PubSub drop off service account"
   iam = {
     "roles/iam.serviceAccountTokenCreator" = [
       local.groups_iam.data-engineers
@@ -106,30 +106,30 @@ module "land-sa-ps-0" {
   }
 }
-module "land-ps-0" {
+module "drop-ps-0" {
   source = "../../../modules/pubsub"
-  project_id = module.land-project.project_id
-  name = "${var.prefix}-lnd-ps-0"
+  project_id = module.drop-project.project_id
+  name = "${var.prefix}-drp-ps-0"
   kms_key = try(local.service_encryption_keys.pubsub, null)
 }
 # BigQuery
-module "land-sa-bq-0" {
+module "drop-sa-bq-0" {
   source = "../../../modules/iam-service-account"
-  project_id = module.land-project.project_id
+  project_id = module.drop-project.project_id
   prefix = var.prefix
-  name = "lnd-bq-0"
-  display_name = "Data platform BigQuery landing service account"
+  name = "drp-bq-0"
+  display_name = "Data platform BigQuery drop off service account"
   iam = {
     "roles/iam.serviceAccountTokenCreator" = [local.groups_iam.data-engineers]
   }
 }
-module "land-bq-0" {
+module "drop-bq-0" {
   source = "../../../modules/bigquery-dataset"
-  project_id = module.land-project.project_id
-  id = "${replace(var.prefix, "-", "_")}lnd_bq_0"
+  project_id = module.drop-project.project_id
+  id = "${replace(var.prefix, "-", "_")}drp_bq_0"
   location = var.location
   encryption_key = try(local.service_encryption_keys.bq, null)
 }
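The updated README later in this commit describes the drop off resources as carrying a customizable lifecycle policy, which the hunks above do not show. As a rough sketch only, this is what such a policy can look like on a plain `google_storage_bucket` resource; the bucket name, project and 30-day delete rule are illustrative assumptions, not values taken from this commit, and the repo itself provisions the bucket through its `gcs` module instead:

```hcl
# Illustrative sketch only: a drop off style bucket with a delete-after-N-days
# lifecycle rule, written against the plain Google provider resource rather
# than the repo's gcs module. All names and the 30-day threshold are assumed.
resource "google_storage_bucket" "drp_cs_0_sketch" {
  name          = "example-prefix-drp-cs-0" # hypothetical bucket name
  project       = "example-prefix-drp"      # hypothetical project id
  location      = "EU"                      # use whatever you pass as var.location
  storage_class = "MULTI_REGIONAL"

  lifecycle_rule {
    action {
      type = "Delete" # drop objects once the condition below is met
    }
    condition {
      age = 30 # days to keep dropped-off files; tune to your retention needs
    }
  }
}
```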

View File

@@ -66,39 +66,39 @@ resource "google_composer_environment" "orch-cmp-0" {
     image_version = var.composer_config.airflow_version
     env_variables = merge(
       var.composer_config.env_variables, {
         BQ_LOCATION = var.location
         DATA_CAT_TAGS = try(jsonencode(module.common-datacatalog.tags), "{}")
         DF_KMS_KEY = try(var.service_encryption_keys.dataflow, "")
-        DTL_L0_PRJ = module.lake-0-project.project_id
-        DTL_L0_BQ_DATASET = module.lake-0-bq-0.dataset_id
-        DTL_L0_GCS = module.lake-0-cs-0.url
-        DTL_L1_PRJ = module.lake-1-project.project_id
-        DTL_L1_BQ_DATASET = module.lake-1-bq-0.dataset_id
-        DTL_L1_GCS = module.lake-1-cs-0.url
-        DTL_L2_PRJ = module.lake-2-project.project_id
-        DTL_L2_BQ_DATASET = module.lake-2-bq-0.dataset_id
-        DTL_L2_GCS = module.lake-2-cs-0.url
-        DTL_PLG_PRJ = module.lake-plg-project.project_id
-        DTL_PLG_BQ_DATASET = module.lake-plg-bq-0.dataset_id
-        DTL_PLG_GCS = module.lake-plg-cs-0.url
-        GCP_REGION = var.region
-        LND_PRJ = module.land-project.project_id
-        LND_BQ = module.land-bq-0.dataset_id
-        LND_GCS = module.land-cs-0.url
-        LND_PS = module.land-ps-0.id
+        DRP_PRJ = module.drop-project.project_id
+        DRP_BQ = module.drop-bq-0.dataset_id
+        DRP_GCS = module.drop-cs-0.url
+        DRP_PS = module.drop-ps-0.id
+        DWH_LAND_PRJ = module.dwh-lnd-project.project_id
+        DWH_LAND_BQ_DATASET = module.dwh-lnd-bq-0.dataset_id
+        DWH_LAND_GCS = module.dwh-lnd-cs-0.url
+        DWH_CURATED_PRJ = module.dwh-cur-project.project_id
+        DWH_CURATED_BQ_DATASET = module.dwh-cur-bq-0.dataset_id
+        DWH_CURATED_GCS = module.dwh-cur-cs-0.url
+        DWH_CONFIDENTIAL_PRJ = module.dwh-conf-project.project_id
+        DWH_CONFIDENTIAL_BQ_DATASET = module.dwh-conf-bq-0.dataset_id
+        DWH_CONFIDENTIAL_GCS = module.dwh-conf-cs-0.url
+        DWH_PLG_PRJ = module.dwh-plg-project.project_id
+        DWH_PLG_BQ_DATASET = module.dwh-plg-bq-0.dataset_id
+        DWH_PLG_GCS = module.dwh-plg-cs-0.url
+        GCP_REGION = var.region
         LOD_PRJ = module.load-project.project_id
         LOD_GCS_STAGING = module.load-cs-df-0.url
         LOD_NET_VPC = local.load_vpc
         LOD_NET_SUBNET = local.load_subnet
         LOD_SA_DF = module.load-sa-df-0.email
         ORC_PRJ = module.orch-project.project_id
         ORC_GCS = module.orch-cs-0.url
         TRF_PRJ = module.transf-project.project_id
         TRF_GCS_STAGING = module.transf-cs-df-0.url
         TRF_NET_VPC = local.transf_vpc
         TRF_NET_SUBNET = local.transf_subnet
         TRF_SA_DF = module.transf-sa-df-0.email
         TRF_SA_BQ = module.transf-sa-bq-0.email
       }
     )
   }

View File

@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# tfdoc:file:description Datalake projects.
+# tfdoc:file:description Data Warehouse projects.
 locals {
-  lake_group_iam = {
+  dwh_group_iam = {
     (local.groups.data-engineers) = [
       "roles/bigquery.dataEditor",
       "roles/storage.admin",
@@ -30,7 +30,7 @@ locals {
       "roles/storage.objectViewer",
     ]
   }
-  lake_plg_group_iam = {
+  dwh_plg_group_iam = {
     (local.groups.data-engineers) = [
       "roles/bigquery.dataEditor",
       "roles/storage.admin",
@@ -45,7 +45,7 @@ locals {
       "roles/storage.objectAdmin",
     ]
   }
-  lake_0_iam = {
+  dwh_lnd_iam = {
     "roles/bigquery.dataOwner" = [
       module.load-sa-df-0.iam_email,
       module.transf-sa-df-0.iam_email,
@@ -61,7 +61,7 @@ locals {
       module.load-sa-df-0.iam_email,
     ]
   }
-  lake_iam = {
+  dwh_iam = {
     "roles/bigquery.dataOwner" = [
       module.transf-sa-df-0.iam_email,
       module.transf-sa-bq-0.iam_email,
@@ -79,7 +79,7 @@ locals {
       module.transf-sa-df-0.iam_email,
     ]
  }
-  lake_services = concat(var.project_services, [
+  dwh_services = concat(var.project_services, [
     "bigquery.googleapis.com",
     "bigqueryreservation.googleapis.com",
     "bigquerystorage.googleapis.com",
@@ -95,60 +95,60 @@ locals {
 # Project
-module "lake-0-project" {
+module "dwh-lnd-project" {
   source = "../../../modules/project"
   parent = var.folder_id
   billing_account = var.billing_account_id
   prefix = var.prefix
-  name = "dtl-0${local.project_suffix}"
-  group_iam = local.lake_group_iam
-  iam = local.lake_0_iam
-  services = local.lake_services
+  name = "dwh-lnd${local.project_suffix}"
+  group_iam = local.dwh_group_iam
+  iam = local.dwh_lnd_iam
+  services = local.dwh_services
   service_encryption_key_ids = {
     bq = [try(local.service_encryption_keys.bq, null)]
     storage = [try(local.service_encryption_keys.storage, null)]
   }
 }
-module "lake-1-project" {
+module "dwh-cur-project" {
   source = "../../../modules/project"
   parent = var.folder_id
   billing_account = var.billing_account_id
   prefix = var.prefix
-  name = "dtl-1${local.project_suffix}"
-  group_iam = local.lake_group_iam
-  iam = local.lake_iam
-  services = local.lake_services
+  name = "dwh-cur${local.project_suffix}"
+  group_iam = local.dwh_group_iam
+  iam = local.dwh_iam
+  services = local.dwh_services
   service_encryption_key_ids = {
     bq = [try(local.service_encryption_keys.bq, null)]
     storage = [try(local.service_encryption_keys.storage, null)]
   }
 }
-module "lake-2-project" {
+module "dwh-conf-project" {
   source = "../../../modules/project"
   parent = var.folder_id
   billing_account = var.billing_account_id
   prefix = var.prefix
-  name = "dtl-2${local.project_suffix}"
-  group_iam = local.lake_group_iam
-  iam = local.lake_iam
-  services = local.lake_services
+  name = "dwh-conf${local.project_suffix}"
+  group_iam = local.dwh_group_iam
+  iam = local.dwh_iam
+  services = local.dwh_services
   service_encryption_key_ids = {
     bq = [try(local.service_encryption_keys.bq, null)]
     storage = [try(local.service_encryption_keys.storage, null)]
   }
 }
-module "lake-plg-project" {
+module "dwh-plg-project" {
   source = "../../../modules/project"
   parent = var.folder_id
   billing_account = var.billing_account_id
   prefix = var.prefix
-  name = "dtl-plg${local.project_suffix}"
-  group_iam = local.lake_plg_group_iam
+  name = "dwh-plg${local.project_suffix}"
+  group_iam = local.dwh_plg_group_iam
   iam = {}
-  services = local.lake_services
+  services = local.dwh_services
   service_encryption_key_ids = {
     bq = [try(local.service_encryption_keys.bq, null)]
     storage = [try(local.service_encryption_keys.storage, null)]
@@ -157,78 +157,78 @@ module "lake-plg-project" {
 # Bigquery
-module "lake-0-bq-0" {
+module "dwh-lnd-bq-0" {
   source = "../../../modules/bigquery-dataset"
-  project_id = module.lake-0-project.project_id
-  id = "${replace(var.prefix, "-", "_")}_dtl_0_bq_0"
+  project_id = module.dwh-lnd-project.project_id
+  id = "${replace(var.prefix, "-", "_")}_dwh_lnd_bq_0"
   location = var.location
   encryption_key = try(local.service_encryption_keys.bq, null)
 }
-module "lake-1-bq-0" {
+module "dwh-cur-bq-0" {
   source = "../../../modules/bigquery-dataset"
-  project_id = module.lake-1-project.project_id
-  id = "${replace(var.prefix, "-", "_")}_dtl_1_bq_0"
+  project_id = module.dwh-cur-project.project_id
+  id = "${replace(var.prefix, "-", "_")}_dwh_lnd_bq_0"
   location = var.location
   encryption_key = try(local.service_encryption_keys.bq, null)
 }
-module "lake-2-bq-0" {
+module "dwh-conf-bq-0" {
   source = "../../../modules/bigquery-dataset"
-  project_id = module.lake-2-project.project_id
-  id = "${replace(var.prefix, "-", "_")}_dtl_2_bq_0"
+  project_id = module.dwh-conf-project.project_id
+  id = "${replace(var.prefix, "-", "_")}_dwh_conf_bq_0"
   location = var.location
   encryption_key = try(local.service_encryption_keys.bq, null)
 }
-module "lake-plg-bq-0" {
+module "dwh-plg-bq-0" {
   source = "../../../modules/bigquery-dataset"
-  project_id = module.lake-plg-project.project_id
-  id = "${replace(var.prefix, "-", "_")}_dtl_plg_bq_0"
+  project_id = module.dwh-plg-project.project_id
+  id = "${replace(var.prefix, "-", "_")}_dwh_plg_bq_0"
   location = var.location
   encryption_key = try(local.service_encryption_keys.bq, null)
 }
 # Cloud storage
-module "lake-0-cs-0" {
+module "dwh-lnd-cs-0" {
   source = "../../../modules/gcs"
-  project_id = module.lake-0-project.project_id
+  project_id = module.dwh-lnd-project.project_id
   prefix = var.prefix
-  name = "dtl-0-cs-0"
+  name = "dwh-lnd-cs-0"
   location = var.location
   storage_class = "MULTI_REGIONAL"
   encryption_key = try(local.service_encryption_keys.storage, null)
   force_destroy = var.data_force_destroy
 }
-module "lake-1-cs-0" {
+module "dwh-cur-cs-0" {
   source = "../../../modules/gcs"
-  project_id = module.lake-1-project.project_id
+  project_id = module.dwh-cur-project.project_id
   prefix = var.prefix
-  name = "dtl-1-cs-0"
+  name = "dwh-cur-cs-0"
   location = var.location
   storage_class = "MULTI_REGIONAL"
   encryption_key = try(local.service_encryption_keys.storage, null)
   force_destroy = var.data_force_destroy
 }
-module "lake-2-cs-0" {
+module "dwh-conf-cs-0" {
   source = "../../../modules/gcs"
-  project_id = module.lake-2-project.project_id
+  project_id = module.dwh-conf-project.project_id
   prefix = var.prefix
-  name = "dtl-2-cs-0"
+  name = "dwh-conf-cs-0"
   location = var.location
   storage_class = "MULTI_REGIONAL"
   encryption_key = try(local.service_encryption_keys.storage, null)
   force_destroy = var.data_force_destroy
 }
-module "lake-plg-cs-0" {
+module "dwh-plg-cs-0" {
   source = "../../../modules/gcs"
-  project_id = module.lake-plg-project.project_id
+  project_id = module.dwh-plg-project.project_id
   prefix = var.prefix
-  name = "dtl-plg-cs-0"
+  name = "dwh-plg-cs-0"
   location = var.location
   storage_class = "MULTI_REGIONAL"
   encryption_key = try(local.service_encryption_keys.storage, null)

View File

@@ -13,7 +13,40 @@ Legend: <code>+</code> additive, <code>•</code> conditional.
 |<b>trf-bq-0</b><br><small><i>serviceAccount</i></small>|[roles/datacatalog.categoryFineGrainedReader](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryFineGrainedReader) <br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) |
 |<b>trf-df-0</b><br><small><i>serviceAccount</i></small>|[roles/datacatalog.categoryFineGrainedReader](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryFineGrainedReader) <br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) <br>[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) |
-## Project <i>dtl-0</i>
+## Project <i>drp</i>
+| members | roles |
+|---|---|
+|<b>gcp-data-engineers</b><br><small><i>group</i></small>|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) <br>[roles/pubsub.editor](https://cloud.google.com/iam/docs/understanding-roles#pubsub.editor) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
+|<b>drp-bq-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) |
+|<b>drp-cs-0</b><br><small><i>serviceAccount</i></small>|[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) |
+|<b>drp-ps-0</b><br><small><i>serviceAccount</i></small>|[roles/pubsub.publisher](https://cloud.google.com/iam/docs/understanding-roles#pubsub.publisher) |
+|<b>load-df-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user) <br>[roles/pubsub.subscriber](https://cloud.google.com/iam/docs/understanding-roles#pubsub.subscriber) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) <br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
+|<b>orc-cmp-0</b><br><small><i>serviceAccount</i></small>|[roles/pubsub.subscriber](https://cloud.google.com/iam/docs/understanding-roles#pubsub.subscriber) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
+## Project <i>dwh-conf</i>
+| members | roles |
+|---|---|
+|<b>gcp-data-analysts</b><br><small><i>group</i></small>|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/bigquery.metadataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.metadataViewer) <br>[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user) <br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer) <br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
+|<b>gcp-data-engineers</b><br><small><i>group</i></small>|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
+|<b>SERVICE_IDENTITY_service-networking</b><br><small><i>serviceAccount</i></small>|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) <code>+</code>|
+|<b>load-df-0</b><br><small><i>serviceAccount</i></small>|[roles/datacatalog.categoryAdmin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryAdmin) |
+|<b>trf-bq-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) |
+|<b>trf-df-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) <br>[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
+## Project <i>dwh-cur</i>
+| members | roles |
+|---|---|
+|<b>gcp-data-analysts</b><br><small><i>group</i></small>|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/bigquery.metadataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.metadataViewer) <br>[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user) <br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer) <br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
+|<b>gcp-data-engineers</b><br><small><i>group</i></small>|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
+|<b>SERVICE_IDENTITY_service-networking</b><br><small><i>serviceAccount</i></small>|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) <code>+</code>|
+|<b>load-df-0</b><br><small><i>serviceAccount</i></small>|[roles/datacatalog.categoryAdmin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryAdmin) |
+|<b>trf-bq-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) |
+|<b>trf-df-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) <br>[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
+## Project <i>dwh-lnd</i>
 | members | roles |
 |---|---|
@@ -24,29 +57,7 @@ Legend: <code>+</code> additive, <code>•</code> conditional.
 |<b>trf-bq-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) <br>[roles/datacatalog.categoryAdmin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryAdmin) |
 |<b>trf-df-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) |
-## Project <i>dtl-1</i>
-| members | roles |
-|---|---|
-|<b>gcp-data-analysts</b><br><small><i>group</i></small>|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/bigquery.metadataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.metadataViewer) <br>[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user) <br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer) <br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
-|<b>gcp-data-engineers</b><br><small><i>group</i></small>|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
-|<b>SERVICE_IDENTITY_service-networking</b><br><small><i>serviceAccount</i></small>|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) <code>+</code>|
-|<b>load-df-0</b><br><small><i>serviceAccount</i></small>|[roles/datacatalog.categoryAdmin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryAdmin) |
-|<b>trf-bq-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) |
-|<b>trf-df-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) <br>[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
-## Project <i>dtl-2</i>
-| members | roles |
-|---|---|
-|<b>gcp-data-analysts</b><br><small><i>group</i></small>|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/bigquery.metadataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.metadataViewer) <br>[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user) <br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer) <br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
-|<b>gcp-data-engineers</b><br><small><i>group</i></small>|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
-|<b>SERVICE_IDENTITY_service-networking</b><br><small><i>serviceAccount</i></small>|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) <code>+</code>|
-|<b>load-df-0</b><br><small><i>serviceAccount</i></small>|[roles/datacatalog.categoryAdmin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryAdmin) |
-|<b>trf-bq-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) |
-|<b>trf-df-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) <br>[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
-## Project <i>dtl-plg</i>
+## Project <i>dwh-plg</i>
 | members | roles |
 |---|---|
@@ -54,17 +65,6 @@ Legend: <code>+</code> additive, <code>•</code> conditional.
 |<b>gcp-data-engineers</b><br><small><i>group</i></small>|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
 |<b>SERVICE_IDENTITY_service-networking</b><br><small><i>serviceAccount</i></small>|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) <code>+</code>|
-## Project <i>lnd</i>
-| members | roles |
-|---|---|
-|<b>gcp-data-engineers</b><br><small><i>group</i></small>|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) <br>[roles/pubsub.editor](https://cloud.google.com/iam/docs/understanding-roles#pubsub.editor) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
-|<b>lnd-bq-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) |
-|<b>lnd-cs-0</b><br><small><i>serviceAccount</i></small>|[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) |
-|<b>lnd-ps-0</b><br><small><i>serviceAccount</i></small>|[roles/pubsub.publisher](https://cloud.google.com/iam/docs/understanding-roles#pubsub.publisher) |
-|<b>load-df-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.user](https://cloud.google.com/iam/docs/understanding-roles#bigquery.user) <br>[roles/pubsub.subscriber](https://cloud.google.com/iam/docs/understanding-roles#pubsub.subscriber) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) <br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
-|<b>orc-cmp-0</b><br><small><i>serviceAccount</i></small>|[roles/pubsub.subscriber](https://cloud.google.com/iam/docs/understanding-roles#pubsub.subscriber) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
 ## Project <i>lod</i>
 | members | roles |

View File

@@ -27,9 +27,9 @@ The code in this example doesn't address Organization-level configurations (Orga
 The Data Platform is designed to rely on several projects, one project per data stage. The stages identified are:
-- landing
+- drop off
 - load
-- data lake
+- data warehouse
 - orchestration
 - transformation
 - exposure
@@ -38,15 +38,15 @@ This separation into projects allows adhering to the least-privilege principle b
 The script will create the following projects:
-- **Landing** Used to store temporary data. Data is pushed to Cloud Storage, BigQuery, or Cloud PubSub. Resources are configured with a customizable lifecycle policy.
-- **Load** Used to load data from landing to data lake. The load is made with minimal to zero transformation logic (mainly `cast`). Anonymization or tokenization of Personally Identifiable Information (PII) can be implemented here or in the transformation stage, depending on your requirements. The use of [Cloud Dataflow templates](https://cloud.google.com/dataflow/docs/concepts/dataflow-templates) is recommended.
-- **Data Lake** Several projects distributed across 3 separate layers, to host progressively processed and refined data:
-  - **L0 - Raw data** Structured Data, stored in relevant formats: structured data stored in BigQuery, unstructured data stored on Cloud Storage with additional metadata stored in BigQuery (for example pictures stored in Cloud Storage and analysis of the images for Cloud Vision API stored in BigQuery).
-  - **L1 - Cleansed, aggregated and standardized data**
-  - **L2 - Curated layer**
-  - **Playground** Temporary tables that Data Analyst may use to perform R&D on data available in other Data Lake layers.
+- **Drop off** Used to store temporary data. Data is pushed to Cloud Storage, BigQuery, or Cloud PubSub. Resources are configured with a customizable lifecycle policy.
+- **Load** Used to load data from the drop off zone to the data warehouse. The load is made with minimal to zero transformation logic (mainly `cast`). Anonymization or tokenization of Personally Identifiable Information (PII) can be implemented here or in the transformation stage, depending on your requirements. The use of [Cloud Dataflow templates](https://cloud.google.com/dataflow/docs/concepts/dataflow-templates) is recommended.
+- **Data Warehouse** Several projects distributed across 3 separate layers, to host progressively processed and refined data:
+  - **Landing - Raw data** Structured data, stored in relevant formats: structured data stored in BigQuery, unstructured data stored on Cloud Storage with additional metadata stored in BigQuery (for example, pictures stored in Cloud Storage and the corresponding Cloud Vision API analysis stored in BigQuery).
+  - **Curated - Cleansed, aggregated and curated data**
+  - **Confidential - Curated and unencrypted layer**
+  - **Playground** Temporary tables that Data Analysts may use to perform R&D on data available in other Data Warehouse layers.
 - **Orchestration** Used to host Cloud Composer, which orchestrates all tasks that move data across layers.
-- **Transformation** Used to move data between Data Lake layers. We strongly suggest relying on BigQuery Engine to perform the transformations. If BigQuery doesn't have the features needed to perform your transformations, you can use Cloud Dataflow with [Cloud Dataflow templates](https://cloud.google.com/dataflow/docs/concepts/dataflow-templates). This stage can also optionally anonymize or tokenize PII.
+- **Transformation** Used to move data between Data Warehouse layers. We strongly suggest relying on BigQuery Engine to perform the transformations. If BigQuery doesn't have the features needed to perform your transformations, you can use Cloud Dataflow with [Cloud Dataflow templates](https://cloud.google.com/dataflow/docs/concepts/dataflow-templates). This stage can also optionally anonymize or tokenize PII.
 - **Exposure** Used to host resources that share processed data with external systems. Depending on the access pattern, data can be presented via Cloud SQL, BigQuery, or Bigtable. For BigQuery data, we strongly suggest relying on [Authorized views](https://cloud.google.com/bigquery/docs/authorized-views).
 ### Roles
@@ -57,9 +57,9 @@ We assign roles on resources at the project level, granting the appropriate role
 Service account creation follows the least privilege principle, with each service account performing a single task that requires access to a defined set of resources. The table below shows a high-level overview of roles for each service account on each data layer, using `READ` or `WRITE` access patterns for simplicity. For detailed roles please refer to the code.
-|Service Account|Landing|DataLake L0|DataLake L1|DataLake L2|
+|Service Account|Drop off|DWH Landing|DWH Curated|DWH Confidential|
 |-|:-:|:-:|:-:|:-:|
-|`landing-sa`|`WRITE`|-|-|-|
+|`drop-sa`|`WRITE`|-|-|-|
 |`load-sa`|`READ`|`READ`/`WRITE`|-|-|
 |`transformation-sa`|-|`READ`/`WRITE`|`READ`/`WRITE`|`READ`/`WRITE`|
 |`orchestration-sa`|-|-|-|-|
@@ -75,12 +75,12 @@ User groups provide a stable frame of reference that allows decoupling the final
 We use three groups to control access to resources:
 - *Data Engineers*. They handle and run the Data Hub, with read access to all resources in order to troubleshoot possible issues with pipelines. This team can also impersonate any service account.
-- *Data Analysts*. They perform analysis on datasets, with read access to the data lake L2 project, and BigQuery READ/WRITE access to the playground project.
+- *Data Analysts*. They perform analysis on datasets, with read access to the Data Warehouse Confidential project, and BigQuery READ/WRITE access to the playground project.
 - *Data Security*. They handle security configurations related to the Data Hub. This team has admin access to the common project to configure Cloud DLP templates or Data Catalog policy tags.
 The table below shows a high-level overview of roles for each group on each project, using `READ`, `WRITE` and `ADMIN` access patterns for simplicity. For detailed roles please refer to the code.
-|Group|Landing|Load|Transformation|Data Lake L0|Data Lake L1|Data Lake L2|Data Lake Playground|Orchestration|Common|
+|Group|Drop off|Load|Transformation|DWH Landing|DWH Curated|DWH Confidential|DWH Playground|Orchestration|Common|
 |-|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
 |Data Engineers|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|
 |Data Analysts|-|-|-|-|-|`READ`|`READ`/`WRITE`|-|-|
@@ -215,12 +215,12 @@ To create Cloud Key Management keys in the Data Platform you can uncomment the C
 ### Assign roles at BQ Dataset level
-To handle multiple groups of `data-analysts` accessing the same Data Lake layer projects but only to the dataset belonging to a specific group, you may want to assign roles at BigQuery dataset level instead of at project-level.
+To handle multiple groups of `data-analysts` that need access to the same Data Warehouse layer projects, but only to the datasets belonging to their specific group, you may want to assign roles at the BigQuery dataset level instead of at the project level.
 To do this, you need to remove the project-level IAM binding for the `data-analysts` group and grant roles at the BigQuery dataset level using the `iam` variable on the `bigquery-dataset` modules.
 ## Demo pipeline
-The application layer is out of scope of this script. As a demo purpuse only, several Cloud Composer DAGs are provided. Demos will import data from the `landing` area to the `DataLake L2` dataset suing different features.
+The application layer is out of scope of this script. For demo purposes only, several Cloud Composer DAGs are provided. Demos will import data from the `drop off` area to the `Data Warehouse Confidential` dataset using different features.
 You can find examples in the [demo](./demo) folder.
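A minimal sketch of the dataset-level option described above, assuming the `iam` variable exposed by the `bigquery-dataset` module accepts a role-to-members map and that a `groups_iam.data-analysts` local exists alongside the `data-engineers` one used elsewhere in this commit; the role shown is illustrative:

```hcl
# Sketch only: grant Data Analysts access on a single dataset instead of the
# whole dwh-conf project. Remove the project-level binding for the group first.
module "dwh-conf-bq-0" {
  source         = "../../../modules/bigquery-dataset"
  project_id     = module.dwh-conf-project.project_id
  id             = "${replace(var.prefix, "-", "_")}_dwh_conf_bq_0"
  location       = var.location
  encryption_key = try(local.service_encryption_keys.bq, null)
  iam = {
    # assumed local, analogous to local.groups_iam.data-engineers used above
    "roles/bigquery.dataViewer" = [local.groups_iam.data-analysts]
  }
}
```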

View File

@@ -8,7 +8,7 @@ The example is not intended to be a production-ready code.
 The demo imports purchase data generated by a store.
 ## Input files
-Data are uploaded to the `landing` GCS bucket. File structure:
+Data are uploaded to the `drop off` GCS bucket. File structure:
 - `customers.csv`: Comma-separated values with customer information in the following format: Customer ID, Name, Surname, Registration Timestamp
 - `purchases.csv`: Comma-separated values with purchase information in the following format: Item ID, Customer ID, Item, Item price, Purchase Timestamp
@@ -16,14 +16,14 @@ Data are uploaded to the `landing` GCS bucket. File structure:
 Different data pipelines are provided to highlight different features and patterns. For the purpose of the example, a single pipeline handles all data lifecycles. When adapting them to your real use case, you may want to evaluate the option to handle each functional step in a separate pipeline or with a dedicated tool. For example, you may want to use `Dataform` to handle the data schema lifecycle.
 Below you can find a description of each example:
-- Simple import data: [`datapipeline.py`](./datapipeline.py) is a simple pipeline to import provided data from the `landing` Google Cloud Storage bucket to the Data Hub L2 layer joining `customers` and `purchases` tables into `customerpurchase` table.
-- Import data with Policy Tags: [`datapipeline_dc_tags.py`](./datapipeline.py) imports provided data from `landing` bucket to the Data Hub L2 layer protecting sensitive data using Data Catalog policy Tags.
+- Simple import data: [`datapipeline.py`](./datapipeline.py) is a simple pipeline that imports the provided data from the `drop off` Google Cloud Storage bucket to the Data Hub Confidential layer, joining the `customers` and `purchases` tables into the `customerpurchase` table.
+- Import data with Policy Tags: [`datapipeline_dc_tags.py`](./datapipeline.py) imports the provided data from the `drop off` bucket to the Data Hub Confidential layer, protecting sensitive data using Data Catalog policy tags.
 - Delete tables: [`delete_table.py`](./delete_table.py) deletes BigQuery tables created by import pipelines.
 ## Running the demo
 To run the demo examples, follow these steps:
-- 01: copy sample data to the `landing` Cloud Storage bucket impersonating the `load` service account.
+- 01: copy sample data to the `drop off` Cloud Storage bucket impersonating the `load` service account.
 - 02: copy the sample data structure definitions to the `orchestration` Cloud Storage bucket impersonating the `orchestration` service account.
 - 03: copy the Cloud Composer DAG to the Cloud Composer Storage bucket impersonating the `orchestration` service account.
 - 04: open the Cloud Composer Airflow UI and run the imported DAG.

View File

@@ -34,23 +34,23 @@ from airflow.utils.task_group import TaskGroup
 # --------------------------------------------------------------------------------
 BQ_LOCATION = os.environ.get("BQ_LOCATION")
 DATA_CAT_TAGS = json.loads(os.environ.get("DATA_CAT_TAGS"))
-DTL_L0_PRJ = os.environ.get("DTL_L0_PRJ")
-DTL_L0_BQ_DATASET = os.environ.get("DTL_L0_BQ_DATASET")
-DTL_L0_GCS = os.environ.get("DTL_L0_GCS")
-DTL_L1_PRJ = os.environ.get("DTL_L1_PRJ")
-DTL_L1_BQ_DATASET = os.environ.get("DTL_L1_BQ_DATASET")
-DTL_L1_GCS = os.environ.get("DTL_L1_GCS")
-DTL_L2_PRJ = os.environ.get("DTL_L2_PRJ")
-DTL_L2_BQ_DATASET = os.environ.get("DTL_L2_BQ_DATASET")
-DTL_L2_GCS = os.environ.get("DTL_L2_GCS")
-DTL_PLG_PRJ = os.environ.get("DTL_PLG_PRJ")
-DTL_PLG_BQ_DATASET = os.environ.get("DTL_PLG_BQ_DATASET")
-DTL_PLG_GCS = os.environ.get("DTL_PLG_GCS")
+DWH_LAND_PRJ = os.environ.get("DWH_LAND_PRJ")
+DWH_LAND_BQ_DATASET = os.environ.get("DWH_LAND_BQ_DATASET")
+DWH_LAND_GCS = os.environ.get("DWH_LAND_GCS")
+DWH_CURATED_PRJ = os.environ.get("DWH_CURATED_PRJ")
+DWH_CURATED_BQ_DATASET = os.environ.get("DWH_CURATED_BQ_DATASET")
+DWH_CURATED_GCS = os.environ.get("DWH_CURATED_GCS")
+DWH_CONFIDENTIAL_PRJ = os.environ.get("DWH_CONFIDENTIAL_PRJ")
+DWH_CONFIDENTIAL_BQ_DATASET = os.environ.get("DWH_CONFIDENTIAL_BQ_DATASET")
+DWH_CONFIDENTIAL_GCS = os.environ.get("DWH_CONFIDENTIAL_GCS")
+DWH_PLG_PRJ = os.environ.get("DWH_PLG_PRJ")
+DWH_PLG_BQ_DATASET = os.environ.get("DWH_PLG_BQ_DATASET")
+DWH_PLG_GCS = os.environ.get("DWH_PLG_GCS")
 GCP_REGION = os.environ.get("GCP_REGION")
-LND_PRJ = os.environ.get("LND_PRJ")
-LND_BQ = os.environ.get("LND_BQ")
-LND_GCS = os.environ.get("LND_GCS")
-LND_PS = os.environ.get("LND_PS")
+DRP_PRJ = os.environ.get("DRP_PRJ")
+DRP_BQ = os.environ.get("DRP_BQ")
+DRP_GCS = os.environ.get("DRP_GCS")
+DRP_PS = os.environ.get("DRP_PS")
 LOD_PRJ = os.environ.get("LOD_PRJ")
 LOD_GCS_STAGING = os.environ.get("LOD_GCS_STAGING")
 LOD_NET_VPC = os.environ.get("LOD_NET_VPC")
@@ -127,8 +127,8 @@ with models.DAG(
             "javascriptTextTransformFunctionName": "transform",
             "JSONPath": ORC_GCS + "/customers_schema.json",
             "javascriptTextTransformGcsPath": ORC_GCS + "/customers_udf.js",
-            "inputFilePattern": LND_GCS + "/customers.csv",
-            "outputTable": DTL_L0_PRJ + ":"+DTL_L0_BQ_DATASET+".customers",
+            "inputFilePattern": DRP_GCS + "/customers.csv",
+            "outputTable": DWH_LAND_PRJ + ":" + DWH_LAND_BQ_DATASET + ".customers",
            "bigQueryLoadingTemporaryDirectory": LOD_GCS_STAGING + "/tmp/bq/",
         },
     )
@@ -142,8 +142,8 @@ with models.DAG(
             "javascriptTextTransformFunctionName": "transform",
             "JSONPath": ORC_GCS + "/purchases_schema.json",
             "javascriptTextTransformGcsPath": ORC_GCS + "/purchases_udf.js",
-            "inputFilePattern": LND_GCS + "/purchases.csv",
-            "outputTable": DTL_L0_PRJ + ":"+DTL_L0_BQ_DATASET+".purchases",
+            "inputFilePattern": DRP_GCS + "/purchases.csv",
+            "outputTable": DWH_LAND_PRJ + ":" + DWH_LAND_BQ_DATASET + ".purchases",
             "bigQueryLoadingTemporaryDirectory": LOD_GCS_STAGING + "/tmp/bq/",
         },
     )
@@ -159,17 +159,15 @@ with models.DAG(
             'query':"""SELECT
                       c.id as customer_id,
                       p.id as purchase_id,
-                      c.name as name,
-                      c.surname as surname,
                       p.item as item,
                       p.price as price,
                       p.timestamp as timestamp
-                    FROM `{dtl_0_prj}.{dtl_0_dataset}.customers` c
-                    JOIN `{dtl_0_prj}.{dtl_0_dataset}.purchases` p ON c.id = p.customer_id
-                    """.format(dtl_0_prj=DTL_L0_PRJ, dtl_0_dataset=DTL_L0_BQ_DATASET, ),
+                    FROM `{dwh_0_prj}.{dwh_0_dataset}.customers` c
+                    JOIN `{dwh_0_prj}.{dwh_0_dataset}.purchases` p ON c.id = p.customer_id
+                    """.format(dwh_0_prj=DWH_LAND_PRJ, dwh_0_dataset=DWH_LAND_BQ_DATASET, ),
             'destinationTable':{
-                'projectId': DTL_L1_PRJ,
-                'datasetId': DTL_L1_BQ_DATASET,
+                'projectId': DWH_CURATED_PRJ,
+                'datasetId': DWH_CURATED_BQ_DATASET,
                 'tableId': 'customer_purchase'
             },
             'writeDisposition':'WRITE_TRUNCATE',
@@ -179,8 +177,8 @@ with models.DAG(
         impersonation_chain=[TRF_SA_BQ]
     )
-    l2_customer_purchase = BigQueryInsertJobOperator(
-        task_id='bq_l2_customer_purchase',
+    confidential_customer_purchase = BigQueryInsertJobOperator(
+        task_id='bq_confidential_customer_purchase',
         gcp_conn_id='bigquery_default',
         project_id=TRF_PRJ,
         location=BQ_LOCATION,
@@ -188,18 +186,19 @@ with models.DAG(
             'jobType':'QUERY',
             'query':{
                 'query':"""SELECT
-                          customer_id,
-                          purchase_id,
-                          name,
-                          surname,
-                          item,
-                          price,
-                          timestamp
-                        FROM `{dtl_1_prj}.{dtl_1_dataset}.customer_purchase`
-                        """.format(dtl_1_prj=DTL_L1_PRJ, dtl_1_dataset=DTL_L1_BQ_DATASET, ),
+                          c.id as customer_id,
+                          p.id as purchase_id,
+                          c.name as name,
+                          c.surname as surname,
+                          p.item as item,
+                          p.price as price,
+                          p.timestamp as timestamp
+                        FROM `{dwh_0_prj}.{dwh_0_dataset}.customers` c
+                        JOIN `{dwh_0_prj}.{dwh_0_dataset}.purchases` p ON c.id = p.customer_id
+                        """.format(dwh_0_prj=DWH_LAND_PRJ, dwh_0_dataset=DWH_LAND_BQ_DATASET, ),
            'destinationTable':{
-                'projectId': DTL_L2_PRJ,
-                'datasetId': DTL_L2_BQ_DATASET,
+                'projectId': DWH_CONFIDENTIAL_PRJ,
+                'datasetId': DWH_CONFIDENTIAL_BQ_DATASET,
                 'tableId': 'customer_purchase'
             },
             'writeDisposition':'WRITE_TRUNCATE',
@@ -209,4 +208,4 @@ with models.DAG(
         impersonation_chain=[TRF_SA_BQ]
     )
-start >> [customers_import, purchases_import] >> join_customer_purchase >> l2_customer_purchase >> end
+start >> [customers_import, purchases_import] >> join_customer_purchase >> confidential_customer_purchase >> end

View File

@@ -34,23 +34,23 @@ from airflow.utils.task_group import TaskGroup
 # --------------------------------------------------------------------------------
 BQ_LOCATION = os.environ.get("BQ_LOCATION")
 DATA_CAT_TAGS = json.loads(os.environ.get("DATA_CAT_TAGS"))
-DTL_L0_PRJ = os.environ.get("DTL_L0_PRJ")
-DTL_L0_BQ_DATASET = os.environ.get("DTL_L0_BQ_DATASET")
-DTL_L0_GCS = os.environ.get("DTL_L0_GCS")
-DTL_L1_PRJ = os.environ.get("DTL_L1_PRJ")
-DTL_L1_BQ_DATASET = os.environ.get("DTL_L1_BQ_DATASET")
-DTL_L1_GCS = os.environ.get("DTL_L1_GCS")
-DTL_L2_PRJ = os.environ.get("DTL_L2_PRJ")
-DTL_L2_BQ_DATASET = os.environ.get("DTL_L2_BQ_DATASET")
-DTL_L2_GCS = os.environ.get("DTL_L2_GCS")
-DTL_PLG_PRJ = os.environ.get("DTL_PLG_PRJ")
-DTL_PLG_BQ_DATASET = os.environ.get("DTL_PLG_BQ_DATASET")
-DTL_PLG_GCS = os.environ.get("DTL_PLG_GCS")
+DWH_LAND_PRJ = os.environ.get("DWH_LAND_PRJ")
+DWH_LAND_BQ_DATASET = os.environ.get("DWH_LAND_BQ_DATASET")
+DWH_LAND_GCS = os.environ.get("DWH_LAND_GCS")
+DWH_CURATED_PRJ = os.environ.get("DWH_CURATED_PRJ")
+DWH_CURATED_BQ_DATASET = os.environ.get("DWH_CURATED_BQ_DATASET")
+DWH_CURATED_GCS = os.environ.get("DWH_CURATED_GCS")
+DWH_CONFIDENTIAL_PRJ = os.environ.get("DWH_CONFIDENTIAL_PRJ")
+DWH_CONFIDENTIAL_BQ_DATASET = os.environ.get("DWH_CONFIDENTIAL_BQ_DATASET")
+DWH_CONFIDENTIAL_GCS = os.environ.get("DWH_CONFIDENTIAL_GCS")
+DWH_PLG_PRJ = os.environ.get("DWH_PLG_PRJ")
+DWH_PLG_BQ_DATASET = os.environ.get("DWH_PLG_BQ_DATASET")
+DWH_PLG_GCS = os.environ.get("DWH_PLG_GCS")
 GCP_REGION = os.environ.get("GCP_REGION")
-LND_PRJ = os.environ.get("LND_PRJ")
-LND_BQ = os.environ.get("LND_BQ")
-LND_GCS = os.environ.get("LND_GCS")
-LND_PS = os.environ.get("LND_PS")
+DRP_PRJ = os.environ.get("DRP_PRJ")
+DRP_BQ = os.environ.get("DRP_BQ")
+DRP_GCS = os.environ.get("DRP_GCS")
+DRP_PS = os.environ.get("DRP_PS")
 LOD_PRJ = os.environ.get("LOD_PRJ")
 LOD_GCS_STAGING = os.environ.get("LOD_GCS_STAGING")
 LOD_NET_VPC = os.environ.get("LOD_NET_VPC")
@@ -121,8 +121,8 @@ with models.DAG(
     with TaskGroup('upsert_table') as upsert_table:
         upsert_table_customers = BigQueryUpsertTableOperator(
             task_id="upsert_table_customers",
-            project_id=DTL_L0_PRJ,
-            dataset_id=DTL_L0_BQ_DATASET,
+            project_id=DWH_LAND_PRJ,
+            dataset_id=DWH_LAND_BQ_DATASET,
             impersonation_chain=[TRF_SA_DF],
             table_resource={
                 "tableReference": {"tableId": "customers"},
@@ -131,28 +131,28 @@ with models.DAG(
         upsert_table_purchases = BigQueryUpsertTableOperator(
             task_id="upsert_table_purchases",
-            project_id=DTL_L0_PRJ,
-            dataset_id=DTL_L0_BQ_DATASET,
+            project_id=DWH_LAND_PRJ,
+            dataset_id=DWH_LAND_BQ_DATASET,
             impersonation_chain=[TRF_SA_BQ],
             table_resource={
                 "tableReference": {"tableId": "purchases"}
             },
         )
-        upsert_table_customer_purchase_l1 = BigQueryUpsertTableOperator(
-            task_id="upsert_table_customer_purchase_l1",
-            project_id=DTL_L1_PRJ,
-            dataset_id=DTL_L1_BQ_DATASET,
+        upsert_table_customer_purchase_curated = BigQueryUpsertTableOperator(
+            task_id="upsert_table_customer_purchase_curated",
+            project_id=DWH_CURATED_PRJ,
+            dataset_id=DWH_CURATED_BQ_DATASET,
             impersonation_chain=[TRF_SA_BQ],
             table_resource={
                 "tableReference": {"tableId": "customer_purchase"}
             },
         )
-        upsert_table_customer_purchase_l2 = BigQueryUpsertTableOperator(
-            task_id="upsert_table_customer_purchase_l2",
-            project_id=DTL_L2_PRJ,
-            dataset_id=DTL_L2_BQ_DATASET,
+        upsert_table_customer_purchase_confidential = BigQueryUpsertTableOperator(
+            task_id="upsert_table_customer_purchase_confidential",
+            project_id=DWH_CONFIDENTIAL_PRJ,
+            dataset_id=DWH_CONFIDENTIAL_BQ_DATASET,
             impersonation_chain=[TRF_SA_BQ],
             table_resource={
                 "tableReference": {"tableId": "customer_purchase"}
@@ -164,8 +164,8 @@ with models.DAG(
     with TaskGroup('update_schema_table') as update_schema_table:
         update_table_schema_customers = BigQueryUpdateTableSchemaOperator(
            task_id="update_table_schema_customers",
-            project_id=DTL_L0_PRJ,
-            dataset_id=DTL_L0_BQ_DATASET,
+            project_id=DWH_LAND_PRJ,
+            dataset_id=DWH_LAND_BQ_DATASET,
             table_id="customers",
             impersonation_chain=[TRF_SA_BQ],
             include_policy_tags=True,
@@ -179,8 +179,8 @@ with models.DAG(
         update_table_schema_customers = BigQueryUpdateTableSchemaOperator(
task_id="update_table_schema_purchases", task_id="update_table_schema_purchases",
project_id=DTL_L0_PRJ, project_id=DWH_LAND_PRJ,
dataset_id=DTL_L0_BQ_DATASET, dataset_id=DWH_LAND_BQ_DATASET,
table_id="purchases", table_id="purchases",
impersonation_chain=[TRF_SA_BQ], impersonation_chain=[TRF_SA_BQ],
include_policy_tags=True, include_policy_tags=True,
@ -193,10 +193,10 @@ with models.DAG(
] ]
) )
update_table_schema_customer_purchase_l1 = BigQueryUpdateTableSchemaOperator( update_table_schema_customer_purchase_curated = BigQueryUpdateTableSchemaOperator(
task_id="update_table_schema_customer_purchase_l1", task_id="update_table_schema_customer_purchase_curated",
project_id=DTL_L1_PRJ, project_id=DWH_CURATED_PRJ,
dataset_id=DTL_L1_BQ_DATASET, dataset_id=DWH_CURATED_BQ_DATASET,
table_id="customer_purchase", table_id="customer_purchase",
impersonation_chain=[TRF_SA_BQ], impersonation_chain=[TRF_SA_BQ],
include_policy_tags=True, include_policy_tags=True,
@ -211,10 +211,10 @@ with models.DAG(
] ]
) )
update_table_schema_customer_purchase_l2 = BigQueryUpdateTableSchemaOperator( update_table_schema_customer_purchase_confidential = BigQueryUpdateTableSchemaOperator(
task_id="update_table_schema_customer_purchase_l2", task_id="update_table_schema_customer_purchase_confidential",
project_id=DTL_L2_PRJ, project_id=DWH_CONFIDENTIAL_PRJ,
dataset_id=DTL_L2_BQ_DATASET, dataset_id=DWH_CONFIDENTIAL_BQ_DATASET,
table_id="customer_purchase", table_id="customer_purchase",
impersonation_chain=[TRF_SA_BQ], impersonation_chain=[TRF_SA_BQ],
include_policy_tags=True, include_policy_tags=True,
@ -238,8 +238,8 @@ with models.DAG(
"javascriptTextTransformFunctionName": "transform", "javascriptTextTransformFunctionName": "transform",
"JSONPath": ORC_GCS + "/customers_schema.json", "JSONPath": ORC_GCS + "/customers_schema.json",
"javascriptTextTransformGcsPath": ORC_GCS + "/customers_udf.js", "javascriptTextTransformGcsPath": ORC_GCS + "/customers_udf.js",
"inputFilePattern": LND_GCS + "/customers.csv", "inputFilePattern": DRP_GCS + "/customers.csv",
"outputTable": DTL_L0_PRJ + ":"+DTL_L0_BQ_DATASET+".customers", "outputTable": DWH_LAND_PRJ + ":" + DWH_LAND_BQ_DATASET + ".customers",
"bigQueryLoadingTemporaryDirectory": LOD_GCS_STAGING + "/tmp/bq/", "bigQueryLoadingTemporaryDirectory": LOD_GCS_STAGING + "/tmp/bq/",
}, },
) )
@ -253,8 +253,8 @@ with models.DAG(
"javascriptTextTransformFunctionName": "transform", "javascriptTextTransformFunctionName": "transform",
"JSONPath": ORC_GCS + "/purchases_schema.json", "JSONPath": ORC_GCS + "/purchases_schema.json",
"javascriptTextTransformGcsPath": ORC_GCS + "/purchases_udf.js", "javascriptTextTransformGcsPath": ORC_GCS + "/purchases_udf.js",
"inputFilePattern": LND_GCS + "/purchases.csv", "inputFilePattern": DRP_GCS + "/purchases.csv",
"outputTable": DTL_L0_PRJ + ":"+DTL_L0_BQ_DATASET+".purchases", "outputTable": DWH_LAND_PRJ + ":" + DWH_LAND_BQ_DATASET + ".purchases",
"bigQueryLoadingTemporaryDirectory": LOD_GCS_STAGING + "/tmp/bq/", "bigQueryLoadingTemporaryDirectory": LOD_GCS_STAGING + "/tmp/bq/",
}, },
) )
@ -275,12 +275,12 @@ with models.DAG(
p.item as item, p.item as item,
p.price as price, p.price as price,
p.timestamp as timestamp p.timestamp as timestamp
FROM `{dtl_0_prj}.{dtl_0_dataset}.customers` c FROM `{dwh_0_prj}.{dwh_0_dataset}.customers` c
JOIN `{dtl_0_prj}.{dtl_0_dataset}.purchases` p ON c.id = p.customer_id JOIN `{dwh_0_prj}.{dwh_0_dataset}.purchases` p ON c.id = p.customer_id
""".format(dtl_0_prj=DTL_L0_PRJ, dtl_0_dataset=DTL_L0_BQ_DATASET, ), """.format(dwh_0_prj=DWH_LAND_PRJ, dwh_0_dataset=DWH_LAND_BQ_DATASET, ),
'destinationTable':{ 'destinationTable':{
'projectId': DTL_L1_PRJ, 'projectId': DWH_CURATED_PRJ,
'datasetId': DTL_L1_BQ_DATASET, 'datasetId': DWH_CURATED_BQ_DATASET,
'tableId': 'customer_purchase' 'tableId': 'customer_purchase'
}, },
'writeDisposition':'WRITE_TRUNCATE', 'writeDisposition':'WRITE_TRUNCATE',
@ -290,8 +290,8 @@ with models.DAG(
impersonation_chain=[TRF_SA_BQ] impersonation_chain=[TRF_SA_BQ]
) )
l2_customer_purchase = BigQueryInsertJobOperator( confidential_customer_purchase = BigQueryInsertJobOperator(
task_id='bq_l2_customer_purchase', task_id='bq_confidential_customer_purchase',
gcp_conn_id='bigquery_default', gcp_conn_id='bigquery_default',
project_id=TRF_PRJ, project_id=TRF_PRJ,
location=BQ_LOCATION, location=BQ_LOCATION,
@ -306,11 +306,11 @@ with models.DAG(
item, item,
price, price,
timestamp timestamp
FROM `{dtl_1_prj}.{dtl_1_dataset}.customer_purchase` FROM `{dwh_cur_prj}.{dwh_cur_dataset}.customer_purchase`
""".format(dtl_1_prj=DTL_L1_PRJ, dtl_1_dataset=DTL_L1_BQ_DATASET, ), """.format(dwh_cur_prj=DWH_CURATED_PRJ, dwh_cur_dataset=DWH_CURATED_BQ_DATASET, ),
'destinationTable':{ 'destinationTable':{
'projectId': DTL_L2_PRJ, 'projectId': DWH_CONFIDENTIAL_PRJ,
'datasetId': DTL_L2_BQ_DATASET, 'datasetId': DWH_CONFIDENTIAL_BQ_DATASET,
'tableId': 'customer_purchase' 'tableId': 'customer_purchase'
}, },
'writeDisposition':'WRITE_TRUNCATE', 'writeDisposition':'WRITE_TRUNCATE',
@ -319,4 +319,4 @@ with models.DAG(
}, },
impersonation_chain=[TRF_SA_BQ] impersonation_chain=[TRF_SA_BQ]
) )
start >> upsert_table >> update_schema_table >> [customers_import, purchases_import] >> join_customer_purchase >> l2_customer_purchase >> end start >> upsert_table >> update_schema_table >> [customers_import, purchases_import] >> join_customer_purchase >> confidential_customer_purchase >> end

View File

@ -34,23 +34,23 @@ from airflow.utils.task_group import TaskGroup
# -------------------------------------------------------------------------------- # --------------------------------------------------------------------------------
BQ_LOCATION = os.environ.get("BQ_LOCATION") BQ_LOCATION = os.environ.get("BQ_LOCATION")
DATA_CAT_TAGS = json.loads(os.environ.get("DATA_CAT_TAGS")) DATA_CAT_TAGS = json.loads(os.environ.get("DATA_CAT_TAGS"))
DTL_L0_PRJ = os.environ.get("DTL_L0_PRJ") DWH_LAND_PRJ = os.environ.get("DWH_LAND_PRJ")
DTL_L0_BQ_DATASET = os.environ.get("DTL_L0_BQ_DATASET") DWH_LAND_BQ_DATASET = os.environ.get("DWH_LAND_BQ_DATASET")
DTL_L0_GCS = os.environ.get("DTL_L0_GCS") DWH_LAND_GCS = os.environ.get("DWH_LAND_GCS")
DTL_L1_PRJ = os.environ.get("DTL_L1_PRJ") DWH_CURATED_PRJ = os.environ.get("DWH_CURATED_PRJ")
DTL_L1_BQ_DATASET = os.environ.get("DTL_L1_BQ_DATASET") DWH_CURATED_BQ_DATASET = os.environ.get("DWH_CURATED_BQ_DATASET")
DTL_L1_GCS = os.environ.get("DTL_L1_GCS") DWH_CURATED_GCS = os.environ.get("DWH_CURATED_GCS")
DTL_L2_PRJ = os.environ.get("DTL_L2_PRJ") DWH_CONFIDENTIAL_PRJ = os.environ.get("DWH_CONFIDENTIAL_PRJ")
DTL_L2_BQ_DATASET = os.environ.get("DTL_L2_BQ_DATASET") DWH_CONFIDENTIAL_BQ_DATASET = os.environ.get("DWH_CONFIDENTIAL_BQ_DATASET")
DTL_L2_GCS = os.environ.get("DTL_L2_GCS") DWH_CONFIDENTIAL_GCS = os.environ.get("DWH_CONFIDENTIAL_GCS")
DTL_PLG_PRJ = os.environ.get("DTL_PLG_PRJ") DWH_PLG_PRJ = os.environ.get("DWH_PLG_PRJ")
DTL_PLG_BQ_DATASET = os.environ.get("DTL_PLG_BQ_DATASET") DWH_PLG_BQ_DATASET = os.environ.get("DWH_PLG_BQ_DATASET")
DTL_PLG_GCS = os.environ.get("DTL_PLG_GCS") DWH_PLG_GCS = os.environ.get("DWH_PLG_GCS")
GCP_REGION = os.environ.get("GCP_REGION") GCP_REGION = os.environ.get("GCP_REGION")
LND_PRJ = os.environ.get("LND_PRJ") DRP_PRJ = os.environ.get("DRP_PRJ")
LND_BQ = os.environ.get("LND_BQ") DRP_BQ = os.environ.get("DRP_BQ")
LND_GCS = os.environ.get("LND_GCS") DRP_GCS = os.environ.get("DRP_GCS")
LND_PS = os.environ.get("LND_PS") DRP_PS = os.environ.get("DRP_PS")
LOD_PRJ = os.environ.get("LOD_PRJ") LOD_PRJ = os.environ.get("LOD_PRJ")
LOD_GCS_STAGING = os.environ.get("LOD_GCS_STAGING") LOD_GCS_STAGING = os.environ.get("LOD_GCS_STAGING")
LOD_NET_VPC = os.environ.get("LOD_NET_VPC") LOD_NET_VPC = os.environ.get("LOD_NET_VPC")
@ -121,25 +121,25 @@ with models.DAG(
with TaskGroup('delete_table') as delte_table: with TaskGroup('delete_table') as delte_table:
delete_table_customers = BigQueryDeleteTableOperator( delete_table_customers = BigQueryDeleteTableOperator(
task_id="delete_table_customers", task_id="delete_table_customers",
deletion_dataset_table=DTL_L0_PRJ+"."+DTL_L0_BQ_DATASET+".customers", deletion_dataset_table=DWH_LAND_PRJ+"."+DWH_LAND_BQ_DATASET+".customers",
impersonation_chain=[TRF_SA_DF] impersonation_chain=[TRF_SA_DF]
) )
delete_table_purchases = BigQueryDeleteTableOperator( delete_table_purchases = BigQueryDeleteTableOperator(
task_id="delete_table_purchases", task_id="delete_table_purchases",
deletion_dataset_table=DTL_L0_PRJ+"."+DTL_L0_BQ_DATASET+".purchases", deletion_dataset_table=DWH_LAND_PRJ+"."+DWH_LAND_BQ_DATASET+".purchases",
impersonation_chain=[TRF_SA_DF] impersonation_chain=[TRF_SA_DF]
) )
delete_table_customer_purchase_l1 = BigQueryDeleteTableOperator( delete_table_customer_purchase_curated = BigQueryDeleteTableOperator(
task_id="delete_table_customer_purchase_l1", task_id="delete_table_customer_purchase_curated",
deletion_dataset_table=DTL_L1_PRJ+"."+DTL_L1_BQ_DATASET+".customer_purchase", deletion_dataset_table=DWH_CURATED_PRJ+"."+DWH_CURATED_BQ_DATASET+".customer_purchase",
impersonation_chain=[TRF_SA_DF] impersonation_chain=[TRF_SA_DF]
) )
delete_table_customer_purchase_l2 = BigQueryDeleteTableOperator( delete_table_customer_purchase_confidential = BigQueryDeleteTableOperator(
task_id="delete_table_customer_purchase_l2", task_id="delete_table_customer_purchase_confidential",
deletion_dataset_table=DTL_L2_PRJ+"."+DTL_L2_BQ_DATASET+".customer_purchase", deletion_dataset_table=DWH_CONFIDENTIAL_PRJ+"."+DWH_CONFIDENTIAL_BQ_DATASET+".customer_purchase",
impersonation_chain=[TRF_SA_DF] impersonation_chain=[TRF_SA_DF]
) )

Binary file not shown (image updated: 70 KiB → 50 KiB)

View File

@ -17,25 +17,25 @@
output "bigquery-datasets" { output "bigquery-datasets" {
description = "BigQuery datasets." description = "BigQuery datasets."
value = { value = {
land-bq-0 = module.land-bq-0.dataset_id, drop-bq-0 = module.drop-bq-0.dataset_id,
lake-0-bq-0 = module.lake-0-bq-0.dataset_id, dwh-landing-bq-0 = module.dwh-lnd-bq-0.dataset_id,
lake-1-bq-0 = module.lake-1-bq-0.dataset_id, dwh-curated-bq-0 = module.dwh-cur-bq-0.dataset_id,
lake-2-bq-0 = module.lake-2-bq-0.dataset_id, dwh-confidential-bq-0 = module.dwh-conf-bq-0.dataset_id,
lake-plg-bq-0 = module.lake-plg-bq-0.dataset_id, dwh-plg-bq-0 = module.dwh-plg-bq-0.dataset_id,
} }
} }
output "gcs-buckets" { output "gcs-buckets" {
description = "GCS buckets." description = "GCS buckets."
value = { value = {
lake-0-cs-0 = module.lake-0-cs-0.name, dwh-landing-cs-0 = module.dwh-lnd-cs-0.name,
lake-1-cs-0 = module.lake-1-cs-0.name, dwh-curated-cs-0 = module.dwh-cur-cs-0.name,
lake-2-cs-0 = module.lake-2-cs-0.name, dwh-confidential-cs-0 = module.dwh-conf-cs-0.name,
lake-plg-cs-0 = module.lake-plg-cs-0.name, dwh-plg-cs-0 = module.dwh-plg-cs-0.name,
land-cs-0 = module.land-cs-0.name, drop-cs-0 = module.drop-cs-0.name,
lod-cs-df = module.load-cs-df-0.name, lod-cs-df = module.load-cs-df-0.name,
orch-cs-0 = module.orch-cs-0.name, orch-cs-0 = module.orch-cs-0.name,
transf-cs-df = module.transf-cs-df-0.name, transf-cs-df = module.transf-cs-df-0.name,
} }
} }
@ -48,26 +48,26 @@ output "projects" {
description = "GCP Projects informations." description = "GCP Projects informations."
value = { value = {
project_number = { project_number = {
lake-0 = module.lake-0-project.number, dwh-landing = module.dwh-lnd-project.number,
lake-1 = module.lake-1-project.number, dwh-curated = module.dwh-cur-project.number,
lake-2 = module.lake-2-project.number, dwh-confidential = module.dwh-conf-project.number,
lake-plg = module.lake-plg-project.number, dwh-plg = module.dwh-plg-project.number,
exposure = module.exp-project.number, exposure = module.exp-project.number,
landing = module.land-project.number, dropoff = module.drop-project.number,
load = module.load-project.number, load = module.load-project.number,
orchestration = module.orch-project.number, orchestration = module.orch-project.number,
transformation = module.transf-project.number, transformation = module.transf-project.number,
} }
project_id = { project_id = {
lake-0 = module.lake-0-project.project_id, dwh-landing = module.dwh-lnd-project.project_id,
lake-1 = module.lake-1-project.project_id, dwh-curated = module.dwh-cur-project.project_id,
lake-2 = module.lake-2-project.project_id, dwh-confidential = module.dwh-conf-project.project_id,
lake-plg = module.lake-plg-project.project_id, dwh-plg = module.dwh-plg-project.project_id,
exposure = module.exp-project.project_id, exposure = module.exp-project.project_id,
landing = module.land-project.project_id, dropoff = module.drop-project.project_id,
load = module.load-project.project_id, load = module.load-project.project_id,
orchestration = module.orch-project.project_id, orchestration = module.orch-project.project_id,
transformation = module.transf-project.project_id, transformation = module.transf-project.project_id,
} }
} }
} }
@ -93,12 +93,12 @@ output "vpc_subnet" {
output "demo_commands" { output "demo_commands" {
description = "Demo commands." description = "Demo commands."
value = { value = {
01 = "gsutil -i ${module.land-sa-cs-0.email} cp demo/data/*.csv gs://${module.land-cs-0.name}" 01 = "gsutil -i ${module.drop-sa-cs-0.email} cp demo/data/*.csv gs://${module.drop-cs-0.name}"
02 = "gsutil -i ${module.orch-sa-cmp-0.email} cp demo/data/*.j* gs://${module.orch-cs-0.name}" 02 = "gsutil -i ${module.orch-sa-cmp-0.email} cp demo/data/*.j* gs://${module.orch-cs-0.name}"
03 = "gsutil -i ${module.orch-sa-cmp-0.email} cp demo/*.py ${google_composer_environment.orch-cmp-0.config[0].dag_gcs_prefix}/" 03 = "gsutil -i ${module.orch-sa-cmp-0.email} cp demo/*.py ${google_composer_environment.orch-cmp-0.config[0].dag_gcs_prefix}/"
04 = "Open ${google_composer_environment.orch-cmp-0.config.0.airflow_uri} and run uploaded DAG." 04 = "Open ${google_composer_environment.orch-cmp-0.config.0.airflow_uri} and run uploaded DAG."
05 = <<EOT 05 = <<EOT
bq query --project_id=${module.lake-2-project.project_id} --use_legacy_sql=false 'SELECT * EXCEPT (name, surname) FROM `${module.lake-2-project.project_id}.${module.lake-2-bq-0.dataset_id}.customer_purchase` LIMIT 1000' bq query --project_id=${module.dwh-conf-project.project_id} --use_legacy_sql=false 'SELECT * EXCEPT (name, surname) FROM `${module.dwh-conf-project.project_id}.${module.dwh-conf-bq-0.dataset_id}.customer_purchase` LIMIT 1000'
EOT EOT
} }
} }

File diff suppressed because it is too large

View File

@ -31,10 +31,10 @@ The Data Platform manages:
As per our GCP best practices the Data Platform relies on user groups to assign roles to human identities. These are the specific groups used by the Data Platform and their access patterns, from the [module documentation](../../../../examples/data-solutions/data-platform-foundations/#groups): As per our GCP best practices the Data Platform relies on user groups to assign roles to human identities. These are the specific groups used by the Data Platform and their access patterns, from the [module documentation](../../../../examples/data-solutions/data-platform-foundations/#groups):
- *Data Engineers*. They handle and run the Data Hub, with read access to all resources in order to troubleshoot possible issues with pipelines. This team can also impersonate any service account. - *Data Engineers*. They handle and run the Data Hub, with read access to all resources in order to troubleshoot possible issues with pipelines. This team can also impersonate any service account.
- *Data Analysts*. They perform analysis on datasets, with read access to the data lake L2 project, and BigQuery READ/WRITE access to the playground project. - *Data Analysts*. They perform analysis on datasets, with read access to the data warehouse Curated or Confidential projects depending on their privileges, and BigQuery READ/WRITE access to the playground project.
- *Data Security*. They handle security configurations related to the Data Hub. This team has admin access to the common project to configure Cloud DLP templates or Data Catalog policy tags (a minimal policy-tag sketch follows the access table below). - *Data Security*. They handle security configurations related to the Data Hub. This team has admin access to the common project to configure Cloud DLP templates or Data Catalog policy tags (a minimal policy-tag sketch follows the access table below).
|Group|Landing|Load|Transformation|Data Lake L0|Data Lake L1|Data Lake L2|Data Lake Playground|Orchestration|Common| |Group|Landing|Load|Transformation|Data Warehouse Landing|Data Warehouse Curated|Data Warehouse Confidential|Data Warehouse Playground|Orchestration|Common|
|-|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| |-|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|Data Engineers|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`| |Data Engineers|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|`ADMIN`|
|Data Analysts|-|-|-|-|-|`READ`|`READ`/`WRITE`|-|-| |Data Analysts|-|-|-|-|-|`READ`|`READ`/`WRITE`|-|-|
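For illustration, a minimal Terraform sketch of the kind of Data Catalog policy-tag taxonomy the Data Security team could manage in the common project; the resource names, project id, region and display names below are placeholders, not values defined by this stage.

```hcl
# Hypothetical taxonomy with a single policy tag for PII columns.
resource "google_data_catalog_taxonomy" "pii" {
  project                = "common-project-id" # placeholder
  region                 = "europe-west1"      # placeholder
  display_name           = "data-security-taxonomy"
  description            = "Policy tags managed by the Data Security team."
  activated_policy_types = ["FINE_GRAINED_ACCESS_CONTROL"]
}

resource "google_data_catalog_policy_tag" "high" {
  taxonomy     = google_data_catalog_taxonomy.pii.id
  display_name = "high"
  description  = "Columns holding PII, for example name and surname."
}
```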
@ -69,6 +69,12 @@ As is often the case in real-world configurations, [VPC-SC](https://cloud.google
To configure the use of VPC-SC on the data platform, you have to specify the data platform project numbers on the `vpc_sc_perimeter_projects.dev` variable on [FAST security stage](../../02-security#perimeter-resources). To configure the use of VPC-SC on the data platform, you have to specify the data platform project numbers on the `vpc_sc_perimeter_projects.dev` variable on [FAST security stage](../../02-security#perimeter-resources).
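As a minimal sketch, assuming the variable accepts per-environment lists of `projects/<number>` references (check the variable definition in the security stage for the exact shape), the tfvars entry could look like the fragment below; all project numbers are placeholders.

```hcl
# Hypothetical tfvars fragment for the 02-security stage.
vpc_sc_perimeter_projects = {
  dev = [
    "projects/111111111111", # drop off
    "projects/222222222222", # load
    "projects/333333333333", # transformation
    "projects/444444444444", # orchestration
  ]
}
```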
If your Data Warehouse needs to handle confidential data that must be separated more strongly than IAM alone allows, the suggested configuration is to keep the confidential project in a separate VPC-SC perimeter, with the ingress/egress rules needed by the load and transformation service accounts. The high-level diagram below describes this configuration.
<p align="center">
<img src="diagram_vpcsc.png" alt="Data Platform VPC-SC diagram">
</p>
## How to run this stage ## How to run this stage
This stage can be run in isolation by providing the necessary variables, but it's really meant to be used as part of the FAST flow after the "foundational stages" ([`00-bootstrap`](../../00-bootstrap), [`01-resman`](../../01-resman), [`02-networking`](../../02-networking-vpn) and [`02-security`](../../02-security)). This stage can be run in isolation by providing the necessary variables, but it's really meant to be used as part of the FAST flow after the "foundational stages" ([`00-bootstrap`](../../00-bootstrap), [`01-resman`](../../01-resman), [`02-networking`](../../02-networking-vpn) and [`02-security`](../../02-security)).
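As a rough sketch of an isolated run, the tfvars below cover only the variables used by this stage's project and bucket modules; any additional variables required by the FAST flow are omitted and all values are placeholders.

```hcl
# Hypothetical terraform.tfvars for running this stage on its own.
billing_account_id = "ABCDEF-012345-6789AB"
folder_id          = "folders/1234567890"
prefix             = "myco-dev"
location           = "EU"
```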
@ -131,7 +137,7 @@ terraform apply
## Demo pipeline ## Demo pipeline
The application layer is out of scope of this script. For demo purposes only, several Cloud Composer DAGs are provided. Demos will import data from the `landing` area to the `DataLake L2` dataset using different features. The application layer is out of scope of this script. For demo purposes only, several Cloud Composer DAGs are provided. Demos will import data from the `landing` area to the `DataWarehouse Confidential` dataset using different features.
You can find examples in the [demo](../../../../examples/data-solutions/data-platform-foundations/demo) folder. You can find examples in the [demo](../../../../examples/data-solutions/data-platform-foundations/demo) folder.

Binary file not shown (image updated: 115 KiB → 58 KiB)

Binary file not shown (new image added: 34 KiB)

View File

@ -1,4 +1,5 @@
click click
deepdiff
marko marko
requests requests
yamale yamale