Fix issues & readme

Lorenzo Caggioni 2022-02-02 15:31:54 +01:00
parent 4d04933a57
commit 3c99074b3f
10 changed files with 160 additions and 74 deletions

View File

@ -51,9 +51,6 @@ resource "google_composer_environment" "orc-cmp-0" {
software_config {
image_version = "composer-1.17.5-airflow-2.1.4"
env_variables = {
DTL_EXP_PRJ = module.dtl-0-prj.project_id
DTL_EXP_BQ_DATASET = module.dtl-exp-bq-0.dataset_id
DTL_EXP_GCS = module.dtl-exp-cs-0.url
DTL_L0_PRJ = module.dtl-0-prj.project_id
DTL_L0_BQ_DATASET = module.dtl-0-bq-0.dataset_id
DTL_L0_GCS = module.dtl-0-cs-0.url
@ -63,6 +60,9 @@ resource "google_composer_environment" "orc-cmp-0" {
DTL_L2_PRJ = module.dtl-2-prj.project_id
DTL_L2_BQ_DATASET = module.dtl-2-bq-0.dataset_id
DTL_L2_GCS = module.dtl-2-cs-0.url
DTL_PLG_PRJ = module.dtl-plg-prj.project_id
DTL_PLG_BQ_DATASET = module.dtl-plg-bq-0.dataset_id
DTL_PLG_GCS = module.dtl-plg-cs-0.url
GCP_REGION = var.composer_config.region
LND_PRJ = module.lnd-prj.project_id
LND_BQ = module.lnd-bq-0.dataset_id
@ -77,6 +77,8 @@ resource "google_composer_environment" "orc-cmp-0" {
ORC_GCS = module.orc-cs-0.url
TRF_PRJ = module.trf-prj.project_id
TRF_GCS_STAGING = module.trf-cs-df-0.url
TRF_NET_VPC = module.trf-vpc[0].self_link
TRF_NET_SUBNET = module.trf-vpc[0].subnet_self_links["${var.composer_config.region}/${local.prefix_trf}-subnet"]
TRF_SA_DF = module.trf-sa-df-0.email
TRF_SA_BQ = module.trf-sa-bq-0.email
}

View File

@ -63,7 +63,7 @@ locals {
module "dtl-0-prj" {
source = "../../../modules/project"
name = var.project_create == null ? var.project_id["datalake"] : "${var.project_id["datalake"]}-0"
name = var.project_id["datalake-l0"]
parent = try(var.project_create.parent, null)
billing_account = try(var.project_create.billing_account_id, null)
project_create = var.project_create != null
@ -92,7 +92,7 @@ module "dtl-0-prj" {
module "dtl-1-prj" {
source = "../../../modules/project"
name = var.project_create == null ? var.project_id["datalake"] : "${var.project_id["datalake"]}-1"
name = var.project_id["datalake-l1"]
parent = try(var.project_create.parent, null)
billing_account = try(var.project_create.billing_account_id, null)
project_create = var.project_create != null
@ -121,7 +121,7 @@ module "dtl-1-prj" {
module "dtl-2-prj" {
source = "../../../modules/project"
name = var.project_create == null ? var.project_id["datalake"] : "${var.project_id["datalake"]}-2"
name = var.project_id["datalake-l2"]
parent = try(var.project_create.parent, null)
billing_account = try(var.project_create.billing_account_id, null)
project_create = var.project_create != null
@ -148,9 +148,9 @@ module "dtl-2-prj" {
}
}
module "dtl-exp-prj" {
module "dtl-plg-prj" {
source = "../../../modules/project"
name = var.project_create == null ? var.project_id["datalake"] : "${var.project_id["datalake"]}-exp"
name = var.project_id["datalake-playground"]
parent = try(var.project_create.parent, null)
billing_account = try(var.project_create.billing_account_id, null)
project_create = var.project_create != null

View File

@ -40,10 +40,10 @@ module "dtl-2-bq-0" {
encryption_key = try(local.service_encryption_keys.bq, null)
}
module "dtl-exp-bq-0" {
module "dtl-plg-bq-0" {
source = "../../../modules/bigquery-dataset"
project_id = module.dtl-exp-prj.project_id
id = "${replace(local.prefix_dtl, "-", "_")}_exp_bq_0"
project_id = module.dtl-plg-prj.project_id
id = "${replace(local.prefix_dtl, "-", "_")}_plg_bq_0"
location = var.location_config.region
encryption_key = try(local.service_encryption_keys.bq, null)
}
@ -83,10 +83,10 @@ module "dtl-2-cs-0" {
force_destroy = var.data_force_destroy
}
module "dtl-exp-cs-0" {
module "dtl-plg-cs-0" {
source = "../../../modules/gcs"
project_id = module.dtl-exp-prj.project_id
name = "exp-cs-0"
project_id = module.dtl-plg-prj.project_id
name = "plg-cs-0"
prefix = local.prefix_dtl
location = var.location_config.region
storage_class = "REGIONAL"

View File

@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# tfdoc:file:description Security project.
# tfdoc:file:description Common project.
locals {
group_iam_sec = {
group_iam_cmn = {
"${local.groups.data-engineers}" = [
"roles/dlp.reader",
"roles/dlp.user",
@ -25,35 +25,35 @@ locals {
"roles/dlp.admin",
],
}
iam_sec = {
iam_cmn = {
"roles/dlp.user" = [
module.lod-sa-df-0.iam_email,
module.trf-sa-df-0.iam_email
]
}
prefix_sec = "${var.prefix}-sec"
prefix_cmn = "${var.prefix}-cmn"
}
# Project
module "sec-prj" {
module "cmn-prj" {
source = "../../../modules/project"
name = var.project_id["security"]
name = var.project_id["common"]
parent = try(var.project_create.parent, null)
billing_account = try(var.project_create.billing_account_id, null)
project_create = var.project_create != null
prefix = var.project_create == null ? null : var.prefix
# additive IAM bindings avoid disrupting bindings in existing project
iam = var.project_create != null ? local.iam_trf : {}
iam_additive = var.project_create == null ? local.iam_trf : {}
group_iam = local.group_iam_trf
iam = var.project_create != null ? local.iam_cmn : {}
iam_additive = var.project_create == null ? local.iam_cmn : {}
group_iam = local.group_iam_cmn
services = concat(var.project_services, [
"dlp.googleapis.com",
])
}
# Uncomment this section and assign key links accordingly in the locals block
# if you want to create KMS keys in the security projet
# if you want to create KMS keys in the common project
# module "sec-kms-0" {
# source = "../../../modules/kms"

View File

@ -0,0 +1,42 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# tfdoc:file:description common project.
locals {
group_iam_exp = {
# TODO: add group => role mappings to assign on the exposure project
}
iam_exp = {
# TODO: add role => service account mappings to assign roles on the exposure project
}
prefix_exp = "${var.prefix}-exp"
}
# Project
module "exp-prj" {
source = "../../../modules/project"
name = var.project_id["exposure"]
parent = try(var.project_create.parent, null)
billing_account = try(var.project_create.billing_account_id, null)
project_create = var.project_create != null
prefix = var.project_create == null ? null : var.prefix
# additive IAM bindings avoid disrupting bindings in existing project
iam = var.project_create != null ? local.iam_exp : {}
iam_additive = var.project_create == null ? local.iam_exp : {}
group_iam = local.group_iam_exp
services = concat(var.project_services, [
])
}

View File

@ -2,12 +2,14 @@
This module implements an opinionated Data Platform (DP) Architecture that creates and sets up projects (and related resources) to be used to create your DP.
The code is intentionally simple, as it's intended to provide a generic initial setup (Networking, Security, etc.), and then allow easy customizations to complete the implementation of the intended hierarchy design.
The code is intentionally simple, as it's intended to provide a generic initial setup (Networking, Cloud Storage Buckets, BigQuery datasets, etc.), and then allow easy customizations to complete the implementation of the intended design.
The following diagram is a high-level reference of the resources created and managed here:
![Data Platform architecture overview](./images/overview_diagram.png "Data Platform architecture overview")
A demo pipeline is also part of this example: it can be built and run on top of the foundational infrastructure to quickly verify or test the setup.
## Design overview and choices
Despite its simplicity, this stage implements the basics of a design that we've seen working well for a variety of customers.
@ -37,10 +39,10 @@ The following projects will be created:
- **L0 - Raw data** Data stored in the appropriate format: structured data stored in BigQuery, unstructured data stored on Cloud Storage with additional metadata stored in BigQuery (for example, pictures stored in Cloud Storage and the corresponding Cloud Vision API analysis stored in BigQuery).
- **L1 - Cleansed, aggregated and standardized data**
- **L2 - Curated layer**
- **Experimental** Store temporary tables that Data Analyst may use to perform R&D on data available on other Data Lake layers
- **Playground** Stores temporary tables that Data Analysts may use to perform R&D on data available in other Data Lake layers
- **Orchestration** This project is intended to host Cloud Composer. Cloud Composer will orchestrate all tasks to move your data on its journey.
- **Transformation** This project is intended to host resources to move data from one layer of the Data Lake to the other. We strongly suggest relying on the BigQuery engine to perform transformations. If BigQuery doesn't have the features needed to perform your transformation, we suggest using Cloud Dataflow; the use of [Cloud Dataflow templates](https://cloud.google.com/dataflow/docs/concepts/dataflow-templates) is recommended. Anonymization or tokenization of Personally Identifiable Information can be applied at this stage or in the load stage, depending on your requirements.
- **Exposure** This project is intended to host resources to expose your data. To expose BigQuery data, we strongly suggest relying on Authorized views. Other resources may better fit a particular data access pattern, example: Cloud SQL may be needed if you need to expose data with low latency, BigTable may be needed in a use case where you need lower latency to access data.
- **Exposure** This project is intended to host resources to expose your data. To expose BigQuery data, we strongly suggest relying on Authorized views. Other resources may better fit a particular data access pattern: for example, Cloud SQL may be needed if you need to expose data with low latency, or Bigtable for use cases that need even lower latency to access data. For the purpose of this example, no resources are deployed in this project; please customize the example as needed.
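As a reference only, the sketch below shows one way an authorized view could expose the curated `customer_purchase` table from the L2 Data Lake; the dataset and view names are illustrative and are not created by this stage.

```hcl
# Illustrative sketch, not part of this stage: expose the L2 customer_purchase
# table through an authorized view hosted in the exposure project.
resource "google_bigquery_dataset" "exposure" {
  project    = module.exp-prj.project_id
  dataset_id = "exposure_views"
  location   = var.location_config.region
}

resource "google_bigquery_table" "customer_purchase_view" {
  project    = module.exp-prj.project_id
  dataset_id = google_bigquery_dataset.exposure.dataset_id
  table_id   = "customer_purchase"
  view {
    query          = "SELECT * FROM `${module.dtl-2-prj.project_id}.${module.dtl-2-bq-0.dataset_id}.customer_purchase`"
    use_legacy_sql = false
  }
}

# Authorize the view on the L2 dataset so consumers never need direct access
# to the underlying tables.
resource "google_bigquery_dataset_access" "authorized_view" {
  project    = module.dtl-2-prj.project_id
  dataset_id = module.dtl-2-bq-0.dataset_id
  view {
    project_id = google_bigquery_table.customer_purchase_view.project
    dataset_id = google_bigquery_table.customer_purchase_view.dataset_id
    table_id   = google_bigquery_table.customer_purchase_view.table_id
  }
}
```

With this pattern consumers query the view in the exposure project while the underlying L2 tables stay private.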
### Roles
We assign roles on resources at the project level, granting the appropriate role to groups. We recommend not granting IAM permissions on data resources to human users directly; add users to the resource-access groups instead.
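For reference, the pattern used throughout this stage boils down to the minimal sketch below; the group e-mail and roles are placeholders, not the actual bindings.

```hcl
# Minimal sketch of the group => roles pattern used by the project modules in
# this stage; the group e-mail and roles are placeholders.
locals {
  group_iam_example = {
    "gcp-data-analyst@example.com" = [
      "roles/bigquery.dataViewer",
      "roles/storage.objectViewer",
    ]
  }
}

module "example-prj" {
  source    = "../../../modules/project"
  name      = "example-prj"
  group_iam = local.group_iam_example
  # remaining project arguments (parent, billing, services, ...) omitted
}
```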
@ -60,7 +62,7 @@ Whilst necessary in some scenarios, such as programmatic access from on-premise
### Groups
As default groups, we identified the following actors:
- *Data Engineers*: the group that handles and runs the Data Hub. The group has Read access to all resources to be able to troubleshoot possible issues with the pipeline. The team can also impersonate all service accounts. Default value: `gcp-data-engineers@DOMAIN.COM`.
- *Data Analyst*: the group that performs analysis on the dataset. The group has Read access to the Data Lake L2 project and BigQuery READ/WRITE access to the `experimental` project. Default value: `gcp-data-analyst@DOMAIN.COM`
- *Data Analyst*: the group that performs analysis on the dataset. The group has Read access to the Data Lake L2 project and BigQuery READ/WRITE access to the `playground` project. Default value: `gcp-data-analyst@DOMAIN.COM`
- *Data Security*: the group that handles security features related to the Data Hub. Default value: `gcp-data-security@DOMAIN.com`.
### Virtual Private Cloud (VPC) design
The DP expects an existing [Shared-VPC](https://cloud.google.com/vpc/docs/shared-vpc) as input to run its resources. You can configure subnets for DP resources by specifying the link to the subnet in the `network_config` variable (see the sketch after the ranges list below). You may want to configure a Shared VPC to run your resources when your pipelines need to reach on-premise resources.
@ -74,7 +76,7 @@ To run your DP resources you need the following ranges:
- Cloud SQL. Range: '/24'
- GKE Master. Range: '/28'
- Web Server. Range: '/28'
- Secondary IP ranges. Pods range: '/22', Services range: '/24'
- Secondary IP ranges. Pods range: '/22', Services range: '/24'
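As a reference, a `network_config` value could look like the sketch below; the attribute names and self links are assumptions to adapt to your Shared VPC setup.

```hcl
# Illustrative network_config value: every self link and range below is a
# placeholder, and the exact attribute names may differ in your setup.
network_config = {
  network_self_link = "https://www.googleapis.com/compute/v1/projects/HOST_PROJECT/global/networks/SHARED_VPC"
  composer_subnet   = "https://www.googleapis.com/compute/v1/projects/HOST_PROJECT/regions/REGION/subnetworks/composer-subnet"
  composer_ip_ranges = {
    cloudsql   = "10.20.10.0/24"
    gke_master = "10.20.11.0/28"
    web_server = "10.20.11.16/28"
  }
  composer_secondary_ranges = {
    pods     = "10.20.12.0/22"
    services = "10.20.13.0/24"
  }
}
```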
### Resource naming convention
Resources in the script use the following acronyms:
@ -130,6 +132,11 @@ We implemented a centralized model for Cloud Data Loss Prevention resources. Tem
![Centralized Cloud Data Loss Prevention high-level diagram](./images/dlp_diagram.png "Centralized Cloud Data Loss Prevention high-level diagram")
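As a hint of what centralizing these resources can look like, the sketch below creates a shared inspect template in the common project; the display name and info types are illustrative.

```hcl
# Illustrative sketch: a DLP inspect template hosted in the common project so
# that load and transformation pipelines reference a single shared template.
resource "google_data_loss_prevention_inspect_template" "pii" {
  parent       = "projects/${module.cmn-prj.project_id}"
  display_name = "pii-inspect-template"
  description  = "Shared template used by the demo pipelines."

  inspect_config {
    info_types {
      name = "EMAIL_ADDRESS"
    }
    info_types {
      name = "PHONE_NUMBER"
    }
  }
}
```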
## How to run this script
In order to bring up this example, you will need:
- a folder or organization where new projects will be created
- a billing account that will be associated with the new projects
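For reference, a minimal `terraform.tfvars` for a run that creates new projects could look like the sketch below; every value is a placeholder.

```hcl
# Placeholder values only: point the stage at your own folder, billing
# account and prefix before running terraform init / apply.
prefix = "myco-dp"

project_create = {
  parent             = "folders/123456789012"
  billing_account_id = "ABCDEF-123456-ABCDEF"
}
```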
The DP is meant to be executed by a Service Account (or a regular user) with this minimal set of permissions:
* **Org level**:
* `"compute.organizations.enableXpnResource"`

View File

@ -29,26 +29,39 @@ from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJob
# --------------------------------------------------------------------------------
# Set variables
# --------------------------------------------------------------------------------
LND_GCS = os.environ.get("LND_GCS")
ORC_GCS = os.environ.get("ORC_GCS")
LOD_GCS_STAGING = os.environ.get("LOD_GCS_STAGING")
DTL_L0_BQ_DATASET = os.environ.get("DTL_L0_BQ_DATASET")
# ------------------------------------------------------------
DTL_L0_PRJ = os.environ.get("DTL_L0_PRJ")
DTL_L1_BQ_DATASET = os.environ.get("DTL_L1_BQ_DATASET")
DTL_L0_BQ_DATASET = os.environ.get("DTL_L0_BQ_DATASET")
DTL_L0_GCS = os.environ.get("DTL_L0_GCS")
DTL_L1_PRJ = os.environ.get("DTL_L1_PRJ")
DTL_L2_BQ_DATASET = os.environ.get("DTL_L2_BQ_DATASET")
DTL_L1_BQ_DATASET = os.environ.get("DTL_L1_BQ_DATASET")
DTL_L1_GCS = os.environ.get("DTL_L1_GCS")
DTL_L2_PRJ = os.environ.get("DTL_L2_PRJ")
DTL_L2_BQ_DATASET = os.environ.get("DTL_L2_BQ_DATASET")
DTL_L2_GCS = os.environ.get("DTL_L2_GCS")
DTL_PLG_PRJ = os.environ.get("DTL_PLG_PRJ")
DTL_PLG_BQ_DATASET = os.environ.get("DTL_PLG_BQ_DATASET")
DTL_PLG_GCS = os.environ.get("DTL_PLG_GCS")
GCP_REGION = os.environ.get("GCP_REGION")
LND_PRJ = os.environ.get("LND_PRJ")
LND_BQ = os.environ.get("LND_BQ")
LND_GCS = os.environ.get("LND_GCS")
LND_PS = os.environ.get("LND_PS")
LOD_PRJ = os.environ.get("LOD_PRJ")
DF_ZONE = os.environ.get("GCP_REGION") + "-b"
DF_REGION = BQ_REGION = os.environ.get("GCP_REGION")
LOD_GCS_STAGING = os.environ.get("LOD_GCS_STAGING")
LOD_NET_VPC = os.environ.get("LOD_NET_VPC")
LOD_NET_SUBNET = os.environ.get("LOD_NET_SUBNET")
LOD_SA_DF = os.environ.get("LOD_SA_DF")
ORC_PRJ = os.environ.get("ORC_PRJ")
ORC_GCS = os.environ.get("ORC_GCS")
TRF_PRJ = os.environ.get("TRF_PRJ")
TRF_GCS_STAGING = os.environ.get("TRF_GCS_STAGING")
TRF_NET_VPC = os.environ.get("TRF_NET_VPC")
TRF_NET_SUBNET = os.environ.get("TRF_NET_SUBNET")
TRF_SA_DF = os.environ.get("TRF_SA_DF")
TRF_SA_BQ = os.environ.get("TRF_SA_BQ")
TRF_PRJ = os.environ.get("TRF_PRJ")
DF_ZONE = os.environ.get("GCP_REGION") + "-b"
DF_REGION = BQ_REGION = os.environ.get("GCP_REGION")
# --------------------------------------------------------------------------------
# Set default arguments
@ -77,7 +90,7 @@ default_args = {
'serviceAccountEmail': LOD_SA_DF,
'subnetwork': LOD_NET_SUBNET,
'ipConfiguration': "WORKER_IP_PRIVATE"
},
},
}
# --------------------------------------------------------------------------------
@ -131,9 +144,8 @@ with models.DAG(
location=BQ_REGION,
configuration={
'jobType':'QUERY',
'writeDisposition':'WRITE_TRUNCATE',
'query':{
'query':"""SELECT
'query':"""SELECT
c.id as customer_id,
p.id as purchase_id,
c.name as name,
@ -142,17 +154,18 @@ with models.DAG(
p.price as price,
p.timestamp as timestamp
FROM `{dtl_0_prj}.{dtl_0_dataset}.customers` c
JOIN `{dtl_0_prj}.{dtl_0_dataset}.purchases` p ON c.id = p.customer_id
JOIN `{dtl_0_prj}.{dtl_0_dataset}.purchases` p ON c.id = p.customer_id
""".format(dtl_0_prj=DTL_L0_PRJ, dtl_0_dataset=DTL_L0_BQ_DATASET, ),
'destinationTable':{
'projectId': DTL_L1_PRJ,
'datasetId': DTL_L1_BQ_DATASET,
'tableId': 'customer_purchase'
'tableId': 'customer_purchase'
},
'writeDisposition':'WRITE_TRUNCATE',
"useLegacySql": False
}
},
impersonation_chain=[TRF_SA_BQ]
impersonation_chain=[TRF_SA_BQ]
)
l2_customer_purchase = BigQueryInsertJobOperator(
@ -162,9 +175,8 @@ with models.DAG(
location=BQ_REGION,
configuration={
'jobType':'QUERY',
'writeDisposition':'WRITE_TRUNCATE',
'query':{
'query':"""SELECT
'query':"""SELECT
customer_id,
purchase_id,
name,
@ -177,12 +189,13 @@ with models.DAG(
'destinationTable':{
'projectId': DTL_L2_PRJ,
'datasetId': DTL_L2_BQ_DATASET,
'tableId': 'customer_purchase'
'tableId': 'customer_purchase'
},
'writeDisposition':'WRITE_TRUNCATE',
"useLegacySql": False
}
},
impersonation_chain=[TRF_SA_BQ]
impersonation_chain=[TRF_SA_BQ]
)
start >> [customers_import, purchases_import] >> join_customer_purchase >> l2_customer_purchase >> end

Binary file not shown.

Before: 63 KiB | After: 78 KiB

View File

@ -21,7 +21,7 @@ output "bigquery-datasets" {
dtl-0-bq-0 = module.dtl-0-bq-0.dataset_id,
dtl-1-bq-0 = module.dtl-1-bq-0.dataset_id,
dtl-2-bq-0 = module.dtl-2-bq-0.dataset_id,
dtl-exp-bq-0 = module.dtl-exp-bq-0.dataset_id,
dtl-plg-bq-0 = module.dtl-plg-bq-0.dataset_id,
}
}
@ -31,7 +31,7 @@ output "gcs-buckets" {
dtl-0-cs-0 = module.dtl-0-cs-0.name,
dtl-1-cs-0 = module.dtl-1-cs-0.name,
dtl-2-cs-0 = module.dtl-2-cs-0.name,
dtl-exp-cs-0 = module.dtl-exp-cs-0.name,
dtl-plg-cs-0 = module.dtl-plg-cs-0.name,
lnd-cs-0 = module.lnd-cs-0.name,
lod-cs-df = module.lod-cs-df-0.name,
orc-cs-0 = module.orc-cs-0.name,
@ -47,14 +47,28 @@ output "kms_keys" {
output "projects" {
description = "GCP Projects."
value = {
lnd-prj = module.lnd-prj.project_id,
lod-prj = module.lod-prj.project_id,
orc-prj = module.orc-prj.project_id,
trf-prj = module.trf-prj.project_id,
dtl-0-prj = module.dtl-0-prj.project_id,
dtl-1-prj = module.dtl-1-prj.project_id,
dtl-2-prj = module.dtl-2-prj.project_id,
dtl-exp-prj = module.dtl-exp-prj.project_id,
project_number = {
dtl-0-prj = module.dtl-0-prj.number,
dtl-1-prj = module.dtl-1-prj.number,
dtl-2-prj = module.dtl-2-prj.number,
dtl-plg-prj = module.dtl-plg-prj.number,
exp-prj = module.exp-prj.number,
lnd-prj = module.lnd-prj.number,
lod-prj = module.lod-prj.number,
orc-prj = module.orc-prj.number,
trf-prj = module.trf-prj.number,
}
project_id = {
dtl-0-prj = module.dtl-0-prj.project_id,
dtl-1-prj = module.dtl-1-prj.project_id,
dtl-2-prj = module.dtl-2-prj.project_id,
dtl-plg-prj = module.dtl-plg-prj.project_id,
exp-prj = module.exp-prj.project_id,
lnd-prj = module.lnd-prj.project_id,
lod-prj = module.lod-prj.project_id,
orc-prj = module.orc-prj.project_id,
trf-prj = module.trf-prj.project_id,
}
}
}
@ -72,7 +86,7 @@ output "demo_commands" {
value = {
01 = "gsutil -i ${module.lnd-sa-cs-0.email} cp demo/data/*.csv gs://${module.lnd-cs-0.name}"
02 = "gsutil -i ${module.orc-sa-cmp-0.email} cp demo/data/*.j* gs://${module.orc-cs-0.name}"
03 = "gsutil -i ${module.orc-sa-cmp-0.email} cp demo/gcs2bq.py ${google_composer_environment.orc-cmp-0.config[0].dag_gcs_prefix}/"
03 = "gsutil -i ${module.orc-sa-cmp-0.email} cp demo/*.py ${google_composer_environment.orc-cmp-0.config[0].dag_gcs_prefix}/"
04 = "Open: ${google_composer_environment.orc-cmp-0.config.0.airflow_uri}"
05 = <<EOT
bq query --project_id=${module.dtl-2-prj.project_id} --use_legacy_sql=false 'SELECT * FROM `${module.dtl-2-prj.project_id}.${module.dtl-2-bq-0.dataset_id}.customer_purchase` LIMIT 1000'

View File

@ -97,20 +97,28 @@ variable "project_create" {
variable "project_id" {
description = "Project id, references existing project if `project_create` is null."
type = object({
landing = string
load = string
orchestration = string
trasformation = string
datalake = string
security = string
landing = string
load = string
orchestration = string
trasformation = string
datalake-l0 = string
datalake-l1 = string
datalake-l2 = string
datalake-playground = string
common = string
exposure = string
})
default = {
landing = "lnd"
load = "lod"
orchestration = "orc"
trasformation = "trf"
datalake = "dtl"
security = "sec"
landing = "lnd"
load = "lod"
orchestration = "orc"
trasformation = "trf"
datalake-l0 = "dtl-0"
datalake-l1 = "dtl-1"
datalake-l2 = "dtl-2"
datalake-playground = "dtl-plg"
common = "cmn"
exposure = "exp"
}
}