Improve Minimal Data Platform Blueprint (#1473)

* Add SA to access Curated resources

* Add BQ dataset in the landing project

* Provide an example to move data from landing to curated using the BQ engine

* Improve diagram
This commit is contained in:
lcaggio 2023-06-28 09:05:48 +02:00 committed by GitHub
parent 6fcb010ff2
commit 099ad03910
10 changed files with 196 additions and 74 deletions

View File

@ -16,7 +16,7 @@
locals {
iam_lnd = {
"roles/storage.objectCreator" = [module.land-sa-cs-0.iam_email]
"roles/storage.objectCreator" = [module.land-sa-0.iam_email]
"roles/storage.objectViewer" = [module.processing-sa-cmp-0.iam_email]
"roles/storage.objectAdmin" = [module.processing-sa-0.iam_email]
}
@ -36,6 +36,9 @@ module "land-project" {
iam = var.project_config.billing_account_id != null ? local.iam_lnd : null
iam_additive = var.project_config.billing_account_id == null ? local.iam_lnd : null
services = [
"bigquery.googleapis.com",
"bigqueryreservation.googleapis.com",
"bigquerystorage.googleapis.com",
"cloudkms.googleapis.com",
"cloudresourcemanager.googleapis.com",
"iam.googleapis.com",
@ -52,12 +55,12 @@ module "land-project" {
# Cloud Storage
module "land-sa-cs-0" {
module "land-sa-0" {
source = "../../../modules/iam-service-account"
project_id = module.land-project.project_id
prefix = var.prefix
name = "lnd-cs-0"
display_name = "Data platform GCS landing service account."
name = "lnd-sa-0"
display_name = "Data platform landing zone service account."
iam = {
"roles/iam.serviceAccountTokenCreator" = [
local.groups_iam.data-engineers
@ -75,3 +78,11 @@ module "land-cs-0" {
encryption_key = var.service_encryption_keys.storage
force_destroy = var.data_force_destroy
}
module "land-bq-0" {
source = "../../../modules/bigquery-dataset"
project_id = module.land-project.project_id
id = "${replace(var.prefix, "-", "_")}_lnd_bq_0"
location = var.location
encryption_key = var.service_encryption_keys.bq
}
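
BigQuery dataset ids cannot contain dashes, hence the `replace()` on the prefix above. A minimal sketch, assuming an example prefix `myco-dp`, a landing project id `myco-dp-lnd`, and the `google-cloud-bigquery` client, of deriving and checking the resulting dataset id outside Terraform:

```python
# Hypothetical check that the landing dataset exists; mirrors the Terraform
# id derivation: replace(var.prefix, "-", "_") + "_lnd_bq_0".
from google.cloud import bigquery

prefix = "myco-dp"            # assumed example prefix
land_project = "myco-dp-lnd"  # assumed landing project id
dataset_id = f"{prefix.replace('-', '_')}_lnd_bq_0"  # -> myco_dp_lnd_bq_0

client = bigquery.Client(project=land_project)
dataset = client.get_dataset(f"{land_project}.{dataset_id}")  # raises NotFound if missing
print(dataset.dataset_id, dataset.location)
```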

View File

@ -49,7 +49,7 @@ locals {
}
processing_subnet = (
local.use_shared_vpc
? var.network_config.subnet_self_links.processing_transformation
? var.network_config.subnet_self_link
: module.processing-vpc.0.subnet_self_links["${var.region}/${var.prefix}-processing"]
)
processing_vpc = (
@ -57,8 +57,6 @@ locals {
? var.network_config.network_self_link
: module.processing-vpc.0.self_link
)
}
module "processing-project" {

View File

@ -18,33 +18,41 @@ locals {
cur_iam = {
"roles/bigquery.dataOwner" = [module.processing-sa-0.iam_email]
"roles/bigquery.dataViewer" = [
module.cur-sa-0.iam_email,
local.groups_iam.data-analysts,
local.groups_iam.data-engineers
]
"roles/bigquery.jobUser" = [
module.processing-sa-0.iam_email,
module.processing-sa-0.iam_email, # Remove once bug is fixed. https://github.com/apache/airflow/issues/32106
module.cur-sa-0.iam_email,
local.groups_iam.data-analysts,
local.groups_iam.data-engineers
]
"roles/datacatalog.tagTemplateViewer" = [
local.groups_iam.data-analysts, local.groups_iam.data-engineers
module.cur-sa-0.iam_email,
local.groups_iam.data-analysts,
local.groups_iam.data-engineers
]
"roles/datacatalog.viewer" = [
local.groups_iam.data-analysts, local.groups_iam.data-engineers
module.cur-sa-0.iam_email,
local.groups_iam.data-analysts,
local.groups_iam.data-engineers
]
"roles/storage.objectViewer" = [
local.groups_iam.data-analysts, local.groups_iam.data-engineers
module.cur-sa-0.iam_email,
local.groups_iam.data-analysts,
local.groups_iam.data-engineers
]
"roles/storage.objectAdmin" = [module.processing-sa-0.iam_email]
}
cur_services = [
"iam.googleapis.com",
"bigquery.googleapis.com",
"bigqueryreservation.googleapis.com",
"bigquerystorage.googleapis.com",
"cloudkms.googleapis.com",
"cloudresourcemanager.googleapis.com",
"compute.googleapis.com",
"iam.googleapis.com",
"servicenetworking.googleapis.com",
"serviceusage.googleapis.com",
"stackdriver.googleapis.com",
@ -75,6 +83,19 @@ module "cur-project" {
}
}
module "cur-sa-0" {
source = "../../../modules/iam-service-account"
project_id = module.cur-project.project_id
prefix = var.prefix
name = "cur-sa-0"
display_name = "Data platform curated zone service account."
iam = {
"roles/iam.serviceAccountTokenCreator" = [
local.groups_iam.data-engineers
]
}
}
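
`cur-sa-0` only receives read roles on the curated project, with `roles/iam.serviceAccountTokenCreator` letting data engineers impersonate it. A hedged sketch, assuming a caller with token-creator rights and illustrative project, dataset, and service-account names, of querying curated data as this service account:

```python
# Hypothetical impersonation of the curated SA to run a read-only query.
# The SA email, project id, and table name below are assumptions for illustration.
import google.auth
from google.auth import impersonated_credentials
from google.cloud import bigquery

source_creds, _ = google.auth.default()
target_sa = "myco-dp-cur-sa-0@myco-dp-cur.iam.gserviceaccount.com"  # assumed email

creds = impersonated_credentials.Credentials(
    source_credentials=source_creds,
    target_principal=target_sa,
    target_scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client = bigquery.Client(project="myco-dp-cur", credentials=creds)
rows = client.query(
    "SELECT COUNT(*) AS n FROM `myco-dp-cur.myco_dp_cur_bq_0.customers`"
).result()
for row in rows:
    print(row.n)
```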
# Bigquery
module "cur-bq-0" {

View File

@ -9,7 +9,7 @@ Legend: <code>+</code> additive, <code>•</code> conditional.
|<b>gcp-data-analysts</b><br><small><i>group</i></small>|[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) |
|<b>gcp-data-engineers</b><br><small><i>group</i></small>|[roles/dlp.estimatesAdmin](https://cloud.google.com/iam/docs/understanding-roles#dlp.estimatesAdmin) <br>[roles/dlp.reader](https://cloud.google.com/iam/docs/understanding-roles#dlp.reader) <br>[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) |
|<b>gcp-data-security</b><br><small><i>group</i></small>|[roles/datacatalog.admin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.admin) <br>[roles/dlp.admin](https://cloud.google.com/iam/docs/understanding-roles#dlp.admin) |
|<b>prc-dp-0</b><br><small><i>serviceAccount</i></small>|[roles/datacatalog.categoryFineGrainedReader](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryFineGrainedReader) <br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) <br>[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) |
|<b>prc-0</b><br><small><i>serviceAccount</i></small>|[roles/datacatalog.categoryFineGrainedReader](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryFineGrainedReader) <br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) <br>[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) |
## Project <i>cur</i>
@ -18,15 +18,16 @@ Legend: <code>+</code> additive, <code>•</code> conditional.
|<b>gcp-data-analysts</b><br><small><i>group</i></small>|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer) <br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
|<b>gcp-data-engineers</b><br><small><i>group</i></small>|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer) <br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
|<b>SERVICE_IDENTITY_service-networking</b><br><small><i>serviceAccount</i></small>|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) <code>+</code>|
|<b>prc-dp-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
|<b>cur-sa-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer) <br>[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
|<b>prc-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
## Project <i>lnd</i>
| members | roles |
|---|---|
|<b>lnd-cs-0</b><br><small><i>serviceAccount</i></small>|[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) |
|<b>lnd-sa-0</b><br><small><i>serviceAccount</i></small>|[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) |
|<b>prc-0</b><br><small><i>serviceAccount</i></small>|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
|<b>prc-cmp-0</b><br><small><i>serviceAccount</i></small>|[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
|<b>prc-dp-0</b><br><small><i>serviceAccount</i></small>|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
## Project <i>prc</i>
@ -35,5 +36,5 @@ Legend: <code>+</code> additive, <code>•</code> conditional.
|<b>gcp-data-engineers</b><br><small><i>group</i></small>|[roles/composer.admin](https://cloud.google.com/iam/docs/understanding-roles#composer.admin) <br>[roles/composer.environmentAndStorageObjectAdmin](https://cloud.google.com/iam/docs/understanding-roles#composer.environmentAndStorageObjectAdmin) <br>[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser) <br>[roles/iap.httpsResourceAccessor](https://cloud.google.com/iam/docs/understanding-roles#iap.httpsResourceAccessor) <br>[roles/serviceusage.serviceUsageConsumer](https://cloud.google.com/iam/docs/understanding-roles#serviceusage.serviceUsageConsumer) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
|<b>SERVICE_IDENTITY_cloudcomposer-accounts</b><br><small><i>serviceAccount</i></small>|[roles/composer.ServiceAgentV2Ext](https://cloud.google.com/iam/docs/understanding-roles#composer.ServiceAgentV2Ext) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
|<b>SERVICE_IDENTITY_service-networking</b><br><small><i>serviceAccount</i></small>|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) <code>+</code>|
|<b>prc-cmp-0</b><br><small><i>serviceAccount</i></small>|[roles/composer.worker](https://cloud.google.com/iam/docs/understanding-roles#composer.worker) <br>[roles/dataproc.editor](https://cloud.google.com/iam/docs/understanding-roles#dataproc.editor) <br>[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |
|<b>prc-dp-0</b><br><small><i>serviceAccount</i></small>|[roles/dataproc.worker](https://cloud.google.com/iam/docs/understanding-roles#dataproc.worker) |
|<b>prc-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/dataflow.worker](https://cloud.google.com/iam/docs/understanding-roles#dataflow.worker) <br>[roles/dataproc.worker](https://cloud.google.com/iam/docs/understanding-roles#dataproc.worker) |
|<b>prc-cmp-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/composer.worker](https://cloud.google.com/iam/docs/understanding-roles#composer.worker) <br>[roles/dataflow.admin](https://cloud.google.com/iam/docs/understanding-roles#dataflow.admin) <br>[roles/dataproc.editor](https://cloud.google.com/iam/docs/understanding-roles#dataproc.editor) <br>[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) |

View File

@ -203,7 +203,7 @@ module "data-platform" {
prefix = "myprefix"
}
# tftest modules=21 resources=116
# tftest modules=23 resources=123
```
## Customizations
@ -229,10 +229,7 @@ To configure the use of a shared VPC, configure the `network_config`, example:
network_config = {
host_project = "PROJECT_ID"
network_self_link = "https://www.googleapis.com/compute/v1/projects/PROJECT_ID/global/networks/NAME"
subnet_self_links = {
processing_transformation = "https://www.googleapis.com/compute/v1/projects/PROJECT_ID/regions/REGION/subnetworks/NAME"
processing_composer = "https://www.googleapis.com/compute/v1/projects/PROJECT_ID/regions/REGION/subnetworks/NAME"
}
subnet_self_link = "https://www.googleapis.com/compute/v1/projects/PROJECT_ID/regions/REGION/subnetworks/NAME"
composer_ip_ranges = {
cloudsql = "192.168.XXX.XXX/24"
gke_master = "192.168.XXX.XXX/28"
@ -280,32 +277,31 @@ The application layer is out of scope of this script. As a demo purpose only, on
| name | description | type | required | default |
|---|---|:---:|:---:|:---:|
| [organization_domain](variables.tf#L122) | Organization domain. | <code>string</code> | ✓ | |
| [prefix](variables.tf#L127) | Prefix used for resource names. | <code>string</code> | ✓ | |
| [project_config](variables.tf#L136) | Provide 'billing_account_id' value if project creation is needed, uses existing 'project_ids' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format. | <code title="object&#40;&#123;&#10; billing_account_id &#61; optional&#40;string, null&#41;&#10; parent &#61; string&#10; project_ids &#61; optional&#40;object&#40;&#123;&#10; landing &#61; string&#10; processing &#61; string&#10; curated &#61; string&#10; common &#61; string&#10; &#125;&#41;, &#123;&#10; landing &#61; &#34;lnd&#34;&#10; processing &#61; &#34;prc&#34;&#10; curated &#61; &#34;cur&#34;&#10; common &#61; &#34;cmn&#34;&#10; &#125;&#10; &#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | ✓ | |
| [organization_domain](variables.tf#L119) | Organization domain. | <code>string</code> | ✓ | |
| [prefix](variables.tf#L124) | Prefix used for resource names. | <code>string</code> | ✓ | |
| [project_config](variables.tf#L133) | Provide 'billing_account_id' value if project creation is needed, uses existing 'project_ids' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format. | <code title="object&#40;&#123;&#10; billing_account_id &#61; optional&#40;string, null&#41;&#10; parent &#61; string&#10; project_ids &#61; optional&#40;object&#40;&#123;&#10; landing &#61; string&#10; processing &#61; string&#10; curated &#61; string&#10; common &#61; string&#10; &#125;&#41;, &#123;&#10; landing &#61; &#34;lnd&#34;&#10; processing &#61; &#34;prc&#34;&#10; curated &#61; &#34;cur&#34;&#10; common &#61; &#34;cmn&#34;&#10; &#125;&#10; &#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | ✓ | |
| [composer_config](variables.tf#L17) | Cloud Composer config. | <code title="object&#40;&#123;&#10; environment_size &#61; optional&#40;string, &#34;ENVIRONMENT_SIZE_SMALL&#34;&#41;&#10; software_config &#61; optional&#40;object&#40;&#123;&#10; airflow_config_overrides &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; pypi_packages &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; env_variables &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; image_version &#61; optional&#40;string, &#34;composer-2-airflow-2&#34;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10; workloads_config &#61; optional&#40;object&#40;&#123;&#10; scheduler &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; count &#61; optional&#40;number, 1&#41;&#10; &#125;&#10; &#41;, &#123;&#125;&#41;&#10; web_server &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10; worker &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; min_count &#61; optional&#40;number, 1&#41;&#10; max_count &#61; optional&#40;number, 3&#41;&#10; &#125;&#10; &#41;, &#123;&#125;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [data_catalog_tags](variables.tf#L54) | List of Data Catalog Policy tags to be created with optional IAM binging configuration in {tag => {ROLE => [MEMBERS]}} format. | <code>map&#40;map&#40;list&#40;string&#41;&#41;&#41;</code> | | <code title="&#123;&#10; &#34;3_Confidential&#34; &#61; null&#10; &#34;2_Private&#34; &#61; null&#10; &#34;1_Sensitive&#34; &#61; null&#10;&#125;">&#123;&#8230;&#125;</code> |
| [data_force_destroy](variables.tf#L65) | Flag to set 'force_destroy' on data services like BigQuery or Cloud Storage. | <code>bool</code> | | <code>false</code> |
| [enable_services](variables.tf#L71) | Flag to enable or disable services in the Data Platform. | <code title="object&#40;&#123;&#10; composer &#61; optional&#40;bool, true&#41;&#10; dataproc_history_server &#61; optional&#40;bool, true&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [groups](variables.tf#L80) | User groups. | <code>map&#40;string&#41;</code> | | <code title="&#123;&#10; data-analysts &#61; &#34;gcp-data-analysts&#34;&#10; data-engineers &#61; &#34;gcp-data-engineers&#34;&#10; data-security &#61; &#34;gcp-data-security&#34;&#10;&#125;">&#123;&#8230;&#125;</code> |
| [location](variables.tf#L90) | Location used for multi-regional resources. | <code>string</code> | | <code>&#34;eu&#34;</code> |
| [network_config](variables.tf#L96) | Shared VPC network configurations to use. If null networks will be created in projects. | <code title="object&#40;&#123;&#10; host_project &#61; optional&#40;string&#41;&#10; network_self_link &#61; optional&#40;string&#41;&#10; subnet_self_links &#61; optional&#40;object&#40;&#123;&#10; processing_transformation &#61; string&#10; processing_composer &#61; string&#10; &#125;&#41;, null&#41;&#10; composer_ip_ranges &#61; optional&#40;object&#40;&#123;&#10; connection_subnetwork &#61; optional&#40;string&#41;&#10; cloud_sql &#61; optional&#40;string, &#34;10.20.10.0&#47;24&#34;&#41;&#10; gke_master &#61; optional&#40;string, &#34;10.20.11.0&#47;28&#34;&#41;&#10; pods_range_name &#61; optional&#40;string, &#34;pods&#34;&#41;&#10; services_range_name &#61; optional&#40;string, &#34;services&#34;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [project_suffix](variables.tf#L160) | Suffix used only for project ids. | <code>string</code> | | <code>null</code> |
| [region](variables.tf#L166) | Region used for regional resources. | <code>string</code> | | <code>&#34;europe-west1&#34;</code> |
| [service_encryption_keys](variables.tf#L172) | Cloud KMS to use to encrypt different services. Key location should match service region. | <code title="object&#40;&#123;&#10; bq &#61; optional&#40;string&#41;&#10; composer &#61; optional&#40;string&#41;&#10; compute &#61; optional&#40;string&#41;&#10; storage &#61; optional&#40;string&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [network_config](variables.tf#L96) | Shared VPC network configurations to use. If null networks will be created in projects. | <code title="object&#40;&#123;&#10; host_project &#61; optional&#40;string&#41;&#10; network_self_link &#61; optional&#40;string&#41;&#10; subnet_self_link &#61; optional&#40;string&#41;&#10; composer_ip_ranges &#61; optional&#40;object&#40;&#123;&#10; connection_subnetwork &#61; optional&#40;string&#41;&#10; cloud_sql &#61; optional&#40;string, &#34;10.20.10.0&#47;24&#34;&#41;&#10; gke_master &#61; optional&#40;string, &#34;10.20.11.0&#47;28&#34;&#41;&#10; pods_range_name &#61; optional&#40;string, &#34;pods&#34;&#41;&#10; services_range_name &#61; optional&#40;string, &#34;services&#34;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [project_suffix](variables.tf#L157) | Suffix used only for project ids. | <code>string</code> | | <code>null</code> |
| [region](variables.tf#L163) | Region used for regional resources. | <code>string</code> | | <code>&#34;europe-west1&#34;</code> |
| [service_encryption_keys](variables.tf#L169) | Cloud KMS to use to encrypt different services. Key location should match service region. | <code title="object&#40;&#123;&#10; bq &#61; optional&#40;string&#41;&#10; composer &#61; optional&#40;string&#41;&#10; compute &#61; optional&#40;string&#41;&#10; storage &#61; optional&#40;string&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
## Outputs
| name | description | sensitive |
|---|---|:---:|
| [bigquery-datasets](outputs.tf#L17) | BigQuery datasets. | |
| [composer](outputs.tf#L24) | Composer variables. | |
| [dataproc-history-server](outputs.tf#L31) | List of bucket names which have been assigned to the cluster. | |
| [gcs_buckets](outputs.tf#L36) | GCS buckets. | |
| [kms_keys](outputs.tf#L46) | Cloud KMS keys. | |
| [projects](outputs.tf#L51) | GCP projects information. | |
| [service_accounts](outputs.tf#L69) | Service accounts created. | |
| [vpc_network](outputs.tf#L78) | VPC network. | |
| [vpc_subnet](outputs.tf#L86) | VPC subnetworks. | |
| [composer](outputs.tf#L25) | Composer variables. | |
| [dataproc-history-server](outputs.tf#L33) | List of bucket names which have been assigned to the cluster. | |
| [gcs_buckets](outputs.tf#L38) | GCS buckets. | |
| [kms_keys](outputs.tf#L47) | Cloud KMS keys. | |
| [network](outputs.tf#L52) | VPC network. | |
| [projects](outputs.tf#L60) | GCP projects information. | |
| [service_accounts](outputs.tf#L78) | Service accounts created. | |
<!-- END TFDOC -->

View File

@ -47,7 +47,7 @@ Below you can find computed commands to perform steps.
```bash
terraform output -json | jq -r '@sh "export LND_SA=\(.service_accounts.value.landing)\nexport PRC_SA=\(.service_accounts.value.processing)\nexport CMP_SA=\(.service_accounts.value.composer)"' > env.sh
terraform output -json | jq -r '@sh "export LND_GCS=\(.gcs_buckets.value.landing_cs_0)\nexport PRC_GCS=\(.gcs_buckets.value.processing_cs_0)\nexport CMP_GCS=\(.gcs_buckets.value.composer)"' >> env.sh
terraform output -json | jq -r '@sh "export LND_GCS=\(.gcs_buckets.value.landing)\nexport PRC_GCS=\(.gcs_buckets.value.processing)\nexport CUR_GCS=\(.gcs_buckets.value.curated)\nexport CMP_GCS=\(.composer.value.dag_bucket)"' >> env.sh
source ./env.sh
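
To run the new landing-to-curated example end to end, the landing bucket needs a `customers.csv` to pick up. A hedged sketch, assuming `LND_GCS` from `env.sh` holds a `gs://` URI, a local `customers.csv`, and the `google-cloud-storage` client, of staging the test file:

```python
# Hypothetical helper to drop a demo customers.csv into the landing bucket.
# LND_GCS is read from the environment populated by env.sh.
import os
from google.cloud import storage

land_uri = os.environ["LND_GCS"]  # e.g. gs://myco-dp-lnd-cs-0 (assumed value)
bucket_name = land_uri.removeprefix("gs://").rstrip("/")

client = storage.Client()
bucket = client.bucket(bucket_name)
bucket.blob("customers.csv").upload_from_filename("customers.csv")
print(f"uploaded customers.csv to gs://{bucket_name}")
```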

View File

@ -0,0 +1,104 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------------------------------
# Load The Dependencies
# --------------------------------------------------------------------------------
import csv
import datetime
import io
import json
import logging
import os
from airflow import models
from airflow.operators import dummy
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator
# --------------------------------------------------------------------------------
# Set variables - Needed for the DEMO
# --------------------------------------------------------------------------------
BQ_LOCATION = os.environ.get("BQ_LOCATION")
CURATED_PRJ = os.environ.get("CURATED_PRJ")
CURATED_BQ_DATASET = os.environ.get("CURATED_BQ_DATASET")
CURATED_GCS = os.environ.get("CURATED_GCS")
LAND_PRJ = os.environ.get("LAND_PRJ")
LAND_GCS = os.environ.get("LAND_GCS")
PROCESSING_GCS = os.environ.get("PROCESSING_GCS")
PROCESSING_SA = os.environ.get("PROCESSING_SA")
PROCESSING_PRJ = os.environ.get("PROCESSING_PRJ")
PROCESSING_SUBNET = os.environ.get("PROCESSING_SUBNET")
PROCESSING_VPC = os.environ.get("PROCESSING_VPC")
DP_KMS_KEY = os.environ.get("DP_KMS_KEY", "")
DP_REGION = os.environ.get("DP_REGION")
DP_ZONE = os.environ.get("DP_REGION") + "-b"
# --------------------------------------------------------------------------------
# Set default arguments
# --------------------------------------------------------------------------------
# If you are running Airflow in more than one time zone
# see https://airflow.apache.org/docs/apache-airflow/stable/timezone.html
# for best practices
yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
default_args = {
'owner': 'airflow',
'start_date': yesterday,
'depends_on_past': False,
'email': [''],
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': datetime.timedelta(minutes=5),
}
# --------------------------------------------------------------------------------
# Main DAG
# --------------------------------------------------------------------------------
with models.DAG(
'bq_gcs2bq',
default_args=default_args,
schedule_interval=None) as dag:
start = dummy.DummyOperator(
task_id='start',
trigger_rule='all_success'
)
end = dummy.DummyOperator(
task_id='end',
trigger_rule='all_success'
)
# BigQuery tables are automatically created for demo purposes.
# Consider a dedicated pipeline or tool for a real-life scenario.
customers_import = GCSToBigQueryOperator(
task_id='csv_to_bigquery',
bucket=LAND_GCS[5:],  # strip the leading 'gs://' from the bucket URI
source_objects=['customers.csv'],
destination_project_dataset_table='{}:{}.{}'.format(CURATED_PRJ, CURATED_BQ_DATASET, 'customers'),
create_disposition='CREATE_IF_NEEDED',
write_disposition='WRITE_APPEND',
schema_update_options=['ALLOW_FIELD_RELAXATION', 'ALLOW_FIELD_ADDITION'],
schema_object="customers.json",
schema_object_bucket=PROCESSING_GCS[5:],
project_id=PROCESSING_PRJ, # The load job keeps running in the dataset project until the Airflow bug is fixed. https://github.com/apache/airflow/issues/32106
impersonation_chain=[PROCESSING_SA]
)
start >> customers_import >> end
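
For reference, `GCSToBigQueryOperator` ultimately submits a plain BigQuery load job. Below is a minimal standalone sketch of the same landing-to-curated load without Airflow, using schema autodetection instead of the stored schema file and assumed project, dataset, and bucket names:

```python
# Hypothetical equivalent of the csv_to_bigquery task, run outside Airflow.
# Project, dataset, and bucket names are assumptions for illustration.
from google.cloud import bigquery

client = bigquery.Client(project="myco-dp-prc")  # processing project runs the job

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_APPEND",
    schema_update_options=[
        bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
        bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
    ],
    autodetect=True,  # the DAG instead reads customers.json from the processing bucket
)

load_job = client.load_table_from_uri(
    "gs://myco-dp-lnd-cs-0/customers.csv",        # assumed landing bucket
    "myco-dp-cur.myco_dp_cur_bq_0.customers",     # assumed curated table
    job_config=job_config,
)
load_job.result()  # wait for completion
print(f"loaded {load_job.output_rows} rows")
```

In the blueprint the same step is meant to be scheduled by Composer and executed via `impersonation_chain=[PROCESSING_SA]`, rather than run ad hoc as above.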

Binary file not shown: architecture diagram image updated (102 KiB → 174 KiB).

View File

@ -17,7 +17,8 @@
output "bigquery-datasets" {
description = "BigQuery datasets."
value = {
curated = module.cur-bq-0.dataset_id,
curated = module.cur-bq-0.dataset_id
landing = module.land-bq-0.dataset_id
}
}
@ -25,6 +26,7 @@ output "composer" {
description = "Composer variables."
value = {
air_flow_uri = try(google_composer_environment.processing-cmp-0[0].config.0.airflow_uri, null)
dag_bucket = try(google_composer_environment.processing-cmp-0[0].config[0].dag_gcs_prefix, null)
}
}
@ -36,10 +38,9 @@ output "dataproc-history-server" {
output "gcs_buckets" {
description = "GCS buckets."
value = {
landing_cs_0 = module.land-cs-0.name,
processing_cs_0 = module.processing-cs-0.name,
cur_cs_0 = module.cur-cs-0.name,
composer = try(google_composer_environment.processing-cmp-0[0].config[0].dag_gcs_prefix, null)
curated = module.cur-cs-0.name
landing = module.land-cs-0.name
processing = module.processing-cs-0.name
}
}
@ -48,20 +49,28 @@ output "kms_keys" {
value = var.service_encryption_keys
}
output "network" {
description = "VPC network."
value = {
processing_subnet = local.processing_subnet
processing_vpc = local.processing_vpc
}
}
output "projects" {
description = "GCP Projects informations."
value = {
project_number = {
landing = module.land-project.number,
common = module.common-project.number,
curated = module.cur-project.number,
processing = module.processing-project.number,
common = module.common-project.number
curated = module.cur-project.number
landing = module.land-project.number
processing = module.processing-project.number
}
project_id = {
landing = module.land-project.project_id,
common = module.common-project.project_id,
curated = module.cur-project.project_id,
processing = module.processing-project.project_id,
common = module.common-project.project_id
curated = module.cur-project.project_id
landing = module.land-project.project_id
processing = module.processing-project.project_id
}
}
}
@ -69,24 +78,9 @@ output "projects" {
output "service_accounts" {
description = "Service account created."
value = {
landing = module.land-sa-cs-0.email
processing = module.processing-sa-0.email
composer = module.processing-sa-cmp-0.email
}
}
output "vpc_network" {
description = "VPC network."
value = {
processing_transformation = local.processing_vpc
processing_composer = local.processing_vpc
}
}
output "vpc_subnet" {
description = "VPC subnetworks."
value = {
processing_transformation = local.processing_subnet
processing_composer = local.processing_subnet
curated = module.cur-sa-0.email,
landing = module.land-sa-0.email,
processing = module.processing-sa-0.email,
}
}

View File

@ -98,10 +98,7 @@ variable "network_config" {
type = object({
host_project = optional(string)
network_self_link = optional(string)
subnet_self_links = optional(object({
processing_transformation = string
processing_composer = string
}), null)
subnet_self_link = optional(string)
composer_ip_ranges = optional(object({
connection_subnetwork = optional(string)
cloud_sql = optional(string, "10.20.10.0/24")