Add lineage on Minimal Data Platform blueprint (#1679)

Add lineage on Minimal Data Platform blueprint
This commit is contained in:
lcaggio 2023-09-14 17:52:19 +02:00 committed by GitHub
parent 96a3bc3737
commit 79723f9ce1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 32 additions and 26 deletions

View File

@ -64,6 +64,7 @@ module "land-project" {
"bigquerystorage.googleapis.com",
"cloudkms.googleapis.com",
"cloudresourcemanager.googleapis.com",
"datalineage.googleapis.com",
"iam.googleapis.com",
"serviceusage.googleapis.com",
"stackdriver.googleapis.com",

View File

@ -51,16 +51,20 @@ module "processing-sa-cmp-0" {
}
resource "google_composer_environment" "processing-cmp-0" {
count = var.enable_services.composer == true ? 1 : 0
project = module.processing-project.project_id
name = "${var.prefix}-prc-cmp-0"
region = var.region
count = var.enable_services.composer == true ? 1 : 0
provider = google-beta
project = module.processing-project.project_id
name = "${var.prefix}-prc-cmp-0"
region = var.region
config {
software_config {
airflow_config_overrides = var.composer_config.software_config.airflow_config_overrides
pypi_packages = var.composer_config.software_config.pypi_packages
env_variables = local.env_variables
image_version = var.composer_config.software_config.image_version
cloud_data_lineage_integration {
enabled = var.composer_config.software_config.cloud_data_lineage_integration
}
}
workloads_config {
scheduler {

View File

@ -118,6 +118,7 @@ module "processing-project" {
"compute.googleapis.com",
"container.googleapis.com",
"dataflow.googleapis.com",
"datalineage.googleapis.com",
"dataproc.googleapis.com",
"iam.googleapis.com",
"servicenetworking.googleapis.com",

View File

@ -22,6 +22,7 @@ locals {
"cloudkms.googleapis.com",
"cloudresourcemanager.googleapis.com",
"compute.googleapis.com",
"datalineage.googleapis.com",
"iam.googleapis.com",
"servicenetworking.googleapis.com",
"serviceusage.googleapis.com",

View File

@ -229,7 +229,7 @@ module "data-platform" {
prefix = "myprefix"
}
# tftest modules=23 resources=135
# tftest modules=23 resources=138
```
## Customizations
@ -302,19 +302,19 @@ The application layer is out of scope of this script. As a demo purpuse only, on
| name | description | type | required | default |
|---|---|:---:|:---:|:---:|
| [organization_domain](variables.tf#L122) | Organization domain. | <code>string</code> | ✓ | |
| [prefix](variables.tf#L127) | Prefix used for resource names. | <code>string</code> | ✓ | |
| [project_config](variables.tf#L136) | Provide 'billing_account_id' value if project creation is needed, uses existing 'project_ids' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format. | <code title="object&#40;&#123;&#10; billing_account_id &#61; optional&#40;string, null&#41;&#10; parent &#61; string&#10; project_ids &#61; optional&#40;object&#40;&#123;&#10; landing &#61; string&#10; processing &#61; string&#10; curated &#61; string&#10; common &#61; string&#10; &#125;&#41;, &#123;&#10; landing &#61; &#34;lnd&#34;&#10; processing &#61; &#34;prc&#34;&#10; curated &#61; &#34;cur&#34;&#10; common &#61; &#34;cmn&#34;&#10; &#125;&#10; &#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | ✓ | |
| [composer_config](variables.tf#L17) | Cloud Composer config. | <code title="object&#40;&#123;&#10; environment_size &#61; optional&#40;string, &#34;ENVIRONMENT_SIZE_SMALL&#34;&#41;&#10; software_config &#61; optional&#40;object&#40;&#123;&#10; airflow_config_overrides &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; pypi_packages &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; env_variables &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; image_version &#61; optional&#40;string, &#34;composer-2-airflow-2&#34;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10; web_server_access_control &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; workloads_config &#61; optional&#40;object&#40;&#123;&#10; scheduler &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; count &#61; optional&#40;number, 1&#41;&#10; &#125;&#10; &#41;, &#123;&#125;&#41;&#10; web_server &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10; worker &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; min_count &#61; optional&#40;number, 1&#41;&#10; max_count &#61; optional&#40;number, 3&#41;&#10; &#125;&#10; &#41;, &#123;&#125;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [data_catalog_tags](variables.tf#L55) | List of Data Catalog Policy tags to be created with optional IAM binging configuration in {tag => {ROLE => [MEMBERS]}} format. | <code title="map&#40;object&#40;&#123;&#10; description &#61; optional&#40;string&#41;&#10; iam &#61; optional&#40;map&#40;list&#40;string&#41;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;&#41;">map&#40;object&#40;&#123;&#8230;&#125;&#41;&#41;</code> | | <code title="&#123;&#10; &#34;3_Confidential&#34; &#61; &#123;&#125;&#10; &#34;2_Private&#34; &#61; &#123;&#125;&#10; &#34;1_Sensitive&#34; &#61; &#123;&#125;&#10;&#125;">&#123;&#8230;&#125;</code> |
| [data_force_destroy](variables.tf#L69) | Flag to set 'force_destroy' on data services like BiguQery or Cloud Storage. | <code>bool</code> | | <code>false</code> |
| [enable_services](variables.tf#L75) | Flag to enable or disable services in the Data Platform. | <code title="object&#40;&#123;&#10; composer &#61; optional&#40;bool, true&#41;&#10; dataproc_history_server &#61; optional&#40;bool, true&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [groups](variables.tf#L84) | User groups. | <code>map&#40;string&#41;</code> | | <code title="&#123;&#10; data-analysts &#61; &#34;gcp-data-analysts&#34;&#10; data-engineers &#61; &#34;gcp-data-engineers&#34;&#10; data-security &#61; &#34;gcp-data-security&#34;&#10;&#125;">&#123;&#8230;&#125;</code> |
| [location](variables.tf#L94) | Location used for multi-regional resources. | <code>string</code> | | <code>&#34;eu&#34;</code> |
| [network_config](variables.tf#L100) | Shared VPC network configurations to use. If null networks will be created in projects. | <code title="object&#40;&#123;&#10; host_project &#61; optional&#40;string&#41;&#10; network_self_link &#61; optional&#40;string&#41;&#10; subnet_self_link &#61; optional&#40;string&#41;&#10; composer_ip_ranges &#61; optional&#40;object&#40;&#123;&#10; connection_subnetwork &#61; optional&#40;string&#41;&#10; cloud_sql &#61; optional&#40;string, &#34;10.20.10.0&#47;24&#34;&#41;&#10; gke_master &#61; optional&#40;string, &#34;10.20.11.0&#47;28&#34;&#41;&#10; pods_range_name &#61; optional&#40;string, &#34;pods&#34;&#41;&#10; services_range_name &#61; optional&#40;string, &#34;services&#34;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [project_suffix](variables.tf#L160) | Suffix used only for project ids. | <code>string</code> | | <code>null</code> |
| [region](variables.tf#L166) | Region used for regional resources. | <code>string</code> | | <code>&#34;europe-west1&#34;</code> |
| [service_encryption_keys](variables.tf#L172) | Cloud KMS to use to encrypt different services. Key location should match service region. | <code title="object&#40;&#123;&#10; bq &#61; optional&#40;string&#41;&#10; composer &#61; optional&#40;string&#41;&#10; compute &#61; optional&#40;string&#41;&#10; storage &#61; optional&#40;string&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [organization_domain](variables.tf#L123) | Organization domain. | <code>string</code> | ✓ | |
| [prefix](variables.tf#L128) | Prefix used for resource names. | <code>string</code> | ✓ | |
| [project_config](variables.tf#L137) | Provide 'billing_account_id' value if project creation is needed, uses existing 'project_ids' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format. | <code title="object&#40;&#123;&#10; billing_account_id &#61; optional&#40;string, null&#41;&#10; parent &#61; string&#10; project_ids &#61; optional&#40;object&#40;&#123;&#10; landing &#61; string&#10; processing &#61; string&#10; curated &#61; string&#10; common &#61; string&#10; &#125;&#41;, &#123;&#10; landing &#61; &#34;lnd&#34;&#10; processing &#61; &#34;prc&#34;&#10; curated &#61; &#34;cur&#34;&#10; common &#61; &#34;cmn&#34;&#10; &#125;&#10; &#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | ✓ | |
| [composer_config](variables.tf#L17) | Cloud Composer config. | <code title="object&#40;&#123;&#10; environment_size &#61; optional&#40;string, &#34;ENVIRONMENT_SIZE_SMALL&#34;&#41;&#10; software_config &#61; optional&#40;object&#40;&#123;&#10; airflow_config_overrides &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; pypi_packages &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; env_variables &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; image_version &#61; optional&#40;string, &#34;composer-2-airflow-2&#34;&#41;&#10; cloud_data_lineage_integration &#61; optional&#40;bool, true&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10; web_server_access_control &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; workloads_config &#61; optional&#40;object&#40;&#123;&#10; scheduler &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; count &#61; optional&#40;number, 1&#41;&#10; &#125;&#10; &#41;, &#123;&#125;&#41;&#10; web_server &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10; worker &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; min_count &#61; optional&#40;number, 1&#41;&#10; max_count &#61; optional&#40;number, 3&#41;&#10; &#125;&#10; &#41;, &#123;&#125;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [data_catalog_tags](variables.tf#L56) | List of Data Catalog Policy tags to be created with optional IAM binging configuration in {tag => {ROLE => [MEMBERS]}} format. | <code title="map&#40;object&#40;&#123;&#10; description &#61; optional&#40;string&#41;&#10; iam &#61; optional&#40;map&#40;list&#40;string&#41;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;&#41;">map&#40;object&#40;&#123;&#8230;&#125;&#41;&#41;</code> | | <code title="&#123;&#10; &#34;3_Confidential&#34; &#61; &#123;&#125;&#10; &#34;2_Private&#34; &#61; &#123;&#125;&#10; &#34;1_Sensitive&#34; &#61; &#123;&#125;&#10;&#125;">&#123;&#8230;&#125;</code> |
| [data_force_destroy](variables.tf#L70) | Flag to set 'force_destroy' on data services like BiguQery or Cloud Storage. | <code>bool</code> | | <code>false</code> |
| [enable_services](variables.tf#L76) | Flag to enable or disable services in the Data Platform. | <code title="object&#40;&#123;&#10; composer &#61; optional&#40;bool, true&#41;&#10; dataproc_history_server &#61; optional&#40;bool, true&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [groups](variables.tf#L85) | User groups. | <code>map&#40;string&#41;</code> | | <code title="&#123;&#10; data-analysts &#61; &#34;gcp-data-analysts&#34;&#10; data-engineers &#61; &#34;gcp-data-engineers&#34;&#10; data-security &#61; &#34;gcp-data-security&#34;&#10;&#125;">&#123;&#8230;&#125;</code> |
| [location](variables.tf#L95) | Location used for multi-regional resources. | <code>string</code> | | <code>&#34;eu&#34;</code> |
| [network_config](variables.tf#L101) | Shared VPC network configurations to use. If null networks will be created in projects. | <code title="object&#40;&#123;&#10; host_project &#61; optional&#40;string&#41;&#10; network_self_link &#61; optional&#40;string&#41;&#10; subnet_self_link &#61; optional&#40;string&#41;&#10; composer_ip_ranges &#61; optional&#40;object&#40;&#123;&#10; connection_subnetwork &#61; optional&#40;string&#41;&#10; cloud_sql &#61; optional&#40;string, &#34;10.20.10.0&#47;24&#34;&#41;&#10; gke_master &#61; optional&#40;string, &#34;10.20.11.0&#47;28&#34;&#41;&#10; pods_range_name &#61; optional&#40;string, &#34;pods&#34;&#41;&#10; services_range_name &#61; optional&#40;string, &#34;services&#34;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [project_suffix](variables.tf#L161) | Suffix used only for project ids. | <code>string</code> | | <code>null</code> |
| [region](variables.tf#L167) | Region used for regional resources. | <code>string</code> | | <code>&#34;europe-west1&#34;</code> |
| [service_encryption_keys](variables.tf#L173) | Cloud KMS to use to encrypt different services. Key location should match service region. | <code title="object&#40;&#123;&#10; bq &#61; optional&#40;string&#41;&#10; composer &#61; optional&#40;string&#41;&#10; compute &#61; optional&#40;string&#41;&#10; storage &#61; optional&#40;string&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
## Outputs

View File

@ -54,5 +54,5 @@ source ./env.sh
gsutil -i $LND_SA cp demo/data/*.csv gs://$LND_GCS
gsutil -i $CMP_SA cp demo/data/*.j* gs://$PRC_GCS
gsutil -i $CMP_SA cp demo/pyspark_* gs://$PRC_GCS
gsutil -i $CMP_SA cp demo/dag_*.py $CMP_GCS
gsutil -i $CMP_SA cp demo/dag_*.py gs://$CMP_GCS/dags
```

View File

@ -92,7 +92,7 @@ with models.DAG(
schema_update_options=['ALLOW_FIELD_RELAXATION', 'ALLOW_FIELD_ADDITION'],
schema_object="customers.json",
schema_object_bucket=PROCESSING_GCS[5:],
project_id=PROCESSING_PRJ, # The process will continue to run on the dataset project until the Apache Airflow bug is fixed. https://github.com/apache/airflow/issues/32106
project_id=PROCESSING_PRJ,
impersonation_chain=[PROCESSING_SA]
)

View File

@ -34,7 +34,6 @@ CURATED_GCS = Variable.get("CURATED_GCS")
CURATED_PRJ = Variable.get("CURATED_PRJ")
DP_KMS_KEY = Variable.get("DP_KMS_KEY", "")
DP_REGION = Variable.get("DP_REGION")
GCP_REGION = Variable.get("GCP_REGION")
LAND_PRJ = Variable.get("LAND_PRJ")
LAND_BQ_DATASET = Variable.get("LAND_BQ_DATASET")
LAND_GCS = Variable.get("LAND_GCS")

View File

@ -33,7 +33,6 @@ CURATED_GCS = Variable.get("CURATED_GCS")
CURATED_PRJ = Variable.get("CURATED_PRJ")
DP_KMS_KEY = Variable.get("DP_KMS_KEY", "")
DP_REGION = Variable.get("DP_REGION")
GCP_REGION = Variable.get("GCP_REGION")
LAND_PRJ = Variable.get("LAND_PRJ")
LAND_BQ_DATASET = Variable.get("LAND_BQ_DATASET")
LAND_GCS = Variable.get("LAND_GCS")

View File

@ -19,10 +19,11 @@ variable "composer_config" {
type = object({
environment_size = optional(string, "ENVIRONMENT_SIZE_SMALL")
software_config = optional(object({
airflow_config_overrides = optional(map(string), {})
pypi_packages = optional(map(string), {})
env_variables = optional(map(string), {})
image_version = optional(string, "composer-2-airflow-2")
airflow_config_overrides = optional(map(string), {})
pypi_packages = optional(map(string), {})
env_variables = optional(map(string), {})
image_version = optional(string, "composer-2-airflow-2")
cloud_data_lineage_integration = optional(bool, true)
}), {})
web_server_access_control = optional(map(string), {})
workloads_config = optional(object({