Merge branch 'master' into gcs2bq-least-privileges

Ludovico Magnocavallo 2021-12-24 11:36:04 +01:00 committed by GitHub
commit eaae34b623
12 changed files with 285 additions and 28 deletions


@ -36,7 +36,21 @@ Once done testing, you can clean up resources by running `terraform destroy`. To
Once resources are created, you can run queries on the data you exported to BigQuery. [Here](https://cloud.google.com/asset-inventory/docs/exporting-to-bigquery#querying_an_asset_snapshot) you can find some examples of queries you can run.
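As a minimal sketch of querying a snapshot programmatically (the project, dataset and table names below are placeholders, and the actual table names depend on your `cai_config` and the export options), you can use the BigQuery client library:

```python
# Sketch only: project, dataset and table names are assumptions; adjust them to
# your cai_config values and the per-asset-type table naming of the export.
from google.cloud import bigquery

client = bigquery.Client(project='my-project')
query = '''
  SELECT name, asset_type
  FROM `my-project.my-dataset.my_table_20211224_compute_googleapis_com_Instance`
  LIMIT 10
'''
for row in client.query(query):  # the query job is iterable once finished
    print(row.name, row.asset_type)
```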
You can also create a dashboard connecting [Datalab](https://datastudio.google.com/) or any other BI tools of your choice to your Bigquery datase.
You can also create a dashboard by connecting [Datalab](https://datastudio.google.com/) or any other BI tool of your choice to your BigQuery dataset.
## File exporter for JSON, CSV (optional).
This is an optional part.
Regular file-based exports of data from Cloud Asset Inventory may be useful for e.g. scale-out network dependency discovery tools like [Planet Exporter](https://github.com/williamchanrico/planet-exporter), or for updating legacy workload tracking or configuration management systems. BigQuery supports multiple [export formats](https://cloud.google.com/bigquery/docs/exporting-data#export_formats_and_compression_types), and the provided Cloud Function can upload the resulting objects to a Storage bucket. Specify `job.DestinationFormat` as defined in the [documentation](https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.job.DestinationFormat.html), e.g. `NEWLINE_DELIMITED_JSON`.
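For an ad-hoc export outside the Cloud Scheduler job, you can publish a message with the same payload the scheduler sends to the function's Pub/Sub topic. A minimal sketch, assuming the default `cffile-exporter` topic name and placeholder values for the `file_config` fields:

```python
# Sketch only: project and topic names are assumptions; the payload mirrors the
# file_config fields wired into the Cloud Scheduler job in main.tf.
import json
from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
topic = publisher.topic_path('my-project', 'cffile-exporter')
payload = {
    'bucket': 'my-bucket',
    'filename': 'my-folder/myfile.json',
    'format': 'NEWLINE_DELIMITED_JSON',
    'bq_dataset': 'my-dataset',
    'bq_table': 'my_table',
}
publisher.publish(topic, json.dumps(payload).encode('utf-8')).result()
```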
It helps to create a custom [scheduled query](https://cloud.google.com/bigquery/docs/scheduling-queries#console) from the CAI export tables and to write the results into a dedicated table (with overwrites). Define the query's output columns to comply with the downstream systems' field requirements, and schedule its execution after the CAI export into BigQuery so the data stays fresh. See [sample queries](https://cloud.google.com/asset-inventory/docs/exporting-to-bigquery-sample-queries).
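While prototyping such a query, the same overwrite behaviour can be reproduced with the BigQuery client library. A minimal sketch, with assumed dataset, table and column names, that writes query results into a dedicated table using `WRITE_TRUNCATE`:

```python
# Sketch only: dataset, table and column names are assumptions; a scheduled
# query configured in the console achieves the same result on a schedule.
from google.cloud import bigquery

client = bigquery.Client()
job_config = bigquery.QueryJobConfig(
    destination='my-project.my-dataset.my_export_table',
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
query = '''
  SELECT name, asset_type, update_time
  FROM `my-project.my-dataset.my_table_latest_compute_googleapis_com_Instance`
'''
client.query(query, job_config=job_config).result()
```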
This part is created only if `cai_gcs_export` is set to `true`. The high-level diagram extends to the following:
<img src="diagram_optional.png" width="640px">
<!-- BEGIN TFDOC -->
@ -44,12 +58,16 @@ You can also create a dashboard connecting [Datalab](https://datastudio.google.c
| name | description | type | required | default |
|---|---|:---:|:---:|:---:|
| cai_config | Cloud Asset inventory export config. | <code title="object&#40;&#123;&#10; bq_dataset &#61; string&#10; bq_table &#61; string&#10; target_node &#61; string&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | ✓ | |
| cai_config | Cloud Asset Inventory export config. | <code title="object&#40;&#123;&#10; bq_dataset &#61; string&#10; bq_table &#61; string&#10; bq_table_overwrite &#61; bool&#10; target_node &#61; string&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | ✓ | |
| project_id | Project id that references existing project. | <code>string</code> | ✓ | |
| billing_account | Billing account id used as default for new projects. | <code>string</code> | | <code>null</code> |
| bundle_path | Path used to write the intermediate Cloud Function code bundle. | <code>string</code> | | <code>&#34;.&#47;bundle.zip&#34;</code> |
| bundle_path_cffile | Path used to write the intermediate Cloud Function code bundle. | <code>string</code> | | <code>&#34;.&#47;bundle_cffile.zip&#34;</code> |
| cai_gcs_export | Enable optional part to export tables to GCS. | <code>bool</code> | | <code>false</code> |
| file_config | Optional BQ table as a file export function config. | <code title="object&#40;&#123;&#10; bucket &#61; string&#10; filename &#61; string&#10; format &#61; string&#10; bq_dataset &#61; string&#10; bq_table &#61; string&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code title="&#123;&#10; bucket &#61; null&#10; filename &#61; null&#10; format &#61; null&#10; bq_dataset &#61; null&#10; bq_table &#61; null&#10;&#125;">&#123;&#8230;&#125;</code> |
| location | App Engine location used in the example. | <code>string</code> | | <code>&#34;europe-west&#34;</code> |
| name | Arbitrary string used to name created resources. | <code>string</code> | | <code>&#34;asset-inventory&#34;</code> |
| name_cffile | Arbitrary string used to name created resources. | <code>string</code> | | <code>&#34;cffile-exporter&#34;</code> |
| project_create | Create project instead of using an existing one. | <code>bool</code> | | <code>true</code> |
| region | Compute region used in the example. | <code>string</code> | | <code>&#34;europe-west1&#34;</code> |
| root_node | The resource name of the parent folder or organization for project creation, in 'folders/folder_id' or 'organizations/org_id' format. | <code>string</code> | | <code>null</code> |
@ -63,3 +81,4 @@ You can also create a dashboard connecting [Datalab](https://datastudio.google.c
<!-- END TFDOC -->


@ -50,18 +50,19 @@ def _configure_logging(verbose=True):
@click.option('--bq-project', required=True, help='Bigquery project to use.')
@click.option('--bq-dataset', required=True, help='Bigquery dataset to use.')
@click.option('--bq-table', required=True, help='Bigquery table name to use.')
@click.option('--bq-table-overwrite', required=True, help='Overwrite the existing BQ table or create a new date-suffixed one.')
@click.option('--target-node', required=True, help='Node in Google Cloud resource hierarchy.')
@click.option('--read-time', required=False, help=(
    'Day to take an asset snapshot in \'YYYYMMDD\' format, uses current day '
    ' as default. Export will run at midnight of the specified day.'))
@click.option('--verbose', is_flag=True, help='Verbose output')
def main_cli(project=None, bq_project=None, bq_dataset=None, bq_table=None, target_node=None,
def main_cli(project=None, bq_project=None, bq_dataset=None, bq_table=None, bq_table_overwrite=None, target_node=None,
             read_time=None, verbose=False):
  '''Trigger Cloud Asset inventory export to Bigquery. Data will be stored in
  the dataset specified on a dated table with the name specified.
  '''
  try:
    _main(project, bq_project, bq_dataset, bq_table, target_node, read_time, verbose)
    _main(project, bq_project, bq_dataset, bq_table, bq_table_overwrite, target_node, read_time, verbose)
  except RuntimeError:
    logging.exception('exception raised')
@ -79,19 +80,22 @@ def main(event, context):
    logging.exception('exception in cloud function entry point')
def _main(project=None, bq_project=None, bq_dataset=None, bq_table=None, target_node=None, read_time=None, verbose=False):
def _main(project=None, bq_project=None, bq_dataset=None, bq_table=None, bq_table_overwrite=None, target_node=None, read_time=None, verbose=False):
  'Module entry point used by cli and cloud function wrappers.'
  _configure_logging(verbose)
  if not read_time:
    read_time = datetime.datetime.now()
  client = asset_v1.AssetServiceClient()
  content_type = asset_v1.ContentType.RESOURCE
  output_config = asset_v1.OutputConfig()
  client = asset_v1.AssetServiceClient()
  if bq_table_overwrite == False:
    read_time = datetime.datetime.now()
    output_config.bigquery_destination.table = '%s_%s' % (
        bq_table, read_time.strftime('%Y%m%d'))
  else:
    output_config.bigquery_destination.table = '%s_latest' % (
        bq_table)
  content_type = asset_v1.ContentType.RESOURCE
  output_config.bigquery_destination.dataset = 'projects/%s/datasets/%s' % (
      bq_project, bq_dataset)
  output_config.bigquery_destination.table = '%s_%s' % (
      bq_table, read_time.strftime('%Y%m%d'))
  output_config.bigquery_destination.separate_tables_per_asset_type = True
  output_config.bigquery_destination.force = True
  try:


@ -0,0 +1,99 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''Cloud Function module to export BQ table as JSON.
This module is designed to be plugged in a Cloud Function, attached to Cloud
Scheduler trigger to create a JSON of IP to hostname mappings from BigQuery.
'''
import base64
import datetime
import json
import logging
import os
import warnings
from google.api_core.exceptions import GoogleAPIError
from google.cloud import bigquery
import click
import googleapiclient.discovery
import googleapiclient.errors
def _configure_logging(verbose=True):
  '''Basic logging configuration.
  Args:
    verbose: enable verbose logging
  '''
  level = logging.DEBUG if verbose else logging.INFO
  logging.basicConfig(level=level)
  warnings.filterwarnings('ignore', r'.*end user credentials.*', UserWarning)
@click.command()
@click.option('--bucket', required=True, help='GCS bucket for export')
@click.option('--filename', required=True, help='Path and filename with extension to export e.g. folder/export.json .')
@click.option('--format', required=True, help='The exported file format, e.g. NEWLINE_DELIMITED_JSON or CSV.')
@click.option('--bq-dataset', required=True, help='Bigquery dataset where table for export is located.')
@click.option('--bq-table', required=True, help='Bigquery table to export.')
@click.option('--verbose', is_flag=True, help='Verbose output')
def main_cli(bucket=None, filename=None, format=None, bq_dataset=None, bq_table=None, verbose=False):
  '''Trigger export of a Bigquery table to a file on GCS. Data will be read from
  the specified dataset and table and written to the specified bucket and
  filename in the specified format.
  '''
  try:
    _main(bucket, filename, format, bq_dataset, bq_table, verbose)
  except RuntimeError:
    logging.exception('exception raised')
def main(event, context):
  'Cloud Function entry point.'
  try:
    data = json.loads(base64.b64decode(event['data']).decode('utf-8'))
    print(data)
    _main(**data)
  # uncomment once https://issuetracker.google.com/issues/155215191 is fixed
  # except RuntimeError:
  #   raise
  except Exception:
    logging.exception('exception in cloud function entry point')
def _main(bucket=None, filename=None, format=None, bq_dataset=None, bq_table=None, verbose=False):
  'Module entry point used by cli and cloud function wrappers.'
  _configure_logging(verbose)
  client = bigquery.Client()
  destination_uri = 'gs://{}/{}'.format(bucket, filename)
  dataset_ref = client.dataset(bq_dataset)
  table_ref = dataset_ref.table(bq_table)
  job_config = bigquery.job.ExtractJobConfig()
  job_config.destination_format = (
      getattr(bigquery.DestinationFormat, format))
  extract_job = client.extract_table(
      table_ref, destination_uri, job_config=job_config
  )
  try:
    extract_job.result()
  except (GoogleAPIError, googleapiclient.errors.HttpError) as e:
    logging.debug('API Error: %s', e, exc_info=True)
    raise RuntimeError(
        'Error exporting BQ table %s as a file' % bq_table, e)
if __name__ == '__main__':
  main_cli()


@ -0,0 +1,3 @@
google-api-python-client>=1.10.1
google-cloud-monitoring>=1.1.0
google-cloud-bigquery

Binary file not shown.



@ -14,6 +14,8 @@
* limitations under the License.
*/
###############################################################################
# Projects #
###############################################################################
@ -47,6 +49,7 @@ module "service-account" {
  iam_project_roles = {
    (var.project_id) = [
      "roles/cloudasset.owner",
      "roles/bigquery.jobUser"
    ]
  }
}
@ -66,6 +69,17 @@ module "pubsub" {
  # at the project level via roles/cloudscheduler.serviceAgent
}
module "pubsub_file" {
  source     = "../../modules/pubsub"
  project_id = module.project.project_id
  name       = var.name_cffile
  subscriptions = {
    "${var.name_cffile}-default" = null
  }
  # the Cloud Scheduler robot service account already has pubsub.topics.publish
  # at the project level via roles/cloudscheduler.serviceAgent
}
###############################################################################
# Cloud Function #
###############################################################################
@ -93,6 +107,30 @@ module "cf" {
  }
}
module "cffile" {
  count       = var.cai_gcs_export ? 1 : 0
  source      = "../../modules/cloud-function"
  project_id  = module.project.project_id
  region      = var.region
  name        = var.name_cffile
  bucket_name = "${var.name_cffile}-${random_pet.random.id}"
  bucket_config = {
    location             = var.region
    lifecycle_delete_age = null
  }
  bundle_config = {
    source_dir  = "cffile"
    output_path = var.bundle_path_cffile
    excludes    = null
  }
  service_account = module.service-account.email
  trigger_config = {
    event    = "google.pubsub.topic.publish"
    resource = module.pubsub_file.topic.id
    retry    = null
  }
}
resource "random_pet" "random" {
  length = 1
}
@ -118,11 +156,34 @@ resource "google_cloud_scheduler_job" "job" {
    attributes = {}
    topic_name = module.pubsub.topic.id
    data = base64encode(jsonencode({
      project     = module.project.project_id
      bq_project  = module.project.project_id
      bq_dataset  = var.cai_config.bq_dataset
      bq_table    = var.cai_config.bq_table
      target_node = var.cai_config.target_node
      project            = module.project.project_id
      bq_project         = module.project.project_id
      bq_dataset         = var.cai_config.bq_dataset
      bq_table           = var.cai_config.bq_table
      bq_table_overwrite = var.cai_config.bq_table_overwrite
      target_node        = var.cai_config.target_node
    }))
  }
}
resource "google_cloud_scheduler_job" "job_file" {
  count       = var.cai_gcs_export ? 1 : 0
  project     = google_app_engine_application.app.project
  region      = var.region
  name        = "file-export-job"
  description = "File export from BQ Job"
  schedule    = "* 9 * * 1"
  time_zone   = "Etc/UTC"
  pubsub_target {
    attributes = {}
    topic_name = module.pubsub_file.topic.id
    data = base64encode(jsonencode({
      bucket     = var.file_config.bucket
      filename   = var.file_config.filename
      format     = var.file_config.format
      bq_dataset = var.file_config.bq_dataset
      bq_table   = var.file_config.bq_table
    }))
  }
}


@ -26,15 +26,50 @@ variable "bundle_path" {
  default = "./bundle.zip"
}
variable "bundle_path_cffile" {
  description = "Path used to write the intermediate Cloud Function code bundle."
  type        = string
  default     = "./bundle_cffile.zip"
}
variable "cai_config" {
  description = "Cloud Asset inventory export config."
  description = "Cloud Asset Inventory export config."
  type = object({
    bq_dataset  = string
    bq_table    = string
    target_node = string
    bq_dataset         = string
    bq_table           = string
    bq_table_overwrite = bool
    target_node        = string
  })
}
variable "cai_gcs_export" {
  description = "Enable optional part to export tables to GCS."
  type        = bool
  default     = false
}
variable "file_config" {
  description = "Optional BQ table as a file export function config."
  type = object({
    bucket     = string
    filename   = string
    format     = string
    bq_dataset = string
    bq_table   = string
  })
  default = {
    bucket     = null
    filename   = null
    format     = null
    bq_dataset = null
    bq_table   = null
  }
}
variable "location" {
  description = "App Engine location used in the example."
  type        = string
@ -48,6 +83,15 @@ variable "name" {
  default = "asset-inventory"
}
variable "name_cffile" {
  description = "Arbitrary string used to name created resources."
  type        = string
  default     = "cffile-exporter"
}
variable "project_create" {
  description = "Create project instead of using an existing one."
  type        = bool


@ -18,6 +18,8 @@ module "test" {
  source          = "../../../../cloud-operations/scheduled-asset-inventory-export-bq"
  billing_account = var.billing_account
  cai_config      = var.cai_config
  cai_gcs_export  = var.cai_gcs_export
  file_config     = var.file_config
  project_create  = var.project_create
  project_id      = var.project_id
}


@ -19,17 +19,42 @@ variable "billing_account" {
variable "cai_config" {
  type = object({
    bq_dataset  = string
    bq_table    = string
    target_node = string
    bq_dataset         = string
    bq_table           = string
    bq_table_overwrite = bool
    target_node        = string
  })
  default = {
    bq_dataset  = "my-dataset"
    bq_table    = "my_table"
    target_node = "organization/1234567890"
    bq_dataset         = "my-dataset"
    bq_table           = "my_table"
    bq_table_overwrite = true
    target_node        = "organization/1234567890"
  }
}
variable "cai_gcs_export" {
  type    = bool
  default = true
}
variable "file_config" {
  type = object({
    bucket     = string
    filename   = string
    format     = string
    bq_dataset = string
    bq_table   = string
  })
  default = {
    bucket     = "my-bucket"
    filename   = "my-folder/myfile.json"
    format     = "NEWLINE_DELIMITED_JSON"
    bq_dataset = "my-dataset"
    bq_table   = "my_table"
  }
}
variable "project_create" {
  type    = bool
  default = true


@ -23,5 +23,5 @@ FIXTURES_DIR = os.path.join(os.path.dirname(__file__), 'fixture')
def test_resources(e2e_plan_runner):
  "Test that plan works and the number of resources is as expected."
  modules, resources = e2e_plan_runner(FIXTURES_DIR)
  assert len(modules) == 5
  assert len(resources) == 23
  assert len(modules) == 7
  assert len(resources) == 29