From 3123d67ddfb7f0dfe02d6363ebd847cebbeceb2d Mon Sep 17 00:00:00 2001 From: Giorgio Conte Date: Mon, 6 Mar 2023 12:28:58 +0000 Subject: [PATCH] moved sql to notebook --- .../bq-ml/demo/bmql_pipeline.ipynb | 120 +++++++++++++++++- .../bq-ml/demo/sql/explain_predict.sql | 23 ---- .../bq-ml/demo/sql/features.sql | 68 ---------- .../data-solutions/bq-ml/demo/sql/train.sql | 27 ---- 4 files changed, 113 insertions(+), 125 deletions(-) delete mode 100644 blueprints/data-solutions/bq-ml/demo/sql/explain_predict.sql delete mode 100644 blueprints/data-solutions/bq-ml/demo/sql/features.sql delete mode 100644 blueprints/data-solutions/bq-ml/demo/sql/train.sql diff --git a/blueprints/data-solutions/bq-ml/demo/bmql_pipeline.ipynb b/blueprints/data-solutions/bq-ml/demo/bmql_pipeline.ipynb index ff063957..4d3f5b53 100644 --- a/blueprints/data-solutions/bq-ml/demo/bmql_pipeline.ipynb +++ b/blueprints/data-solutions/bq-ml/demo/bmql_pipeline.ipynb @@ -93,6 +93,100 @@ "source": [ "# Vertex AI Pipeline Definition\n", "\n", + "Let's first define the queries for the features and target creation and the query to train the model\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this query creates the features for our model and the target value we would like to predict\n", + "\n", + "features_query = \"\"\"\n", + "CREATE VIEW if NOT EXISTS `{project_id}.{dataset}.ecommerce_abt` AS\n", + "WITH abt AS (\n", + " SELECT user_id,\n", + " session_id,\n", + " city,\n", + " postal_code,\n", + " browser,\n", + " traffic_source,\n", + " min(created_at) AS session_starting_ts,\n", + " sum(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) has_purchased\n", + " FROM `bigquery-public-data.thelook_ecommerce.events` \n", + " GROUP BY user_id,\n", + " session_id,\n", + " city,\n", + " postal_code,\n", + " browser,\n", + " traffic_source\n", + "), previous_orders AS (\n", + " SELECT user_id,\n", + " array_agg (struct(created_at AS order_creations_ts,\n", + " o.order_id,\n", + " o.status,\n", + " oi.order_cost)) as user_orders\n", + " FROM `bigquery-public-data.thelook_ecommerce.orders` o\n", + " JOIN (SELECT order_id,\n", + " sum(sale_price) order_cost \n", + " FROM `bigquery-public-data.thelook_ecommerce.order_items`\n", + " GROUP BY 1) oi\n", + " ON o.order_id = oi.order_id\n", + " GROUP BY 1\n", + ")\n", + "SELECT abt.*,\n", + " CASE WHEN extract(DAYOFWEEK FROM session_starting_ts) IN (1,7)\n", + " THEN 'WEEKEND' \n", + " ELSE 'WEEKDAY'\n", + " END AS day_of_week,\n", + " extract(HOUR FROM session_starting_ts) hour_of_day,\n", + " (SELECT count(DISTINCT uo.order_id) \n", + " FROM unnest(user_orders) uo \n", + " WHERE uo.order_creations_ts < session_starting_ts \n", + " AND status IN ('Shipped', 'Complete', 'Processing')) AS number_of_successful_orders,\n", + " IFNULL((SELECT sum(DISTINCT uo.order_cost) \n", + " FROM unnest(user_orders) uo \n", + " WHERE uo.order_creations_ts < session_starting_ts \n", + " AND status IN ('Shipped', 'Complete', 'Processing')), 0) AS sum_previous_orders,\n", + " (SELECT count(DISTINCT uo.order_id) \n", + " FROM unnest(user_orders) uo \n", + " WHERE uo.order_creations_ts < session_starting_ts \n", + " AND status IN ('Cancelled', 'Returned')) AS number_of_unsuccessful_orders\n", + "FROM abt \n", + "LEFT JOIN previous_orders pso \n", + "ON abt.user_id = pso.user_id\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this query create the train job on BQ ML\n", + "train_query = \"\"\"\n", + "CREATE OR REPLACE MODEL `{project_id}.{dataset}.{model_name}`\n", + "OPTIONS(MODEL_TYPE='{model_type}',\n", + " INPUT_LABEL_COLS=['has_purchased'],\n", + " ENABLE_GLOBAL_EXPLAIN=TRUE,\n", + " MODEL_REGISTRY='VERTEX_AI',\n", + " DATA_SPLIT_METHOD = 'RANDOM',\n", + " DATA_SPLIT_EVAL_FRACTION = {split_fraction}\n", + " ) AS \n", + "SELECT * EXCEPT (session_id, session_starting_ts, user_id) \n", + "FROM `{project_id}.{dataset}.ecommerce_abt`\n", + "WHERE extract(ISOYEAR FROM session_starting_ts) = 2022\n", + "\"\"\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ "In the following code block, we are defining our Vertex AI pipeline. It is made up of three main steps:\n", "1. Create a BigQuery dataset that will contain the BigQuery ML models\n", "2. Train the BigQuery ML model, in this case, a logistic regression\n", @@ -113,13 +207,6 @@ "metadata": {}, "outputs": [], "source": [ - "with open(\"sql/train.sql\") as file:\n", - " train_query = file.read()\n", - "\n", - "with open(\"sql/features.sql\") as file:\n", - " features_query = file.read()\n", - "\n", - "\n", "@kfp.dsl.pipeline(name='bqml-pipeline', pipeline_root=PIPELINE_ROOT)\n", "def pipeline(\n", " model_name: str,\n", @@ -294,6 +381,25 @@ "my_prediction" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# batch prediction on BigQuery\n", + "\n", + "explain_predict_query = \"\"\"\n", + "SELECT *\n", + "FROM ML.EXPLAIN_PREDICT(MODEL `{project_id}.{dataset}.{model_name}`,\n", + " (SELECT * EXCEPT (session_id, session_starting_ts, user_id, has_purchased) \n", + " FROM `{project_id}.{dataset}.ecommerce_abt`\n", + " WHERE extract(ISOYEAR FROM session_starting_ts) = 2023),\n", + " STRUCT(5 AS top_k_features, 0.5 AS threshold))\n", + "LIMIT 100\n", + "\"\"\"" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/blueprints/data-solutions/bq-ml/demo/sql/explain_predict.sql b/blueprints/data-solutions/bq-ml/demo/sql/explain_predict.sql deleted file mode 100644 index e09fbd94..00000000 --- a/blueprints/data-solutions/bq-ml/demo/sql/explain_predict.sql +++ /dev/null @@ -1,23 +0,0 @@ -/* -* Copyright 2023 Google LLC -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* https://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -SELECT * -FROM ML.EXPLAIN_PREDICT(MODEL `{project_id}.{dataset}.{model_name}`, - (SELECT * EXCEPT (session_id, session_starting_ts, user_id, has_purchased) - FROM `{project_id}.{dataset}.ecommerce_abt` - WHERE extract(ISOYEAR FROM session_starting_ts) = 2023), - STRUCT(5 AS top_k_features, 0.5 AS threshold)) -LIMIT 100 diff --git a/blueprints/data-solutions/bq-ml/demo/sql/features.sql b/blueprints/data-solutions/bq-ml/demo/sql/features.sql deleted file mode 100644 index a28ba85b..00000000 --- a/blueprints/data-solutions/bq-ml/demo/sql/features.sql +++ /dev/null @@ -1,68 +0,0 @@ -/* -* Copyright 2023 Google LLC -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* https://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -CREATE VIEW if NOT EXISTS `{project_id}.{dataset}.ecommerce_abt` AS -WITH abt AS ( - SELECT user_id, - session_id, - city, - postal_code, - browser, - traffic_source, - min(created_at) AS session_starting_ts, - sum(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) has_purchased - FROM `bigquery-public-data.thelook_ecommerce.events` - GROUP BY user_id, - session_id, - city, - postal_code, - browser, - traffic_source -), previous_orders AS ( - SELECT user_id, - array_agg (struct(created_at AS order_creations_ts, - o.order_id, - o.status, - oi.order_cost)) as user_orders - FROM `bigquery-public-data.thelook_ecommerce.orders` o - JOIN (SELECT order_id, - sum(sale_price) order_cost - FROM `bigquery-public-data.thelook_ecommerce.order_items` - GROUP BY 1) oi - ON o.order_id = oi.order_id - GROUP BY 1 -) -SELECT abt.*, - CASE WHEN extract(DAYOFWEEK FROM session_starting_ts) IN (1,7) - THEN 'WEEKEND' - ELSE 'WEEKDAY' - END AS day_of_week, - extract(HOUR FROM session_starting_ts) hour_of_day, - (SELECT count(DISTINCT uo.order_id) - FROM unnest(user_orders) uo - WHERE uo.order_creations_ts < session_starting_ts - AND status IN ('Shipped', 'Complete', 'Processing')) AS number_of_successful_orders, - IFNULL((SELECT sum(DISTINCT uo.order_cost) - FROM unnest(user_orders) uo - WHERE uo.order_creations_ts < session_starting_ts - AND status IN ('Shipped', 'Complete', 'Processing')), 0) AS sum_previous_orders, - (SELECT count(DISTINCT uo.order_id) - FROM unnest(user_orders) uo - WHERE uo.order_creations_ts < session_starting_ts - AND status IN ('Cancelled', 'Returned')) AS number_of_unsuccessful_orders -FROM abt -LEFT JOIN previous_orders pso -ON abt.user_id = pso.user_id diff --git a/blueprints/data-solutions/bq-ml/demo/sql/train.sql b/blueprints/data-solutions/bq-ml/demo/sql/train.sql deleted file mode 100644 index 2c30f2e6..00000000 --- a/blueprints/data-solutions/bq-ml/demo/sql/train.sql +++ /dev/null @@ -1,27 +0,0 @@ -/* -* Copyright 2023 Google LLC -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* https://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -CREATE OR REPLACE MODEL `{project_id}.{dataset}.{model_name}` -OPTIONS(MODEL_TYPE='{model_type}', - INPUT_LABEL_COLS=['has_purchased'], - ENABLE_GLOBAL_EXPLAIN=TRUE, - MODEL_REGISTRY='VERTEX_AI', - DATA_SPLIT_METHOD = 'RANDOM', - DATA_SPLIT_EVAL_FRACTION = {split_fraction} - ) AS -SELECT * EXCEPT (session_id, session_starting_ts, user_id) -FROM `{project_id}.{dataset}.ecommerce_abt_table` -WHERE extract(ISOYEAR FROM session_starting_ts) = 2022 \ No newline at end of file