moved sql to notebook

This commit is contained in:
Giorgio Conte 2023-03-06 12:28:58 +00:00
parent ee55127ede
commit 3123d67ddf
4 changed files with 113 additions and 125 deletions

View File

@ -93,6 +93,100 @@
"source": [
"# Vertex AI Pipeline Definition\n",
"\n",
"Let's first define the queries for the features and target creation and the query to train the model\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# this query creates the features for our model and the target value we would like to predict\n",
"\n",
"features_query = \"\"\"\n",
"CREATE VIEW if NOT EXISTS `{project_id}.{dataset}.ecommerce_abt` AS\n",
"WITH abt AS (\n",
" SELECT user_id,\n",
" session_id,\n",
" city,\n",
" postal_code,\n",
" browser,\n",
" traffic_source,\n",
" min(created_at) AS session_starting_ts,\n",
" sum(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) has_purchased\n",
" FROM `bigquery-public-data.thelook_ecommerce.events` \n",
" GROUP BY user_id,\n",
" session_id,\n",
" city,\n",
" postal_code,\n",
" browser,\n",
" traffic_source\n",
"), previous_orders AS (\n",
" SELECT user_id,\n",
" array_agg (struct(created_at AS order_creations_ts,\n",
" o.order_id,\n",
" o.status,\n",
" oi.order_cost)) as user_orders\n",
" FROM `bigquery-public-data.thelook_ecommerce.orders` o\n",
" JOIN (SELECT order_id,\n",
" sum(sale_price) order_cost \n",
" FROM `bigquery-public-data.thelook_ecommerce.order_items`\n",
" GROUP BY 1) oi\n",
" ON o.order_id = oi.order_id\n",
" GROUP BY 1\n",
")\n",
"SELECT abt.*,\n",
" CASE WHEN extract(DAYOFWEEK FROM session_starting_ts) IN (1,7)\n",
" THEN 'WEEKEND' \n",
" ELSE 'WEEKDAY'\n",
" END AS day_of_week,\n",
" extract(HOUR FROM session_starting_ts) hour_of_day,\n",
" (SELECT count(DISTINCT uo.order_id) \n",
" FROM unnest(user_orders) uo \n",
" WHERE uo.order_creations_ts < session_starting_ts \n",
" AND status IN ('Shipped', 'Complete', 'Processing')) AS number_of_successful_orders,\n",
" IFNULL((SELECT sum(DISTINCT uo.order_cost) \n",
" FROM unnest(user_orders) uo \n",
" WHERE uo.order_creations_ts < session_starting_ts \n",
" AND status IN ('Shipped', 'Complete', 'Processing')), 0) AS sum_previous_orders,\n",
" (SELECT count(DISTINCT uo.order_id) \n",
" FROM unnest(user_orders) uo \n",
" WHERE uo.order_creations_ts < session_starting_ts \n",
" AND status IN ('Cancelled', 'Returned')) AS number_of_unsuccessful_orders\n",
"FROM abt \n",
"LEFT JOIN previous_orders pso \n",
"ON abt.user_id = pso.user_id\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# this query create the train job on BQ ML\n",
"train_query = \"\"\"\n",
"CREATE OR REPLACE MODEL `{project_id}.{dataset}.{model_name}`\n",
"OPTIONS(MODEL_TYPE='{model_type}',\n",
" INPUT_LABEL_COLS=['has_purchased'],\n",
" ENABLE_GLOBAL_EXPLAIN=TRUE,\n",
" MODEL_REGISTRY='VERTEX_AI',\n",
" DATA_SPLIT_METHOD = 'RANDOM',\n",
" DATA_SPLIT_EVAL_FRACTION = {split_fraction}\n",
" ) AS \n",
"SELECT * EXCEPT (session_id, session_starting_ts, user_id) \n",
"FROM `{project_id}.{dataset}.ecommerce_abt`\n",
"WHERE extract(ISOYEAR FROM session_starting_ts) = 2022\n",
"\"\"\""
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"In the following code block, we are defining our Vertex AI pipeline. It is made up of three main steps:\n",
"1. Create a BigQuery dataset that will contain the BigQuery ML models\n",
"2. Train the BigQuery ML model, in this case, a logistic regression\n",
@ -113,13 +207,6 @@
"metadata": {},
"outputs": [],
"source": [
"with open(\"sql/train.sql\") as file:\n",
" train_query = file.read()\n",
"\n",
"with open(\"sql/features.sql\") as file:\n",
" features_query = file.read()\n",
"\n",
"\n",
"@kfp.dsl.pipeline(name='bqml-pipeline', pipeline_root=PIPELINE_ROOT)\n",
"def pipeline(\n",
" model_name: str,\n",
@ -294,6 +381,25 @@
"my_prediction"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# batch prediction on BigQuery\n",
"\n",
"explain_predict_query = \"\"\"\n",
"SELECT *\n",
"FROM ML.EXPLAIN_PREDICT(MODEL `{project_id}.{dataset}.{model_name}`,\n",
" (SELECT * EXCEPT (session_id, session_starting_ts, user_id, has_purchased) \n",
" FROM `{project_id}.{dataset}.ecommerce_abt`\n",
" WHERE extract(ISOYEAR FROM session_starting_ts) = 2023),\n",
" STRUCT(5 AS top_k_features, 0.5 AS threshold))\n",
"LIMIT 100\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,

View File

@ -1,23 +0,0 @@
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
SELECT *
FROM ML.EXPLAIN_PREDICT(MODEL `{project_id}.{dataset}.{model_name}`,
(SELECT * EXCEPT (session_id, session_starting_ts, user_id, has_purchased)
FROM `{project_id}.{dataset}.ecommerce_abt`
WHERE extract(ISOYEAR FROM session_starting_ts) = 2023),
STRUCT(5 AS top_k_features, 0.5 AS threshold))
LIMIT 100

View File

@ -1,68 +0,0 @@
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CREATE VIEW if NOT EXISTS `{project_id}.{dataset}.ecommerce_abt` AS
WITH abt AS (
SELECT user_id,
session_id,
city,
postal_code,
browser,
traffic_source,
min(created_at) AS session_starting_ts,
sum(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) has_purchased
FROM `bigquery-public-data.thelook_ecommerce.events`
GROUP BY user_id,
session_id,
city,
postal_code,
browser,
traffic_source
), previous_orders AS (
SELECT user_id,
array_agg (struct(created_at AS order_creations_ts,
o.order_id,
o.status,
oi.order_cost)) as user_orders
FROM `bigquery-public-data.thelook_ecommerce.orders` o
JOIN (SELECT order_id,
sum(sale_price) order_cost
FROM `bigquery-public-data.thelook_ecommerce.order_items`
GROUP BY 1) oi
ON o.order_id = oi.order_id
GROUP BY 1
)
SELECT abt.*,
CASE WHEN extract(DAYOFWEEK FROM session_starting_ts) IN (1,7)
THEN 'WEEKEND'
ELSE 'WEEKDAY'
END AS day_of_week,
extract(HOUR FROM session_starting_ts) hour_of_day,
(SELECT count(DISTINCT uo.order_id)
FROM unnest(user_orders) uo
WHERE uo.order_creations_ts < session_starting_ts
AND status IN ('Shipped', 'Complete', 'Processing')) AS number_of_successful_orders,
IFNULL((SELECT sum(DISTINCT uo.order_cost)
FROM unnest(user_orders) uo
WHERE uo.order_creations_ts < session_starting_ts
AND status IN ('Shipped', 'Complete', 'Processing')), 0) AS sum_previous_orders,
(SELECT count(DISTINCT uo.order_id)
FROM unnest(user_orders) uo
WHERE uo.order_creations_ts < session_starting_ts
AND status IN ('Cancelled', 'Returned')) AS number_of_unsuccessful_orders
FROM abt
LEFT JOIN previous_orders pso
ON abt.user_id = pso.user_id

View File

@ -1,27 +0,0 @@
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CREATE OR REPLACE MODEL `{project_id}.{dataset}.{model_name}`
OPTIONS(MODEL_TYPE='{model_type}',
INPUT_LABEL_COLS=['has_purchased'],
ENABLE_GLOBAL_EXPLAIN=TRUE,
MODEL_REGISTRY='VERTEX_AI',
DATA_SPLIT_METHOD = 'RANDOM',
DATA_SPLIT_EVAL_FRACTION = {split_fraction}
) AS
SELECT * EXCEPT (session_id, session_starting_ts, user_id)
FROM `{project_id}.{dataset}.ecommerce_abt_table`
WHERE extract(ISOYEAR FROM session_starting_ts) = 2022