moved sql to notebook
This commit is contained in:
parent
ee55127ede
commit
3123d67ddf
|
@ -93,6 +93,100 @@
|
|||
"source": [
|
||||
"# Vertex AI Pipeline Definition\n",
|
||||
"\n",
|
||||
"Let's first define the query that creates the features and the target, and the query that trains the model.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# this query creates the features for our model and the target value we would like to predict\n",
|
||||
"\n",
|
||||
"features_query = \"\"\"\n",
"-- Builds the analytical base table (ABT) view: event rows grouped by user,\n",
"-- session and session attributes, enriched with order-history features that\n",
"-- only consider orders created before the session start.\n",
"CREATE VIEW IF NOT EXISTS `{project_id}.{dataset}.ecommerce_abt` AS\n",
"WITH abt AS (\n",
"    -- Session attributes, earliest event timestamp and purchase-event count.\n",
"    SELECT user_id,\n",
"           session_id,\n",
"           city,\n",
"           postal_code,\n",
"           browser,\n",
"           traffic_source,\n",
"           min(created_at) AS session_starting_ts,\n",
"           sum(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) AS has_purchased\n",
"    FROM `bigquery-public-data.thelook_ecommerce.events`\n",
"    GROUP BY user_id,\n",
"             session_id,\n",
"             city,\n",
"             postal_code,\n",
"             browser,\n",
"             traffic_source\n",
"), previous_orders AS (\n",
"    -- Per user: array of (creation time, order id, status, order cost), where\n",
"    -- order cost is the sum of sale_price over the items of the order.\n",
"    SELECT o.user_id,\n",
"           array_agg(struct(o.created_at AS order_creations_ts,\n",
"                            o.order_id,\n",
"                            o.status,\n",
"                            oi.order_cost)) AS user_orders\n",
"    FROM `bigquery-public-data.thelook_ecommerce.orders` o\n",
"    INNER JOIN (SELECT order_id,\n",
"                       sum(sale_price) AS order_cost\n",
"                FROM `bigquery-public-data.thelook_ecommerce.order_items`\n",
"                GROUP BY order_id) oi\n",
"            ON o.order_id = oi.order_id\n",
"    GROUP BY o.user_id\n",
")\n",
"SELECT abt.*,\n",
"       -- DAYOFWEEK returns 1 for Sunday and 7 for Saturday.\n",
"       CASE WHEN extract(DAYOFWEEK FROM session_starting_ts) IN (1, 7)\n",
"            THEN 'WEEKEND'\n",
"            ELSE 'WEEKDAY'\n",
"       END AS day_of_week,\n",
"       extract(HOUR FROM session_starting_ts) AS hour_of_day,\n",
"       (SELECT count(DISTINCT uo.order_id)\n",
"        FROM unnest(user_orders) uo\n",
"        WHERE uo.order_creations_ts < session_starting_ts\n",
"          AND uo.status IN ('Shipped', 'Complete', 'Processing')) AS number_of_successful_orders,\n",
"       -- Sum the cost once per distinct order. Applying DISTINCT to the cost\n",
"       -- itself would merge different orders that happen to have the same total.\n",
"       IFNULL((SELECT sum(po.order_cost)\n",
"               FROM (SELECT DISTINCT uo.order_id AS order_id,\n",
"                                     uo.order_cost AS order_cost\n",
"                     FROM unnest(user_orders) uo\n",
"                     WHERE uo.order_creations_ts < session_starting_ts\n",
"                       AND uo.status IN ('Shipped', 'Complete', 'Processing')) po), 0) AS sum_previous_orders,\n",
"       (SELECT count(DISTINCT uo.order_id)\n",
"        FROM unnest(user_orders) uo\n",
"        WHERE uo.order_creations_ts < session_starting_ts\n",
"          AND uo.status IN ('Cancelled', 'Returned')) AS number_of_unsuccessful_orders\n",
"FROM abt\n",
"LEFT JOIN previous_orders pso\n",
"       ON abt.user_id = pso.user_id\n",
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# this query creates the training job on BQ ML\n",
|
||||
"train_query = \"\"\"\n",
|
||||
"CREATE OR REPLACE MODEL `{project_id}.{dataset}.{model_name}`\n",
|
||||
"OPTIONS(MODEL_TYPE='{model_type}',\n",
|
||||
" INPUT_LABEL_COLS=['has_purchased'],\n",
|
||||
" ENABLE_GLOBAL_EXPLAIN=TRUE,\n",
|
||||
" MODEL_REGISTRY='VERTEX_AI',\n",
|
||||
" DATA_SPLIT_METHOD = 'RANDOM',\n",
|
||||
" DATA_SPLIT_EVAL_FRACTION = {split_fraction}\n",
|
||||
" ) AS \n",
|
||||
"SELECT * EXCEPT (session_id, session_starting_ts, user_id) \n",
|
||||
"FROM `{project_id}.{dataset}.ecommerce_abt`\n",
|
||||
"WHERE extract(ISOYEAR FROM session_starting_ts) = 2022\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In the following code block, we are defining our Vertex AI pipeline. It is made up of three main steps:\n",
|
||||
"1. Create a BigQuery dataset that will contain the BigQuery ML models\n",
|
||||
"2. Train the BigQuery ML model, in this case, a logistic regression\n",
|
||||
|
@ -113,13 +207,6 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(\"sql/train.sql\") as file:\n",
|
||||
" train_query = file.read()\n",
|
||||
"\n",
|
||||
"with open(\"sql/features.sql\") as file:\n",
|
||||
" features_query = file.read()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@kfp.dsl.pipeline(name='bqml-pipeline', pipeline_root=PIPELINE_ROOT)\n",
|
||||
"def pipeline(\n",
|
||||
" model_name: str,\n",
|
||||
|
@ -294,6 +381,25 @@
|
|||
"my_prediction"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# batch prediction on BigQuery\n",
|
||||
"\n",
|
||||
"explain_predict_query = \"\"\"\n",
|
||||
"SELECT *\n",
|
||||
"FROM ML.EXPLAIN_PREDICT(MODEL `{project_id}.{dataset}.{model_name}`,\n",
|
||||
" (SELECT * EXCEPT (session_id, session_starting_ts, user_id, has_purchased) \n",
|
||||
" FROM `{project_id}.{dataset}.ecommerce_abt`\n",
|
||||
" WHERE extract(ISOYEAR FROM session_starting_ts) = 2023),\n",
|
||||
" STRUCT(5 AS top_k_features, 0.5 AS threshold))\n",
|
||||
"LIMIT 100\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
|
|
@ -1,23 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 Google LLC
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* https://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
-- Batch prediction with explanations on BigQuery ML.
-- Scores the rows of the ABT whose session start falls in ISO year 2023 and
-- returns the prediction output together with feature attributions.
-- Identifier and label columns are excluded from the model input.
-- The result is capped at 100 rows.
SELECT *
FROM ML.EXPLAIN_PREDICT(
         MODEL `{project_id}.{dataset}.{model_name}`,
         (
             SELECT * EXCEPT (session_id, session_starting_ts, user_id, has_purchased)
             FROM `{project_id}.{dataset}.ecommerce_abt`
             WHERE extract(ISOYEAR FROM session_starting_ts) = 2023
         ),
         -- Explanation settings passed to ML.EXPLAIN_PREDICT.
         STRUCT(5 AS top_k_features, 0.5 AS threshold)
     )
LIMIT 100
|
|
@ -1,68 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 Google LLC
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* https://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
-- Builds the analytical base table (ABT) view: event rows grouped by user,
-- session and session attributes, enriched with order-history features that
-- only consider orders created before the session start.
CREATE VIEW IF NOT EXISTS `{project_id}.{dataset}.ecommerce_abt` AS
WITH abt AS (
    -- Session attributes, earliest event timestamp and purchase-event count.
    SELECT user_id,
           session_id,
           city,
           postal_code,
           browser,
           traffic_source,
           min(created_at) AS session_starting_ts,
           sum(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) AS has_purchased
    FROM `bigquery-public-data.thelook_ecommerce.events`
    GROUP BY user_id,
             session_id,
             city,
             postal_code,
             browser,
             traffic_source
), previous_orders AS (
    -- Per user: array of (creation time, order id, status, order cost), where
    -- order cost is the sum of sale_price over the items of the order.
    SELECT o.user_id,
           array_agg(struct(o.created_at AS order_creations_ts,
                            o.order_id,
                            o.status,
                            oi.order_cost)) AS user_orders
    FROM `bigquery-public-data.thelook_ecommerce.orders` o
    INNER JOIN (SELECT order_id,
                       sum(sale_price) AS order_cost
                FROM `bigquery-public-data.thelook_ecommerce.order_items`
                GROUP BY order_id) oi
            ON o.order_id = oi.order_id
    GROUP BY o.user_id
)
SELECT abt.*,
       -- DAYOFWEEK returns 1 for Sunday and 7 for Saturday.
       CASE WHEN extract(DAYOFWEEK FROM session_starting_ts) IN (1, 7)
            THEN 'WEEKEND'
            ELSE 'WEEKDAY'
       END AS day_of_week,
       extract(HOUR FROM session_starting_ts) AS hour_of_day,
       (SELECT count(DISTINCT uo.order_id)
        FROM unnest(user_orders) uo
        WHERE uo.order_creations_ts < session_starting_ts
          AND uo.status IN ('Shipped', 'Complete', 'Processing')) AS number_of_successful_orders,
       -- Sum the cost once per distinct order. Applying DISTINCT to the cost
       -- itself would merge different orders that happen to have the same total.
       IFNULL((SELECT sum(po.order_cost)
               FROM (SELECT DISTINCT uo.order_id AS order_id,
                                     uo.order_cost AS order_cost
                     FROM unnest(user_orders) uo
                     WHERE uo.order_creations_ts < session_starting_ts
                       AND uo.status IN ('Shipped', 'Complete', 'Processing')) po), 0) AS sum_previous_orders,
       (SELECT count(DISTINCT uo.order_id)
        FROM unnest(user_orders) uo
        WHERE uo.order_creations_ts < session_starting_ts
          AND uo.status IN ('Cancelled', 'Returned')) AS number_of_unsuccessful_orders
FROM abt
LEFT JOIN previous_orders pso
       ON abt.user_id = pso.user_id
|
|
@ -1,27 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 Google LLC
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* https://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
-- Trains (or retrains, replacing any existing model of the same name) a
-- BigQuery ML model that predicts has_purchased.
-- Model type and evaluation split fraction are supplied as template parameters.
CREATE OR REPLACE MODEL `{project_id}.{dataset}.{model_name}`
OPTIONS (
    MODEL_TYPE               = '{model_type}',
    INPUT_LABEL_COLS         = ['has_purchased'],
    ENABLE_GLOBAL_EXPLAIN    = TRUE,
    MODEL_REGISTRY           = 'VERTEX_AI',
    DATA_SPLIT_METHOD        = 'RANDOM',
    DATA_SPLIT_EVAL_FRACTION = {split_fraction}
) AS
-- Identifier columns and the session timestamp are excluded from the features.
SELECT * EXCEPT (session_id, session_starting_ts, user_id)
-- NOTE(review): this reads ecommerce_abt_table, while the feature query creates
-- the view ecommerce_abt; confirm the table is produced by another step.
FROM `{project_id}.{dataset}.ecommerce_abt_table`
-- Training data is restricted to sessions starting in ISO year 2022.
WHERE extract(ISOYEAR FROM session_starting_ts) = 2022
|
Loading…
Reference in New Issue