-
Notifications
You must be signed in to change notification settings - Fork 100
snippets for xai bqml and looker post #72
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
-- Trains a logistic-regression classifier that predicts is_fraud from the
-- feature columns of retail_banking.training_data.
-- trans_id is dropped from the input so the row identifier is not used as a feature.
CREATE OR REPLACE MODEL retail_banking.fraud_prediction
OPTIONS (
  model_type = 'logistic_reg',
  -- input_label_cols is the documented option name for the label column.
  input_label_cols = ['is_fraud']
) AS
SELECT * EXCEPT (trans_id)
FROM retail_banking.training_data
-- Account for class imbalance: keep every fraud row and only a random sample
-- of the remaining rows. Alternatively, use AUTO_CLASS_WEIGHTS=TRUE in the
-- model options.
WHERE
  is_fraud IS TRUE
  OR (
    -- NOTE(review): IS NOT TRUE also matches rows whose is_fraud is NULL;
    -- confirm the label column has no NULLs.
    is_fraud IS NOT TRUE
    -- Each remaining row is kept with probability equal to the overall fraud
    -- rate. RAND() is unseeded, so the sample differs between runs.
    AND RAND() <= (
      SELECT COUNTIF(is_fraud) / COUNT(*)
      FROM retail_banking.training_data
    )
  );
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
|
||
-- Builds the training table for the fraud model: one output row per joined
-- card transaction, carrying the label (is_fraud) and three derived features.
CREATE OR REPLACE TABLE retail_banking.training_data AS (
  SELECT
    card_transactions.trans_id AS trans_id,
    card_transactions.is_fraud AS is_fraud,

    -- Amount of the transaction: higher amounts are more likely to be fraud.
    CAST(card_transactions.amount AS FLOAT64) AS card_transactions_amount,

    -- Distance from the customer's home: larger distances are more likely to
    -- be fraud. ST_GEOGPOINT takes (longitude, latitude), so fields 4 and 3
    -- (zero-based) of the pipe-delimited client.address are passed as the
    -- home longitude and latitude.
    -- NOTE(review): the address layout is not visible here; confirm the
    -- field order against the source data.
    ST_DISTANCE(
      ST_GEOGPOINT(
        CAST(card_transactions.merchant_lon AS FLOAT64),
        CAST(card_transactions.merchant_lat AS FLOAT64)
      ),
      ST_GEOGPOINT(
        CAST(SPLIT(client.address, '|')[OFFSET(4)] AS FLOAT64),
        CAST(SPLIT(client.address, '|')[OFFSET(3)] AS FLOAT64)
      )
    ) AS card_transactions_transaction_distance,

    -- Hour in which the transaction occurred: fraud occurs in the middle of
    -- the night (usually between midnight and 4 am).
    -- CONCAT is kept (rather than ||) because it also accepts non-STRING
    -- operands; the types of trans_date / trans_time are not visible here.
    EXTRACT(
      HOUR FROM TIMESTAMP(
        CONCAT(
          card_transactions.trans_date,
          ' ',
          card_transactions.trans_time
        )
      )
    ) AS card_transactions_transaction_hour_of_day

  FROM `looker-private-demo.retail_banking.card_transactions` AS card_transactions
  -- LEFT JOINs keep transactions that have no matching card, disposition or
  -- client; their distance feature is then NULL.
  LEFT JOIN `looker-private-demo.retail_banking.card` AS card
    ON card.card_number = card_transactions.cc_number
  LEFT JOIN `looker-private-demo.retail_banking.disp` AS disp
    ON card.disp_id = disp.disp_id
  LEFT JOIN `looker-private-demo.retail_banking.client` AS client
    ON disp.client_id = client.client_id
);
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
-- Scores three hand-written sample transactions with the fraud model and
-- returns the prediction together with per-feature attributions.
SELECT *
FROM
  ML.EXPLAIN_PREDICT(
    MODEL retail_banking.fraud_prediction,
    (
      SELECT *
      FROM
        UNNEST(
          ARRAY<
            STRUCT<
              trans_id STRING,
              card_transactions_amount FLOAT64,
              card_transactions_transaction_distance INT64,
              card_transactions_transaction_hour_of_day INT64
            >
          >[
            -- Sample with the largest amount and distance of the three,
            -- made at 2 am.
            ('001', 500.00, 600, 2),
            -- Sample with a small amount and a short distance, made at 1 pm.
            ('002', 5.25, 2, 13),
            -- Sample with a mid-range amount and distance, made at 10 am.
            ('003', 175.50, 45, 10)
          ]
        )
    ),
    -- Probability cutoff passed to the model for the positive class.
    STRUCT(0.55 AS threshold)
  )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ST_GeogPoint seems to use camelcase unexpectedly -- perhaps all caps would be better?