diff --git a/README.md b/README.md index 65b9623..868349e 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ From sample dataset to activation, these componentized patterns are designed to ### Financial use cases * Fraud detection * How to build a real-time credit card fraud detection solution. ([Code][ccfraud_code] | [Blogpost][ccfraud_techblog] | [Video][ccfraud_video]) + * How to use explainable AI to understand a fraud prediction. ([Code][xai_code] | [Blogpost][xai_blog]) [gaming_propen_code]: gaming/propensity-model/bqml @@ -48,7 +49,8 @@ From sample dataset to activation, these componentized patterns are designed to [ccfraud_code]: https://gitlab.qdatalabs.com/uk-gtm/patterns/cc_fraud_detection/tree/master [ccfraud_techblog]: https://cloud.google.com/blog/products/data-analytics/how-to-build-a-fraud-detection-solution [ccfraud_video]: https://youtu.be/qQnxq3COr9Q - +[xai_code]: finance/explainable-fraud-model-bqml-looker +[xai_blog]: https://cloud.google.com/blog/products/data-analytics/explainable-ai-using-bigquery-machine-learning-and-looker
diff --git a/finance/explainable-fraud-model-bqml-looker/bqml_model.view b/finance/explainable-fraud-model-bqml-looker/bqml_model.view
new file mode 100644
index 0000000..e69de29
diff --git a/finance/explainable-fraud-model-bqml-looker/create_models.sql b/finance/explainable-fraud-model-bqml-looker/create_models.sql
new file mode 100644
index 0000000..202b462
--- /dev/null
+++ b/finance/explainable-fraud-model-bqml-looker/create_models.sql
@@ -0,0 +1,29 @@
+-- Train a logistic-regression classifier that predicts is_fraud from the
+-- feature columns of retail_banking.training_data.
+CREATE OR REPLACE MODEL retail_banking.fraud_prediction
+  OPTIONS(
+    model_type = 'logistic_reg',
+    -- input_label_cols is the documented name of the label-column option.
+    input_label_cols = ['is_fraud']
+  ) AS
+  -- Name the label and features explicitly so that columns added to
+  -- training_data later do not silently become model inputs. trans_id is
+  -- deliberately not selected.
+  SELECT
+    is_fraud,
+    card_transactions_amount,
+    card_transactions_transaction_distance,
+    card_transactions_transaction_hour_of_day
+  FROM retail_banking.training_data
+  -- Account for class imbalance: keep every fraud row and keep each other
+  -- row with probability equal to the overall fraud rate. RAND() makes the
+  -- sample differ between runs.
+  -- Alternatively, use AUTO_CLASS_WEIGHTS=True in the model options.
+  WHERE is_fraud IS TRUE
+    OR (
+      is_fraud IS NOT TRUE
+      AND RAND() <= (
+        SELECT COUNTIF(is_fraud) / COUNT(*)
+        FROM retail_banking.training_data
+      )
+    );
\ No newline at end of file
diff --git a/finance/explainable-fraud-model-bqml-looker/create_training_data.sql b/finance/explainable-fraud-model-bqml-looker/create_training_data.sql
new file mode 100644
index 0000000..15f27df
--- /dev/null
+++ b/finance/explainable-fraud-model-bqml-looker/create_training_data.sql
@@ -0,0 +1,41 @@
+-- Build the model's training table: one row per joined card transaction with
+-- the fraud label and three features (amount, distance from home, hour of day).
+CREATE OR REPLACE TABLE retail_banking.training_data AS (
+  SELECT
+    card_transactions.trans_id AS trans_id,
+    card_transactions.is_fraud AS is_fraud,
+
+    -- Amount of the transaction: higher amounts are more likely to be fraud.
+    CAST(card_transactions.amount AS FLOAT64) AS card_transactions_amount,
+
+    -- Distance (meters, per ST_DISTANCE) between the merchant and the customer's
+    -- home: further distances are more likely to be fraud. client.address is
+    -- pipe-delimited; fields 4 and 3 (zero-based) are read as longitude and
+    -- latitude. SAFE_OFFSET/SAFE_CAST yield a NULL distance for a missing field
+    -- or non-numeric value instead of aborting the whole table build.
+    ST_DISTANCE(
+      ST_GEOGPOINT(
+        CAST(card_transactions.merchant_lon AS FLOAT64),
+        CAST(card_transactions.merchant_lat AS FLOAT64)
+      ),
+      ST_GEOGPOINT(
+        SAFE_CAST(SPLIT(client.address, '|')[SAFE_OFFSET(4)] AS FLOAT64),
+        SAFE_CAST(SPLIT(client.address, '|')[SAFE_OFFSET(3)] AS FLOAT64)
+      )
+    ) AS card_transactions_transaction_distance,
+
+    -- Hour at which the transaction occurred: fraud occurs in the middle of the
+    -- night (usually between midnight and 4 am).
+    EXTRACT(
+      HOUR FROM TIMESTAMP(
+        CONCAT(card_transactions.trans_date, ' ', card_transactions.trans_time)
+      )
+    ) AS card_transactions_transaction_hour_of_day
+  FROM `looker-private-demo.retail_banking.card_transactions` AS card_transactions
+  LEFT JOIN `looker-private-demo.retail_banking.card` AS card
+    ON card.card_number = card_transactions.cc_number
+  LEFT JOIN `looker-private-demo.retail_banking.disp` AS disp
+    ON card.disp_id = disp.disp_id
+  LEFT JOIN `looker-private-demo.retail_banking.client` AS client
+    ON disp.client_id = client.client_id
+);
\ No newline at end of file
diff --git
a/finance/explainable-fraud-model-bqml-looker/explain_hypothetical_data.sql b/finance/explainable-fraud-model-bqml-looker/explain_hypothetical_data.sql
new file mode 100644
index 0000000..b66123f
--- /dev/null
+++ b/finance/explainable-fraud-model-bqml-looker/explain_hypothetical_data.sql
@@ -0,0 +1,31 @@
+-- Explain the fraud model's predictions for three hypothetical transactions.
+-- Field names mirror the feature columns of retail_banking.training_data;
+-- threshold is the cutoff applied to the predicted probability.
+SELECT *
+FROM ML.EXPLAIN_PREDICT(
+  MODEL retail_banking.fraud_prediction,
+  (
+    SELECT *
+    FROM UNNEST([
+      STRUCT(
+        '001' AS trans_id,
+        500.00 AS card_transactions_amount,
+        600 AS card_transactions_transaction_distance,
+        2 AS card_transactions_transaction_hour_of_day
+      ),
+      STRUCT(
+        '002' AS trans_id,
+        5.25 AS card_transactions_amount,
+        2 AS card_transactions_transaction_distance,
+        13 AS card_transactions_transaction_hour_of_day
+      ),
+      STRUCT(
+        '003' AS trans_id,
+        175.50 AS card_transactions_amount,
+        45 AS card_transactions_transaction_distance,
+        10 AS card_transactions_transaction_hour_of_day
+      )
+    ])
+  ),
+  STRUCT(0.55 AS threshold)
+)
\ No newline at end of file