Skip to content

Commit 2dcef73

Browse files
authored
Add feast example (#19)
* edit readme * move flight example into separate folder * add new feast example * edit gitignore * add imgs * minor edits * edit why page * fix branch name * edit home page * add new tutorial * remove separate data model page * minor change
1 parent 42b4e99 commit 2dcef73

32 files changed

+5878
-366
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
/docs/_build
2-
/source/.doctrees
2+
/source/.doctrees
3+
feature_repo/data/registry.db
4+
/encoder.bin
5+
/model.bin

app.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
import datetime
2+
from collections import OrderedDict
3+
4+
import numpy as np
5+
import pandas as pd
6+
import shap
7+
import streamlit as st
8+
from matplotlib import pyplot as plt
9+
10+
from credit_model import CreditScoringModel
11+
12+
# Render the app using the wide page layout.
st.set_page_config(layout="wide")

# The app only serves predictions, so it refuses to start without a
# fitted classifier.
model = CreditScoringModel()
model_ready = model.is_model_trained()
if not model_ready:
    raise Exception(
        "The credit scoring model has not been trained. Please run `python run.py`."
    )
16+
17+
18+
def get_loan_request():
    """Collect the loan application inputs from the Streamlit sidebar.

    Widgets are created in the order they should appear in the sidebar.

    Returns:
        OrderedDict mapping each request field name to a single-element
        list holding the value currently entered in the sidebar.

    Raises:
        ValueError: if the zip code text cannot be converted with ``int``.
    """
    sidebar = st.sidebar

    zip_text = sidebar.text_input("Zip code", "94109")
    birth_date = sidebar.date_input(
        "Date of birth", value=datetime.date(year=1986, day=19, month=3)
    )
    ssn_suffix = sidebar.text_input(
        "Last four digits of social security number", "3643"
    )
    # Key layout: <birth date as YYYYMMDD>_<last four SSN digits>.
    applicant_key = "_".join((birth_date.strftime("%Y%m%d"), str(ssn_suffix)))

    applicant_age = sidebar.slider("Age", 0, 130, 25)
    yearly_income = sidebar.slider("Yearly Income", 0, 1000000, 120000)

    ownership_choices = ("RENT", "MORTGAGE", "OWN")
    home_ownership = sidebar.selectbox(
        "Do you own or rent your home?", ownership_choices
    )

    months_employed = sidebar.slider(
        "How long have you been employed (months)?", 0, 120, 12
    )

    intent_choices = (
        "PERSONAL",
        "VENTURE",
        "HOMEIMPROVEMENT",
        "EDUCATION",
        "MEDICAL",
        "DEBTCONSOLIDATION",
    )
    intent = sidebar.selectbox("Why do you want to apply for a loan?", intent_choices)

    loan_amount = sidebar.slider("Loan amount", 0, 100000, 10000)
    interest_rate = sidebar.slider(
        "Preferred interest rate", 1.0, 25.0, 12.0, step=0.1
    )

    # Every value is wrapped in a one-element list (one entry per field).
    return OrderedDict(
        [
            ("zipcode", [int(zip_text)]),
            ("dob_ssn", [applicant_key]),
            ("person_age", [applicant_age]),
            ("person_income", [yearly_income]),
            ("person_home_ownership", [home_ownership]),
            ("person_emp_length", [float(months_employed)]),
            ("loan_intent", [intent]),
            ("loan_amnt", [loan_amount]),
            ("loan_int_rate", [interest_rate]),
        ]
    )
64+
65+
66+
# Application
67+
st.title("Loan Application")

# Input Side Bar
st.header("User input:")
loan_request = get_loan_request()
df = pd.DataFrame.from_dict(loan_request)
# A bare expression on its own line is rendered on the page by Streamlit.
df

# Full feature vector
st.header("Feature vector (user input + zipcode features + user features):")
vector = model._get_online_features_from_feast(loan_request)
# Keep the user-supplied fields first, then append the remaining features
# from the feature store in alphabetical order.
ordered_vector = loan_request.copy()
for vector_key in sorted(vector):
    if vector_key not in ordered_vector:
        ordered_vector[vector_key] = vector[vector_key]
df = pd.DataFrame.from_dict(ordered_vector)
df

# Results of prediction
st.header("Model prediction:")
result = model.predict(loan_request)

if result == 0:
    st.success("Your loan has been approved!")
elif result == 1:
    st.error("Your loan has been rejected!")


# Feature importance - todo
#
# NOTE: this disabled section is kept as `#` comments on purpose. Wrapping it
# in a bare triple-quoted string would leave a standalone string literal in
# the script, which Streamlit renders on the page instead of ignoring.
#
# st.header("Feature Importance")
# X = pd.read_parquet("feature_repo/data/training_dataset_sample.parquet")
# explainer = shap.TreeExplainer(model.classifier)
# shap_values = explainer.shap_values(X)
# left, mid, right = st.columns(3)
# with left:
#     plt.title("Feature importance based on SHAP values")
#     print(X.shape)
#     print(shap_values[3].shape)
#     shap.summary_plot(shap_values[1], X)
#     st.set_option("deprecation.showPyplotGlobalUse", False)
#     st.pyplot(bbox_inches="tight")
#     st.write("---")
#
# with mid:
#     plt.title("Feature importance based on SHAP values (Bar)")
#     shap.summary_plot(shap_values, X, plot_type="bar")
#     st.pyplot(bbox_inches="tight")

credit_model.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
from pathlib import Path
2+
3+
import feast
4+
import joblib
5+
import pandas as pd
6+
from sklearn import tree
7+
from sklearn.exceptions import NotFittedError
8+
from sklearn.preprocessing import OrdinalEncoder
9+
from sklearn.utils.validation import check_is_fitted
10+
11+
12+
class CreditScoringModel:
    """Credit scoring classifier fed by a Feast feature store.

    The classifier and the ordinal encoder are loaded from disk when their
    files exist; otherwise fresh, unfitted instances are created. Training
    writes both artifacts back to the same files.
    """

    # Columns passed through the ordinal encoder before fitting/predicting.
    categorical_features = [
        "person_home_ownership",
        "loan_intent",
        "city",
        "state",
        "location_type",
    ]

    # Feature references requested from Feast ("<feature view>:<feature>").
    feast_features = [
        "zipcode_features:city",
        "zipcode_features:state",
        "zipcode_features:location_type",
        "zipcode_features:tax_returns_filed",
        "zipcode_features:population",
        "zipcode_features:total_wages",
        "credit_history:credit_card_due",
        "credit_history:mortgage_due",
        "credit_history:student_loan_due",
        "credit_history:vehicle_loan_due",
        "credit_history:hard_pulls",
        "credit_history:missed_payments_2y",
        "credit_history:missed_payments_1y",
        "credit_history:missed_payments_6m",
        "credit_history:bankruptcies",
    ]

    # Label column, and the files used to persist the fitted artifacts.
    target = "loan_status"
    model_filename = "model.bin"
    encoder_filename = "encoder.bin"

    def __init__(self):
        """Load persisted artifacts if present and open the feature store."""
        # NOTE(review): joblib.load unpickles the file contents; only load
        # artifacts produced by a trusted training run.
        if not Path(self.model_filename).exists():
            self.classifier = tree.DecisionTreeClassifier()
        else:
            self.classifier = joblib.load(self.model_filename)

        if not Path(self.encoder_filename).exists():
            self.encoder = OrdinalEncoder()
        else:
            self.encoder = joblib.load(self.encoder_filename)

        # Feature store rooted at the local "feature_repo" directory.
        self.fs = feast.FeatureStore(repo_path="feature_repo")

    def train(self, loans):
        """Fit the classifier on ``loans`` and persist it to ``model_filename``.

        Args:
            loans: entity dataframe handed to Feast to build the training set.
        """
        features, labels = self._get_training_features(loans)

        # Columns are fed in sorted-name order.
        column_order = sorted(features)
        self.classifier.fit(features[column_order], labels)
        joblib.dump(self.classifier, self.model_filename)

    def _get_training_features(self, loans):
        """Build the training matrix and label vector for ``loans``.

        Fetches historical features from Feast, fits and applies the ordinal
        encoder, and removes the label, timestamp and identifier columns.

        Returns:
            Tuple ``(train_X, train_Y)``: the feature frame with columns in
            sorted-name order, and the ``target`` column.
        """
        retrieval = self.fs.get_historical_features(
            entity_df=loans, features=self.feast_features
        )
        training_df = retrieval.to_df()

        self._fit_ordinal_encoder(training_df)
        self._apply_ordinal_encoding(training_df)

        # Everything that is not a model input: label, timestamps, ids/keys.
        excluded = [
            self.target,
            "event_timestamp",
            "created_timestamp__",
            "loan_id",
            "zipcode",
            "dob_ssn",
        ]
        input_columns = training_df.columns.drop(excluded)
        train_X = training_df[input_columns]
        train_X = train_X.reindex(columns=sorted(train_X.columns))
        train_Y = training_df[self.target]

        return train_X, train_Y

    def _fit_ordinal_encoder(self, requests):
        """Fit the encoder on the categorical columns and persist it."""
        categorical = requests[self.categorical_features]
        self.encoder.fit(categorical)
        joblib.dump(self.encoder, self.encoder_filename)

    def _apply_ordinal_encoding(self, requests):
        """Replace the categorical columns of ``requests`` in place with their codes."""
        encoded = self.encoder.transform(requests[self.categorical_features])
        requests[self.categorical_features] = encoded

    def predict(self, request):
        """Score a single loan request.

        Args:
            request: mapping of field name to a one-element list; it must
                contain the "zipcode" and "dob_ssn" keys.

        Returns:
            The classifier's prediction for the first (only) row.
        """
        # Enrich the request with online features from Feast.
        online_features = self._get_online_features_from_feast(request)
        combined = request.copy()
        combined.update(online_features)
        features_df = pd.DataFrame.from_dict(combined)

        # Encode categoricals.
        self._apply_ordinal_encoding(features_df)

        # Put the columns in sorted-name order.
        features_df = features_df.reindex(columns=sorted(features_df.columns))

        # Drop the lookup keys, which are not model inputs.
        model_columns = features_df.columns.drop(["zipcode", "dob_ssn"])
        features_df = features_df[model_columns]

        features_df["prediction"] = self.classifier.predict(features_df)
        return features_df["prediction"].iloc[0]

    def _get_online_features_from_feast(self, request):
        """Fetch online features for the request's zipcode and dob_ssn.

        Returns:
            The Feast online response converted to a dict.
        """
        entity_row = {
            "zipcode": request["zipcode"][0],
            "dob_ssn": request["dob_ssn"][0],
        }
        response = self.fs.get_online_features(
            entity_rows=[entity_row],
            features=self.feast_features,
        )
        return response.to_dict()

    def is_model_trained(self):
        """Return True if the classifier has been fitted, False otherwise."""
        try:
            # A fitted decision tree exposes the ``tree_`` attribute.
            check_is_fitted(self.classifier, "tree_")
        except NotFittedError:
            return False
        else:
            return True
85.8 KB
Loading
52.4 KB
Loading
72.5 KB
Loading
37.5 KB
Loading
148 KB
Loading
Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
1-
# What is a feature store?
2-
Feature stores are central data stores to power operational machine learning models. They help you store transformed feature values in a scalable and performant database. Real-time inference requires features to be returned to applications with low latency at scale. This is where ScyllaDB can play a crucial role in your machine learning infrastructure.
1+
# Why ScyllaDB for feature stores?
2+
A feature store is a central data store that powers operational machine learning models. It helps you store transformed feature values in a scalable and performant database. Real-time inference requires features to be returned to applications with low latency at scale. This is where ScyllaDB can play a crucial role in your machine learning infrastructure.
33

44
[![](https://mermaid.ink/img/pako:eNptkLtuwzAMRX-F4NDJ_gEPBdq62bI0mWp7ICRaFqqHQckpgjj_XtVJOxTlxMe5xCUvqKJmbHB08VNNJBmObR-gxFN3FAppjOJZwwOQMcKGcik0ZRqgrh_hhr50B3V2jtpnqCEGZwNDylHIMAx3ZMPXHWc1QRaywQazLUqcV2i7ffHhfif_qUamvAjDiVXZnVZ4vYsSy-mvZte9Mbk6W89A85wGrNCzeLK6XHv5RnvME3vusSmpJvnosQ_XwtGS4-EcFDZZFq5wmYtPbi0ZIY_NSC6VLmtbbOxv79u-WOFM4T3GH-b6BaPtdAY?type=png)](https://mermaid.live/edit#pako:eNptkLtuwzAMRX-F4NDJ_gEPBdq62bI0mWp7ICRaFqqHQckpgjj_XtVJOxTlxMe5xCUvqKJmbHB08VNNJBmObR-gxFN3FAppjOJZwwOQMcKGcik0ZRqgrh_hhr50B3V2jtpnqCEGZwNDylHIMAx3ZMPXHWc1QRaywQazLUqcV2i7ffHhfif_qUamvAjDiVXZnVZ4vYsSy-mvZte9Mbk6W89A85wGrNCzeLK6XHv5RnvME3vusSmpJvnosQ_XwtGS4-EcFDZZFq5wmYtPbi0ZIY_NSC6VLmtbbOxv79u-WOFM4T3GH-b6BaPtdAY)
55

66

7-
## How can ScyllaDB help you build a feature store?
8-
ScyllaDB is a real-time high-throughput NoSQL database that is best suited for feature stores where you require low latency consistently, and need peta-byte scalability.
7+
## Why should you consider ScyllaDB as a feature store?
8+
ScyllaDB is a real-time NoSQL database that is best suited for feature store use cases where you require low latency (e.g. model serving), high throughput (e.g. training), and petabyte scalability.
99

10-
* **Low-latency**: ScyllaDB can provide <10 ms P99 latency. Low latency can speed up training time and leads to faster model development.
11-
* **High-throughput**: Training requires huge amounts of data and processing large datasets with many millions of operations per second - something that ScyllaDB excels at.
12-
* **Large-scale**: ScyllaDB can handle petabytes of data while still providing great performance.
10+
* **Low-latency**: ScyllaDB can provide <1 ms P99 latency. For real-time machine learning apps, an online feature store is required to meet strict latency requirements. ScyllaDB is an excellent choice for an online store (Read how [Medium is using ScyllaDB](https://medium.engineering/scylladb-implementation-lists-in-mediums-feature-store-part-2-905299c89392) as a feature store.)
11+
* **High-throughput**: Training requires querying huge amounts of data and processing large datasets with possibly millions of operations per second - something that ScyllaDB excels at.
12+
* **Large-scale**: ScyllaDB can handle petabytes of data while still keeping latency low and performance predictable.
13+
* **High availability**: ScyllaDB is a highly available database. With its distributed architecture, ScyllaDB keeps your feature store database always up and running no matter what.
14+
* **Easy migration**: ScyllaDB is compatible with the DynamoDB API and Cassandra, which means it's simple to migrate over from legacy solutions.
15+
* **Integration with Feast**: ScyllaDB integrates well with the popular open-source feature store framework, Feast. Example architecture with Feast and ScyllaDB:
16+
17+
![scylla feast architecture](/_static/img/scylla-feast.jpg)

docs/source/conf.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212

1313
# Builds documentation for the following tags and branches.
1414
TAGS = []
15-
BRANCHES = ["master"]
15+
BRANCHES = ["main"]
1616
# Sets the latest version.
17-
LATEST_VERSION = "master"
17+
LATEST_VERSION = "main"
1818
# Set which versions are not released yet.
1919
UNSTABLE_VERSIONS = [""]
2020
# Set which versions are deprecated
@@ -44,7 +44,7 @@
4444
master_doc = "index"
4545

4646
# General information about the project.
47-
project = "ScyllaDB feature store example docs"
47+
project = "Feature Store"
4848
copyright = str(date.today().year) + ", ScyllaDB. All rights reserved."
4949
author = u"ScyllaDB Project Contributors"
5050

@@ -109,7 +109,7 @@
109109
"github_issues_repository": "scylladb/scylladb-feature-store",
110110
"github_repository": "scylladb/scylladb-feature-store",
111111
"site_description": "ScyllaDB feature store example.",
112-
"hide_version_dropdown": ["master"],
112+
"hide_version_dropdown": ["main"],
113113
"versions_unstable": UNSTABLE_VERSIONS,
114114
"versions_deprecated": DEPRECATED_VERSIONS,
115115
}

0 commit comments

Comments
 (0)