car-loan-optimization/supplementary-and-appendices.md at master · elijah-medina/car-loan-optimization

import numpy as np
import matplotlib.pyplot as plt
import mglearn
import pandas as pd
import pylab as plot
import lmfit as lf
import random
import seaborn as sns

%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import warnings
warnings.simplefilter(action='ignore')

pd.options.display.float_format = '{:,.2g}'.format

C:\Users\63917\Anaconda3\lib\site-packages\sklearn\externals\six.py:31: DeprecationWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).
  "(https://pypi.org/project/six/).", DeprecationWarning)
C:\Users\63917\Anaconda3\lib\site-packages\sklearn\externals\joblib\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
  warnings.warn(msg, category=DeprecationWarning)

Framework:

EDA - look for inconsistencies with respect to the competition rate
Total lost money for Outcome = 0
- Calculate $\sum \text{Amount} \times \frac{\text{APR} - \text{Cost of Funds}}{100} \times \frac{\text{Term}}{12}$
Targeting & Segmentation (James) Cluster by Loan Terms
- inconsistent outcome
- low outcome
Maximize interest (best combination of term & APR to make outcome = 1) - ML Model (Elijah, Angela, Ria)
Simulate declined to recalculate potential profit

Your task is to help Nomis Solutions to study and explore e-Car's data and to help them highlight potential inefficiencies in e-Car's current model and convince the company that there was value to be captured.

Guide on approaching the task:

Perform exploratory data analysis.
How will you show that the current pricing technique contains "pricing errors"?
- How do we know that there are "pricing errors"? What are we trying to maximize in this project?
How can we make the analysis more manageable? Should we segment the customers?
How should we price the loans?
- Do you think you can recommend the right price to quote?
- Has there been a mis-pricing of APR quotes?
Can we build a systematic approach that can scale with the number of segments given certain customer characteristics?

# Load the loan-application workbook; cells holding a single space are read as missing values.
df = pd.read_excel("NomisB.xlsx", na_values=' ')

Data Segmentation by Car Types

# 'N' segment: kept raw here; it is balanced and encoded in the following cells.
df_N = df.loc[df['Car  Type'] == 'N']

# 'U' segment: zero-fill gaps, one-hot encode the partner bin and drop the
# columns that are not used as predictors for this segment.
df_U = (
    df.loc[df['Car  Type'] == 'U']
    .fillna(0)
    .pipe(pd.get_dummies, columns=['Partner Bin'])
    .drop(columns=['Approve Date', 'Car  Type', 'Previous Rate'])
)
df_U_target = df_U.loc[:, 'Outcome']
df_U_features = df_U.drop(columns='Outcome')

# 'R' segment: same preparation, but 'Previous Rate' stays in as a predictor.
df_R = (
    df.loc[df['Car  Type'] == 'R']
    .fillna(0)
    .pipe(pd.get_dummies, columns=['Partner Bin'])
    .drop(columns=['Approve Date', 'Car  Type'])
)
df_R_target = df_R.loc[:, 'Outcome']
df_R_features = df_R.drop(columns='Outcome')

import random

# Balance the 'N' segment by drawing 9,000 rows per outcome class without
# replacement. random.sample raises ValueError if a class has fewer rows.
df_a = df_N[df_N['Outcome']==0].reset_index()
data_pts = df_a.shape[0]
rand_pts = random.sample(range(data_pts), 9000)
df_a = df_a.iloc[rand_pts]
df_b = df_N[df_N['Outcome']==1].reset_index()
data_pts = df_b.shape[0]
rand_pts = random.sample(range(data_pts), 9000)
df_b = df_b.iloc[rand_pts]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two samples
# the same way. The helper index columns created by the two reset_index calls
# ('index' and 'level_0') are dropped again afterwards.
df_N = pd.concat([df_a, df_b]).reset_index().drop(['level_0','index'], axis=1)

df_N = df_N.fillna(0)
df_N.shape

(119059, 12)

# One-hot encode the partner bin, then discard the columns that are not used
# as predictors for the 'N' segment.
df_N = pd.get_dummies(df_N, columns=['Partner Bin']).drop(
    columns=['Approve Date', 'Car  Type', 'Previous Rate']
)

df_N.head()

	Tier	FICO	Term	Amount	Competition rate	Outcome	Rate	Cost of Funds	Partner Bin_1	Partner Bin_3
0	1	743	36	1.9e+04	5	1	4.8	1.8	1	0
1	1	752	60	3.6e+04	5.7	1	5.5	1.8	1	0
3	2	724	60	1.9e+04	5.7	1	5.4	1.8	0	1
4	2	700	72	2.4e+04	6.2	1	7	1.8	0	1
8	1	779	72	2e+04	6.2	1	6.6	1.8	1	0

# Separate the predictors from the label column.
df_N_features = df_N.drop('Outcome', axis=1)

df_N_target = df_N.loc[:, 'Outcome']

import matplotlib.pyplot as plt

# One distribution plot per column of df_N, laid out row by row on a 2 x 7 grid.
fig, axes = plt.subplots(nrows=2, ncols=7, figsize=(15, 10))
for i, column in enumerate(df_N.columns):
    sns.distplot(df_N[column], ax=axes[i // 7, i % 7])

# The title does not depend on the column, so it is set once after the loop
# instead of being re-applied on every pass.
plt.suptitle('Distribution of Numeric Features', fontsize=18, y=1);

# Compute the correlation matrix
corr = df_N.corr()

# Generate a mask for the upper triangle.
# The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(14, 7))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});
ax.axes.set_title("Correlation Matrix", fontsize=18, y=1.01);

Testing of ML algorithms

def knn_class(n_iter, n_nbrs, t_size, df_feat, df_target, best_feat_how=None, scaler=None, **best_feat_kwargs):
    """Estimate k-NN classification accuracy over repeated random splits.

    For each of ``n_iter`` random train/test splits a ``KNeighborsClassifier``
    is fitted for every k in ``range(1, min(n_nbrs, len(X_train)))`` and scored
    on the held-out part. Test accuracies are averaged over the splits and the
    k with the highest mean test accuracy is reported.

    Parameters
    ----------
    n_iter : int
        Number of random train/test splits.
    n_nbrs : int
        Exclusive upper bound on the number of neighbours tried.
    t_size : float or int
        Passed to ``train_test_split`` as ``test_size``.
    df_feat : pandas.DataFrame
        Feature table; its column names drive the optional feature search.
    df_target : array-like
        Class labels aligned with ``df_feat``.
    best_feat_how : {None, 'indiv', 'without'}, optional
        ``None`` skips the feature search. ``'indiv'`` re-runs the evaluation
        on each feature alone and keeps the one with the highest accuracy.
        ``'without'`` re-runs it with each feature left out and keeps the one
        whose removal gives the lowest accuracy.
    scaler : {'ss', 'mm', None}, optional
        ``'ss'`` applies ``StandardScaler``, ``'mm'`` applies ``MinMaxScaler``;
        any other value leaves the features unscaled.
    **best_feat_kwargs
        ``n_iter_best_feat`` and ``n_nbrs_best_feat`` override ``n_iter`` and
        ``n_nbrs`` for the feature-search runs.

    Returns
    -------
    list
        ``['KNN Classifier', best mean test accuracy in percent, 'k = <best k>',
        scaler]``, with the selected feature name appended when
        ``best_feat_how`` is set.
    """
    test_accuracy_list = []
    for _ in range(n_iter):
        X_train, X_test, y_train, y_test = train_test_split(df_feat, df_target, test_size=t_size)

        # The scaler is fitted on the training part only and then applied to
        # the test part, so the test rows never influence the scaling.
        if scaler == 'ss':
            scale = StandardScaler()
            X_train = scale.fit_transform(X=X_train)
            X_test = scale.transform(X=X_test)
        elif scaler == 'mm':
            scale = MinMaxScaler()
            X_train = scale.fit_transform(X_train)
            X_test = scale.transform(X_test)

        test_accuracy = []
        for n_neighbors in range(1, min(n_nbrs, len(X_train))):
            clf = KNeighborsClassifier(n_neighbors=n_neighbors)
            clf.fit(X_train, y_train)
            # Only the generalization accuracy feeds the result; training-set
            # scoring was dropped because nothing consumed it.
            test_accuracy.append(clf.score(X_test, y_test))

        test_accuracy_list.append(test_accuracy)

    test_mean = np.array(test_accuracy_list).mean(axis=0)

    # Position 0 of test_mean corresponds to k = 1.
    neighbors_settings_max = np.argmax(test_mean) + 1
    test_mean_max = np.round(np.max(test_mean), 4)
    summary = ['KNN Classifier', test_mean_max*100, f'k = {neighbors_settings_max}', scaler]
    if not best_feat_how:
        return summary

    niters = best_feat_kwargs.get('n_iter_best_feat', n_iter)
    nnbrs = best_feat_kwargs.get('n_nbrs_best_feat', n_nbrs)
    acc_best = 0 if best_feat_how == 'indiv' else 100
    feat_best = 'placeholder'
    for feat in df_feat.columns:
        if best_feat_how == 'indiv':
            cols = [feat]
        else:
            cols = [name for name in df_feat.columns if name != feat]
        # Evaluate the candidate subset under the caller's test share and
        # scaler so its accuracy is comparable with the headline result.
        result = knn_class(niters, nnbrs, t_size, df_feat[cols], df_target, scaler=scaler)
        better_alone = best_feat_how == 'indiv' and result[1] > acc_best
        worse_without = best_feat_how == 'without' and result[1] < acc_best
        if better_alone or worse_without:
            acc_best = result[1]
            feat_best = feat

    return summary + [feat_best]

# Baseline k-NN on the 'N' segment: 10 random splits, k = 1..29, 25% test share, no scaling, no feature search.
knn_class(10, 30, 0.25, df_N_features, df_N_target, best_feat_how=None)

['KNN Classifier', 75.44, 'k = 11', None]

def pcc(df_target):
    """Return 1.25 times the proportional chance criterion of the labels.

    The criterion is the sum of squared class shares in ``df_target``; the
    scaled value is rounded to four decimals.
    """
    _, class_counts = np.unique(df_target, return_counts=True)
    class_shares = class_counts / class_counts.sum()
    chance_accuracy = np.sum(class_shares ** 2)
    return np.round(1.25 * chance_accuracy, 4)

# 1.25 x proportional chance criterion for the 'R' segment labels, as a reference level for the accuracies.
pcc(df_R_target)

0.6618

def logistic(df_feat, df_target, n_iter, t_size, reg, scaler, top_feats=1):
    """Tune the inverse regularisation strength of a logistic regression.

    For ``n_iter`` seeded train/test splits a ``LogisticRegression`` is fitted
    for every value in a fixed grid of ``C``. Test accuracies are averaged
    over the splits and the ``C`` with the highest mean test accuracy wins.
    When a scaler is given, the coefficients and intercept are also averaged
    over the splits for each ``C``.

    Parameters
    ----------
    df_feat : pandas.DataFrame
        Feature table; its column names label the coefficients.
    df_target : array-like
        Class labels aligned with ``df_feat``.
    n_iter : int
        Number of splits. Seeds are drawn without replacement from
        ``range(100)``, so ``random.sample`` raises ``ValueError`` above 100.
    t_size : float or int
        Passed to ``train_test_split`` as ``test_size``.
    reg : str
        Penalty passed to ``LogisticRegression`` (e.g. ``'l1'`` or ``'l2'``).
    scaler : {'ss', 'mm', None}
        ``'ss'`` applies ``StandardScaler``, ``'mm'`` applies ``MinMaxScaler``;
        any other value leaves the features unscaled.
    top_feats : int, optional
        Number of largest-magnitude coefficients to report.

    Returns
    -------
    list
        ``[f'Logistic {reg}', best mean test accuracy in percent,
        'C = <best C>', scaler]``. When ``scaler`` is truthy two items are
        appended: the ``top_feats`` ``(feature, averaged coefficient)`` pairs
        in descending order of magnitude, and the tuple
        ``(averaged coefficients, averaged intercept)`` for the best ``C``.
    """
    C = [1e-8, 1e-4, 1e-3, 1e-2, 0.1, 0.2,0.4, 0.75, 1, 1.5, 3, 5, 10, 15,  20, 100, 300, 1000, 5000]

    score_train = []
    score_test = []
    random_states = random.sample(range(100), n_iter)

    best_feats = {i:np.zeros(len(df_feat.columns)) for i in C}
    best_inter = {i:0 for i in C}
    for seed in random_states:
        training_accuracy = []
        test_accuracy = []
        X_train, X_test, y_train, y_test = train_test_split(df_feat, df_target, test_size=t_size, random_state=seed)

        # The scaler is fitted on the training part only and then applied to
        # the test part.
        if scaler == 'ss':
            scale = StandardScaler()
            X_train = scale.fit_transform(X = X_train)
            X_test = scale.transform(X = X_test)
        elif scaler == 'mm':
            scale = MinMaxScaler()
            X_train = scale.fit_transform(X_train)
            X_test = scale.transform(X_test)

        for alpha_run in C:
            # liblinear is named explicitly: it supports both 'l1' and 'l2',
            # whereas the default solver of newer scikit-learn releases
            # (lbfgs) rejects penalty='l1'.
            lr = LogisticRegression(C=alpha_run, penalty=reg, solver='liblinear').fit(X_train, y_train)
            training_accuracy.append(lr.score(X_train, y_train))
            test_accuracy.append(lr.score(X_test, y_test))
            if scaler:
                # Running average over the splits, one accumulator per C.
                coefs = np.mean(lr.coef_, axis=0)
                inter = np.mean(lr.intercept_, axis=0)
                best_feats[alpha_run] += coefs / n_iter
                best_inter[alpha_run] += inter / n_iter

        score_train.append(training_accuracy)
        score_test.append(test_accuracy)

    score = np.mean(score_test, axis=0)
    best_C = C[np.argmax(score)]
    summary = [f'Logistic {reg}', np.round(np.amax(score)*100, 4), f'C = {best_C}', scaler]
    if not scaler:
        return summary

    feat_coeffs = best_feats[best_C]
    intercept = best_inter[best_C]
    # Positions of the top_feats largest-magnitude coefficients, largest first.
    top_order = np.argsort(np.abs(feat_coeffs))[-top_feats:][::-1]
    return summary + [list(zip(df_feat.columns[top_order], feat_coeffs[top_order])),
                      (feat_coeffs, intercept)]

# L1-penalised logistic regression on the 'N' segment: 20 splits, 25% test share, min-max scaling, ten largest coefficients reported.
n = logistic(df_N_features, df_N_target, 20, 0.25, 'l1', 'mm', top_feats=10)

['Logistic l1',
 92.3476,
 'C = 3',
 'mm',
 [('Amount', -9.098392587704907),
  ('Rate', -3.4135979440681745),
  ('Term', 1.8046593255352945),
  ('Cost of Funds', 1.086797954491032),
  ('FICO', -0.663274598966211),
  ('Partner Bin_2', -0.5807589355467511),
  ('Tier', -0.39555558541910035),
  ('Partner Bin_1', 0.3437602061753951),
  ('Competition rate', -0.17793518989959722),
  ('Partner Bin_3', -0.1215966566339388)],
 (array([-0.39555559, -0.6632746 ,  1.80465933, -9.09839259, -0.17793519,
         -3.41359794,  1.08679795,  0.34376021, -0.58075894, -0.12159666]),
  -0.04370093455394698)]

Simulate Logistic Regression

def y_func(coeff, inter, X):
    """Return the linear score ``X . coeff + inter`` for every row of ``X``."""
    weighted_sum = np.dot(X, coeff.T)
    return weighted_sum + inter

# The last item returned by logistic() is the pair (averaged coefficients, averaged intercept).
coeff, inter = n[5]

# Min-max scale the whole 'N' feature matrix before scoring it with the averaged model.
X = MinMaxScaler().fit_transform(df_N_features.to_numpy())
X

array([[0.        , 0.57751938, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.6124031 , 0.66666667, ..., 1.        , 0.        ,
        0.        ],
       [0.33333333, 0.50387597, 0.66666667, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.66666667, 0.63953488, 0.66666667, ..., 1.        , 0.        ,
        0.        ],
       [0.33333333, 0.44573643, 0.33333333, ..., 0.        , 1.        ,
        0.        ],
       [0.66666667, 0.76356589, 0.66666667, ..., 0.        , 0.        ,
        1.        ]])

# Linear score (log-odds) of every row under the averaged coefficients and intercept.
y = y_func(coeff, inter, X)
y

array([-2.02880516, -2.70801419, -1.63324091, ..., -3.02602811,
       -4.11357686, -2.80025046])

def p(y):
    """Map a linear score ``y`` to a probability with the logistic sigmoid."""
    denominator = 1 + np.exp(-y)
    return 1 / denominator

# Sigmoid of the scores against the scores themselves, coloured by the recorded outcome.
plt.scatter(y, p(y), c=df_N_target.to_numpy(), alpha=0.9)

<matplotlib.collections.PathCollection at 0x2182c1fcb38>

# Boolean mask of the rows with Outcome == 0. It is used as a plain array:
# wrapped in a list, current NumPy reads it as a 2-D index and raises IndexError.
c = df_N_target.to_numpy() == 0
plt.scatter(y[c], p(y)[c])

<matplotlib.collections.PathCollection at 0x2182bb430b8>

# Boolean mask of the rows with Outcome == 1. It is used as a plain array:
# wrapped in a list, current NumPy reads it as a 2-D index and raises IndexError.
c = df_N_target.to_numpy() == 1
plt.scatter(y[c], p(y)[c])

<matplotlib.collections.PathCollection at 0x2182d4944e0>

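$\hat{y} = $
$-9.135 \times \text{Amount}$
$-3.426 \times \text{Rate}$
$+1.811 \times \text{Term}$
$+1.096 \times \text{Cost of Funds}$
$-0.665 \times \text{FICO}$
$-0.622 \times \text{Partner Bin\_2}$
$-0.388 \times \text{Tier}$
$+0.312 \times \text{Partner Bin\_1}$
$-0.188 \times \text{Competition rate}$
$-0.151 \times \text{Partner Bin\_3}$
$+0.037$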

# Print the fitted linear score as markdown-ready lines, largest-magnitude
# coefficient first. The raw string keeps the LaTeX backslash literal without
# relying on the invalid '\h' escape, which newer Python versions warn about.
print(r'$\hat{y} = $')
order = np.argsort(np.abs(coeff))[::-1]
for i, j in zip(df_N_features.columns[order], coeff[order]):
    print(f'<br>${j:.3f} x {i}$')
print(f'\t{inter:.3f}')

$\hat{y} = $
<br>$-9.135 x Amount$
<br>$-3.426 x Rate$
<br>$1.811 x Term$
<br>$1.096 x Cost of Funds$
<br>$-0.665 x FICO$
<br>$-0.622 x Partner Bin_2$
<br>$-0.388 x Tier$
<br>$0.312 x Partner Bin_1$
<br>$-0.188 x Competition rate$
<br>$-0.151 x Partner Bin_3$
	0.037

import seaborn as sns

# Compare the predicted-probability distributions of the two outcome classes
# in the 'N' segment.
fig = plt.figure(figsize=(16,9))
# Plain boolean masks: a list-wrapped mask is read as a 2-D index by current
# NumPy and raises IndexError.
c0 = df_N_target.to_numpy() == 0
c1 = df_N_target.to_numpy() == 1
sns.distplot(p(y)[c0], label='Rejected')
sns.distplot(p(y)[c1], label='Availed')
plt.legend()
# Raw string so the LaTeX backslash is not parsed as an (invalid) escape.
plt.xlabel(r'$P(\hat{y})$')
plt.ylabel('Distribution')
plt.title('Logit Probability Distribution of Availed vs Rejected Loans')

Text(0.5, 1.0, 'Logit Probability Distribution of Availed vs Rejected Loans')

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# 3-D view of the predicted probability against loan amount and rate for the
# 'N' segment, coloured by the recorded outcome.
fig = plt.figure(figsize=(16,9))
ax = plt.axes(projection='3d')
# Plot at most 18,000 randomly chosen rows; capping at len(y) keeps
# random.sample from raising ValueError when fewer rows are available.
rand = random.sample(range(len(y)), min(18000, len(y)))
ax1 = ax.scatter3D(df_N_features['Amount'].to_numpy()[rand], df_N_features['Rate'].to_numpy()[rand], p(y)[rand],
             c=df_N_target.to_numpy()[rand], alpha=0.5)
ax.view_init(azim=30)
ax.set_xlabel('Amount (USD)')
ax.set_ylabel('Rate')
# Raw string so the LaTeX backslash is not parsed as an (invalid) escape.
ax.set_zlabel(r'$P(\hat{y})$')
ax.legend(*ax1.legend_elements(), title='Rejected/Availed');

# Simulate rate cuts on the 'N' segment and plot how the predicted
# probability of the Outcome == 0 rows shifts.
fig = plt.figure(figsize=(16,9))
ratedel = [0, 20, 70]
# Plain boolean masks: a list-wrapped mask is read as a 2-D index by current
# NumPy and raises IndexError.
c0 = df_N_target.to_numpy() == 0
c1 = df_N_target.to_numpy() == 1

raw_features = df_N_features.to_numpy().astype(float)
rate_col = df_N_features.columns.values.tolist().index('Rate')
# Fit the scaler once on the unmodified data so every scenario is mapped with
# the same min/max.
full_scaler = MinMaxScaler().fit(raw_features)
for reduction in ratedel:
    # Cut the actual rate by the given percentage and only then scale.
    # Shrinking the already min-max-scaled column would instead shrink the
    # distance to the minimum rate, which is not a percentage rate cut.
    x = raw_features.copy()
    x[:, rate_col] *= (100 - reduction) / 100
    x = full_scaler.transform(x)
    ypr = y_func(coeff, inter, x)
    sns.distplot(p(ypr)[c0], label=str(reduction)+'% Reduction')

plt.legend()
# Raw string so the LaTeX backslash is not parsed as an (invalid) escape.
plt.xlabel(r'$P(\hat{y})$')
plt.ylabel('Distribution')
plt.title('Shifted Logit Probability Distribution of Rejected Loans')

Text(0.5, 1.0, 'Shifted Logit Probability Distribution of Rejected Loans')

# L1-penalised logistic regression on the 'U' segment, same settings as for 'N'.
u = logistic(df_U_features, df_U_target, 20, 0.25, 'l1', 'mm', top_feats=10)

# The last item returned by logistic() is (averaged coefficients, averaged intercept).
coeff = u[5][0]
inter = u[5][1]
X = df_U_features.to_numpy()
X = MinMaxScaler().fit_transform(X)
y = y_func(coeff, inter, X)

# Print the fitted linear score, largest-magnitude coefficient first. The
# names come from df_U_features, the table this model was trained on, rather
# than from the 'N' feature table. The raw string keeps the LaTeX backslash
# literal without relying on the invalid '\h' escape.
print(r'$\hat{y} = $')
order = np.argsort(np.abs(coeff))[::-1]
for i, j in zip(df_U_features.columns[order], coeff[order]):
    print(f'<br>${j:.3f} x {i}$')
print(f'\t{inter:.3f}')

$\hat{y} = $
<br>$-13.024 x Amount$
<br>$-9.702 x Rate$
<br>$1.989 x Term$
<br>$1.924 x Partner Bin_1$
<br>$1.749 x Tier$
<br>$1.546 x Partner Bin_3$
<br>$1.362 x Competition rate$
<br>$1.043 x Partner Bin_2$
<br>$-1.010 x FICO$
<br>$-0.193 x Cost of Funds$
	1.279

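$\hat{y} = $
$-13.024 \times \text{Amount}$
$-9.702 \times \text{Rate}$
$+1.989 \times \text{Term}$
$+1.924 \times \text{Partner Bin\_1}$
$+1.749 \times \text{Tier}$
$+1.546 \times \text{Partner Bin\_3}$
$+1.362 \times \text{Competition rate}$
$+1.043 \times \text{Partner Bin\_2}$
$-1.010 \times \text{FICO}$
$-0.193 \times \text{Cost of Funds}$
$+1.279$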

import seaborn as sns

# Compare the predicted-probability distributions of the two outcome classes
# in the 'U' segment.
fig = plt.figure(figsize=(16,9))
# Plain boolean masks: a list-wrapped mask is read as a 2-D index by current
# NumPy and raises IndexError.
c0 = df_U_target.to_numpy() == 0
c1 = df_U_target.to_numpy() == 1
sns.distplot(p(y)[c0], label='Rejected')
sns.distplot(p(y)[c1], label='Availed')
plt.legend()
# Raw string so the LaTeX backslash is not parsed as an (invalid) escape.
plt.xlabel(r'$P(\hat{y})$')
plt.ylabel('Distribution')
plt.title('Logit Probability Distribution of Availed vs Rejected Loans')

Text(0.5, 1.0, 'Logit Probability Distribution of Availed vs Rejected Loans')

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# 3-D view of the predicted probability against loan amount and rate for the
# 'U' segment, coloured by the recorded outcome.
fig = plt.figure(figsize=(16,9))
ax = plt.axes(projection='3d')
# Plot at most 18,000 randomly chosen rows; capping at len(y) keeps
# random.sample from raising ValueError when fewer rows are available.
rand = random.sample(range(len(y)), min(18000, len(y)))
ax1 = ax.scatter3D(df_U_features['Amount'].to_numpy()[rand], df_U_features['Rate'].to_numpy()[rand], p(y)[rand],
             c=df_U_target.to_numpy()[rand], alpha=0.5)
ax.view_init(azim=30)
ax.set_xlabel('Amount (USD)')
ax.set_ylabel('Rate')
# Raw string so the LaTeX backslash is not parsed as an (invalid) escape.
ax.set_zlabel(r'$P(\hat{y})$')
ax.legend(*ax1.legend_elements(), title='Rejected/Availed');

# Simulate rate cuts on the 'U' segment and plot how the predicted
# probability of the Outcome == 0 rows shifts.
fig = plt.figure(figsize=(16,9))
ratedel = [0, 20, 50]
# Plain boolean masks: a list-wrapped mask is read as a 2-D index by current
# NumPy and raises IndexError.
c0 = df_U_target.to_numpy() == 0
c1 = df_U_target.to_numpy() == 1

raw_features = df_U_features.to_numpy().astype(float)
rate_col = df_U_features.columns.values.tolist().index('Rate')
# Fit the scaler once on the unmodified data so every scenario is mapped with
# the same min/max.
full_scaler = MinMaxScaler().fit(raw_features)
for reduction in ratedel:
    # Cut the actual rate by the given percentage and only then scale.
    # Shrinking the already min-max-scaled column would instead shrink the
    # distance to the minimum rate, which is not a percentage rate cut.
    x = raw_features.copy()
    x[:, rate_col] *= (100 - reduction) / 100
    x = full_scaler.transform(x)
    ypr = y_func(coeff, inter, x)
    sns.distplot(p(ypr)[c0], label=str(reduction)+'% Reduction')

plt.legend()
# Raw string so the LaTeX backslash is not parsed as an (invalid) escape.
plt.xlabel(r'$P(\hat{y})$')
plt.ylabel('Distribution')
plt.title('Shifted Logit Probability Distribution of Rejected Loans')

Text(0.5, 1.0, 'Shifted Logit Probability Distribution of Rejected Loans')

# L1-penalised logistic regression on the 'R' segment, same settings as for 'N'.
r = logistic(df_R_features, df_R_target, 20, 0.25, 'l1', 'mm', top_feats=10)

# The last item returned by logistic() is (averaged coefficients, averaged intercept).
coeff = r[5][0]
inter = r[5][1]
X = df_R_features.to_numpy()
X = MinMaxScaler().fit_transform(X)
y = y_func(coeff, inter, X)

# Print the fitted linear score, largest-magnitude coefficient first. The raw
# string keeps the LaTeX backslash literal without relying on the invalid
# '\h' escape, which newer Python versions warn about.
print(r'$\hat{y} = $')
order = np.argsort(np.abs(coeff))[::-1]
for i, j in zip(df_R_features.columns[order], coeff[order]):
    print(f'<br>${j:.3f} x {i}$')
print(f'<br>{inter:.3f}')

$\hat{y} = $
<br>$-5.366 x Rate$
<br>$4.230 x Previous Rate$
<br>$-2.067 x Amount$
<br>$-1.878 x FICO$
<br>$-1.078 x Partner Bin_2$
<br>$1.065 x Competition rate$
<br>$-0.803 x Tier$
<br>$0.646 x Partner Bin_1$
<br>$0.590 x Term$
<br>$0.455 x Cost of Funds$
<br>$0.432 x Partner Bin_3$
<br>0.082

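$\hat{y} = $
$-5.366 \times \text{Rate}$
$+4.230 \times \text{Previous Rate}$
$-2.067 \times \text{Amount}$
$-1.878 \times \text{FICO}$
$-1.078 \times \text{Partner Bin\_2}$
$+1.065 \times \text{Competition rate}$
$-0.803 \times \text{Tier}$
$+0.646 \times \text{Partner Bin\_1}$
$+0.590 \times \text{Term}$
$+0.455 \times \text{Cost of Funds}$
$+0.432 \times \text{Partner Bin\_3}$
$+0.082$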

import seaborn as sns

# Compare the predicted-probability distributions of the two outcome classes
# in the 'R' segment.
fig = plt.figure(figsize=(16,9))
# Plain boolean masks: a list-wrapped mask is read as a 2-D index by current
# NumPy and raises IndexError.
c0 = df_R_target.to_numpy() == 0
c1 = df_R_target.to_numpy() == 1
sns.distplot(p(y)[c0], label='Rejected')
sns.distplot(p(y)[c1], label='Availed')
plt.legend()
# Raw string so the LaTeX backslash is not parsed as an (invalid) escape.
plt.xlabel(r'$P(\hat{y})$')
plt.ylabel('Distribution')
plt.title('Logit Probability Distribution of Availed vs Rejected Loans')

Text(0.5, 1.0, 'Logit Probability Distribution of Availed vs Rejected Loans')

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# 3-D view of the predicted probability against loan amount and rate for the
# 'R' segment, coloured by the recorded outcome.
fig = plt.figure(figsize=(16,9))
ax = plt.axes(projection='3d')
# Plot at most 18,000 randomly chosen rows; capping at len(y) keeps
# random.sample from raising ValueError when the segment has fewer rows.
rand = random.sample(range(len(y)), min(18000, len(y)))
ax1 = ax.scatter3D(df_R_features['Amount'].to_numpy()[rand], df_R_features['Rate'].to_numpy()[rand], p(y)[rand],
             c=df_R_target.to_numpy()[rand], alpha=0.5)
ax.view_init(azim=30)
ax.set_xlabel('Amount (USD)')
ax.set_ylabel('Rate')
# Raw string so the LaTeX backslash is not parsed as an (invalid) escape.
ax.set_zlabel(r'$P(\hat{y})$')
ax.legend(*ax1.legend_elements(), title='Rejected/Availed');

# Simulate rate cuts on the 'R' segment and plot how the predicted
# probability of the Outcome == 0 rows shifts.
fig = plt.figure(figsize=(16,9))
ratedel = [0, 20, 50]
# Plain boolean masks: a list-wrapped mask is read as a 2-D index by current
# NumPy and raises IndexError.
c0 = df_R_target.to_numpy() == 0
c1 = df_R_target.to_numpy() == 1

raw_features = df_R_features.to_numpy().astype(float)
rate_col = df_R_features.columns.values.tolist().index('Rate')
# Fit the scaler once on the unmodified data so every scenario is mapped with
# the same min/max.
full_scaler = MinMaxScaler().fit(raw_features)
# The loop variable is not called `r`, so the logistic() result stored under
# that name is not overwritten.
for reduction in ratedel:
    # Cut the actual rate by the given percentage and only then scale.
    # Shrinking the already min-max-scaled column would instead shrink the
    # distance to the minimum rate, which is not a percentage rate cut.
    x = raw_features.copy()
    x[:, rate_col] *= (100 - reduction) / 100
    x = full_scaler.transform(x)
    ypr = y_func(coeff, inter, x)
    sns.distplot(p(ypr)[c0], label=str(reduction)+'% Reduction')

plt.legend()
# Raw string so the LaTeX backslash is not parsed as an (invalid) escape.
plt.xlabel(r'$P(\hat{y})$')
plt.ylabel('Distribution')
plt.title('Shifted Logit Probability Distribution of Rejected Loans')

Text(0.5, 1.0, 'Shifted Logit Probability Distribution of Rejected Loans')

def svm(df_feat, df_target, n_iter, t_size, reg, scaler, top_feats=1):
    """Tune the regularisation parameter of a linear support vector classifier.

    For ``n_iter`` seeded train/test splits a ``LinearSVC`` is fitted for
    every value in a fixed grid of ``C``. Test accuracies are averaged over
    the splits and the ``C`` with the highest mean test accuracy wins. When a
    scaler is given, the coefficients are also averaged over the splits.

    Parameters
    ----------
    df_feat : pandas.DataFrame
        Feature table; its column names label the coefficients.
    df_target : array-like
        Class labels aligned with ``df_feat``.
    n_iter : int
        Number of splits. Seeds are drawn without replacement from
        ``range(100)``, so ``random.sample`` raises ``ValueError`` above 100.
    t_size : float or int
        Passed to ``train_test_split`` as ``test_size``.
    reg : {'l1', 'l2'}
        Penalty of the classifier.
    scaler : {'ss', 'mm', None}
        ``'ss'`` applies ``StandardScaler``, ``'mm'`` applies ``MinMaxScaler``;
        any other value leaves the features unscaled.
    top_feats : int, optional
        Number of largest-magnitude coefficients to report.

    Returns
    -------
    list
        ``[f'SVM {reg}', best mean test accuracy in percent, 'C = <best C>',
        scaler]``, with the ``top_feats`` ``(feature, averaged coefficient)``
        pairs appended when ``scaler`` is truthy.

    Raises
    ------
    ValueError
        If ``reg`` is neither ``'l1'`` nor ``'l2'``.
    """
    # Reject unsupported penalties up front; otherwise no model would be
    # built inside the loop and the scoring lines would fail on an undefined
    # (or stale) classifier.
    if reg not in ('l1', 'l2'):
        raise ValueError(f"reg must be 'l1' or 'l2', got {reg!r}")

    C = [1e-8, 1e-4, 1e-3, 1e-2, 0.1, 0.2,0.4, 0.75, 1, 1.5, 3, 5, 10, 15,  20, 100, 300, 1000, 5000]

    score_train = []
    score_test = []
    random_states = random.sample(range(100), n_iter)
    best_feats = {i:np.zeros(len(df_feat.columns)) for i in C}
    for seed in random_states:
        training_accuracy = []
        test_accuracy = []
        X_train, X_test, y_train, y_test = train_test_split(df_feat, df_target, test_size=t_size, random_state=seed)

        # The scaler is fitted on the training part only and then applied to
        # the test part.
        if scaler == 'ss':
            scale = StandardScaler()
            X_train = scale.fit_transform(X = X_train)
            X_test = scale.transform(X = X_test)
        elif scaler == 'mm':
            scale = MinMaxScaler()
            X_train = scale.fit_transform(X_train)
            X_test = scale.transform(X_test)

        for alpha_run in C:
            if reg == 'l1':
                # The L1 penalty is fitted with the squared hinge loss in the
                # primal formulation (dual=False).
                svc = LinearSVC(C=alpha_run, penalty=reg, loss='squared_hinge', dual=False).fit(X_train, y_train)
            else:
                svc = LinearSVC(C=alpha_run, penalty=reg).fit(X_train, y_train)
            training_accuracy.append(svc.score(X_train, y_train))
            test_accuracy.append(svc.score(X_test, y_test))
            if scaler:
                # Running average over the splits, one accumulator per C.
                coefs = np.mean(svc.coef_, axis=0)
                best_feats[alpha_run] += coefs / len(random_states)

        score_train.append(training_accuracy)
        score_test.append(test_accuracy)

    score = np.mean(score_test, axis=0)
    best_C = C[np.argmax(score)]
    summary = [f'SVM {reg}', np.round(np.amax(score)*100, 4), f'C = {best_C}', scaler]
    if not scaler:
        return summary

    feat_coeffs = best_feats[best_C]
    # Positions of the top_feats largest-magnitude coefficients, largest first.
    top_order = np.argsort(np.abs(feat_coeffs))[-top_feats:][::-1]
    return summary + [list(zip(df_feat.columns[top_order], feat_coeffs[top_order]))]

# Linear SVM with L2 penalty on the 'N' segment, min-max scaled, ten largest coefficients reported.
svm(df_N_features, df_N_target, 20, 0.25, reg='l2', scaler='mm', top_feats=10)

['SVM l2',
 68.4522,
 'C = 3',
 'mm',
 [('Amount', -3.065494961160274),
  ('Rate', -1.2714260932964554),
  ('Term', 0.7058356945328085),
  ('Partner Bin_1', 0.4310965344476261),
  ('FICO', -0.3998762425772919),
  ('Cost of Funds', 0.39218462259885634),
  ('Partner Bin_3', 0.23468828579738202),
  ('Tier', -0.21411255364856285),
  ('Competition rate', -0.04678933472306601),
  ('Partner Bin_2', -0.026745927908751067)]]

# Same L2 model with standard scaling instead of min-max scaling.
svm(df_N_features, df_N_target, 20, 0.25, reg='l2', scaler='ss', top_feats=10)

['SVM l2',
 68.4933,
 'C = 3',
 'ss',
 [('Amount', -0.3300914100759237),
  ('Term', 0.21582348225869322),
  ('Rate', -0.18953171603578053),
  ('Cost of Funds', 0.09723813266559847),
  ('Partner Bin_2', -0.09400410081091995),
  ('Partner Bin_1', 0.07822553490922914),
  ('FICO', -0.07334792873898931),
  ('Tier', -0.0711939628256175),
  ('Partner Bin_3', -0.01971340167861528),
  ('Competition rate', -0.010391847535466682)]]

# Same L2 model on unscaled features; without a scaler no coefficients are returned.
svm(df_N_features, df_N_target, 20, 0.25, reg='l2', scaler=None)

['SVM l2', 92.258, 'C = 0.1', None]

# Linear SVM with L1 penalty on the 'N' segment, min-max scaled; top_feats keeps its default of 1.
svm(df_N_features, df_N_target, 20, 0.25, reg='l1', scaler='mm')

['SVM l1', 92.3329, 'C = 1e-08', 'mm', 'FICO']

# Same L1 model with standard scaling; top_feats keeps its default of 1.
svm(df_N_features, df_N_target, 20, 0.25, reg='l1', scaler='ss')

['SVM l1', 92.3202, 'C = 1e-08', 'ss', 'FICO']

# Number of rows in the 'U' segment target.
df_U_target.shape

(41816,)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Data Segmentation by Car Types

Testing of ML algorithms

Simulate Logistic Regression

FilesExpand file tree

supplementary-and-appendices.md

Latest commit

History

supplementary-and-appendices.md

File metadata and controls

Data Segmentation by Car Types

Testing of ML algorithms

Simulate Logistic Regression