Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
4bb4e1d
fix: gravitational reproduce bug
zkhotanlou Sep 14, 2025
0566741
fix: claproar flaky test
zkhotanlou Sep 14, 2025
f58d042
fix: github workflow updates
zkhotanlou Sep 14, 2025
b0b85fb
adding reproduce for ROAR, some kinks seem to persist, need fixing
HashirA123 Oct 3, 2025
33c932d
Getting results with the Linear model
HashirA123 Oct 22, 2025
11bbbe2
Shifted approach to reproduction
HashirA123 Oct 24, 2025
2b80eed
Merge pull request #23 from zkhotanlou/zahra/fix-bug
zkhotanlou Oct 25, 2025
45ae92f
reduced duplication in loading german data
HashirA123 Oct 25, 2025
5863322
Reproduction with german dataset on ROAR with LR
HashirA123 Oct 25, 2025
e72b7df
Separate asserts for linear and mlp in reproduce
HashirA123 Oct 25, 2025
2223fa2
Changed a parameter to the loadData and loadModel
HashirA123 Oct 25, 2025
8c28db3
adding reproduce for ROAR, some kinks seem to persist, need fixing
HashirA123 Oct 3, 2025
2c806d2
Getting results with the Linear model
HashirA123 Oct 22, 2025
eff95e6
Shifted approach to reproduction
HashirA123 Oct 24, 2025
4ac5a78
reduced duplication in loading german data
HashirA123 Oct 25, 2025
488dfb9
Reproduction with german dataset on ROAR with LR
HashirA123 Oct 25, 2025
294ae01
Separate asserts for linear and mlp in reproduce
HashirA123 Oct 25, 2025
20431a0
Changed a parameter to the loadData and loadModel
HashirA123 Oct 25, 2025
f5bb515
Modified Linear model to be sklearn Linear
HashirA123 Oct 29, 2025
1857f39
resolved merge conflicts
HashirA123 Oct 29, 2025
451acdf
Fix formatting and imports
HashirA123 Nov 2, 2025
7a459ef
added results to results.csv
HashirA123 Nov 3, 2025
7417a2e
Ran precommit hooks
HashirA123 Nov 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file added data/catalog/_data_main/_cached/sba_one_hot
Binary file not shown.
42 changes: 42 additions & 0 deletions data/catalog/_data_main/process_data/process_german_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,48 @@ def load_german_data():
return processed_df.astype("float64")


def load_german_data_modified():
    """Load the corrected German credit data with a reduced feature set.

    Reads ``corrected_german.csv`` from this module's directory, keeps the
    credit-risk label plus Sex/Age/Credit/LoanDuration, writes the processed
    frame to ``corrected_german_processed.csv`` as a side effect, and returns
    it with all columns cast to float64.

    Returns:
        pandas.DataFrame: the processed data (expected 1000 rows, float64).

    Raises:
        ValueError: if the processed data does not contain exactly 1000 rows.
    """
    # Input and output files live next to this module.
    raw_data_file = os.path.join(os.path.dirname(__file__), "corrected_german.csv")
    processed_file = os.path.join(os.path.dirname(__file__), "corrected_german_processed.csv")

    # German Data Processing: keep only the reduced column set, renamed to
    # match the naming used by load_german_data().
    raw_df = pd.read_csv(raw_data_file)
    processed_df = pd.DataFrame()

    processed_df["GoodCustomer (label)"] = raw_df["credit_risk"]
    processed_df["Sex"] = raw_df["personal_status_sex"]
    processed_df["Age"] = raw_df["age"]
    processed_df["Credit"] = raw_df["amount"]
    processed_df["LoanDuration"] = raw_df["duration"]

    # `+ 0` coerces any boolean columns to numeric; then drop rows containing
    # NaN and persist the processed copy for inspection/reuse.
    processed_df = processed_df + 0
    processed_df = processed_df.reset_index(drop=True)
    processed_df = processed_df.dropna()
    processed_df.to_csv(processed_file, header=True, index=False)

    # Explicit validation instead of `assert` so the check survives `python -O`.
    if processed_df.shape[0] != 1000:
        raise ValueError(
            f"Expected 1000 rows after processing, got {processed_df.shape[0]}"
        )

    return processed_df.astype("float64")


# import numpy as np
# import pandas as pd

Expand Down
196 changes: 196 additions & 0 deletions data/catalog/_data_main/process_data/process_sba_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
import os
from random import seed
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from random import shuffle

import process_data.process_utils_data as ut

# Fixed seed so random permutations are reproducible across runs.
# NOTE(review): this seeds the stdlib `random` module only; the pandas
# sampling below passes `random_state=RANDOM_SEED` explicitly, so the
# `seed()` call matters only for other `random`-module uses — confirm
# whether it is still needed.
RANDOM_SEED = 54321
seed(
RANDOM_SEED
)  # set the random seed so that the random permutations can be reproduced again

def get_feat_types(df):
    """Partition *df*'s columns into categorical and numeric feature names.

    A column is categorical when its dtype is ``object``; otherwise it is
    numeric when it takes more than two distinct values. Binary non-object
    columns fall into neither list.

    Returns:
        tuple[list, list]: ``(categorical_columns, numeric_columns)`` in
        the DataFrame's column order.
    """
    cat_feat = [col for col in df if df[col].dtype == object]
    num_feat = [
        col for col in df
        if df[col].dtype != object and len(set(df[col])) > 2
    ]
    return cat_feat, num_feat

def load_sba_data():
    """Load and preprocess the SBA loan dataset.

    Reads ``raw_data/SBAcase.11.13.17.csv`` (relative to this file's parent
    directory), shuffles rows deterministically, integer-encodes
    ``RevLineCr``, keeps only approvals with ``ApprovalFY < 2006``, and
    returns a float64 DataFrame with a binary ``Label`` column
    (1 = no default, from ``1 - Default``).
    """
    # Define attributes of interest
    attrs = [
        'Zip', 'NAICS', 'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp',
        'NewExist', 'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural',
        'RevLineCr', 'ChgOffDate', 'DisbursementDate', 'DisbursementGross',
        'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'New', 'RealEstate', 'Portion',
        'Recession', 'daysterm', 'xx'
    ]
    sensitive_attrs = []  # just an example, pick what matters for fairness
    attrs_to_ignore = []  # IDs or very sparse high-cardinality

    # Path to raw SBA file
    this_files_directory = os.path.dirname(os.path.realpath(__file__))
    file_name = os.path.join(this_files_directory, "..", "raw_data", "SBAcase.11.13.17.csv")

    # Load file; NaNs become -1 so later numeric casts don't fail, and the
    # row order is shuffled reproducibly via RANDOM_SEED.
    df = pd.read_csv(file_name)
    df = df.fillna(-1)  # replace NaNs with sentinel
    df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

    # print(df['RevLineCr'].value_counts())

    # Define target: label is 1 when the loan did NOT default.
    # NOTE(review): assumes `Default` is a 0/1 column — confirm against the raw CSV.
    y = 1 - df["Default"].values

    # Dicts for storage
    x_control = {}
    attrs_to_vals = {}

    # Route each attribute: sensitive ones into x_control, ignored ones
    # skipped, everything else into attrs_to_vals. With the empty lists
    # above, every attribute currently lands in attrs_to_vals.
    for k in attrs:
        if k in sensitive_attrs:
            x_control[k] = df[k].tolist()
        elif k in attrs_to_ignore:
            pass
        else:
            attrs_to_vals[k] = df[k].tolist()

    # Combine (all_attrs_to_vals aliases attrs_to_vals; the label column is
    # added alongside the feature columns).
    all_attrs_to_vals = attrs_to_vals
    for k in sensitive_attrs:
        all_attrs_to_vals[k] = x_control[k]
    all_attrs_to_vals["label"] = y

    df_all = pd.DataFrame.from_dict(all_attrs_to_vals)

    # num_feat is currently unused because the scaling below is disabled.
    _, num_feat = get_feat_types(df_all)

    # for key in num_feat:
    #     scaler = StandardScaler()
    #     df_all[key] = scaler.fit_transform(df_all[key].values.reshape(-1,1))

    # ---- Create processed dataframe with integer encodings ----
    processed_df = pd.DataFrame()

    # Numeric attributes: keep directly
    num_attrs = [
        'Zip', 'NAICS', 'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp',
        'NewExist', 'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural'
    ]
    for a in num_attrs:
        processed_df[a] = df_all[a]

    # RevLineCr: map "Y"/"N"/"T"/"0" to 1/2/3/4. Any other value (including
    # the -1 sentinel from fillna) is left as NaN and dropped below.
    processed_df.loc[df_all["RevLineCr"] == "Y", "RevLineCr"] = 1
    processed_df.loc[df_all["RevLineCr"] == "N", "RevLineCr"] = 2
    processed_df.loc[df_all["RevLineCr"] == "T", "RevLineCr"] = 3
    processed_df.loc[df_all["RevLineCr"] == "0", "RevLineCr"] = 4
    # processed_df.loc[df_all["RevLineCr"] == -1, "RevLineCr"] = 5

    # print(processed_df['RevLineCr'].value_counts())
    # cant think of what to do, can just drop the Nas actaully.

    # processed_df['RevLineCr'] = pd.Categorical(processed_df['RevLineCr'])

    # Add recession, real estate, portion, etc. directly
    for a in ['ChgOffDate', 'DisbursementDate', 'DisbursementGross',
        'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'New', 'RealEstate', 'Portion',
        'Recession', 'daysterm', 'xx']:
        processed_df[a] = df_all[a]

    processed_df["Label"] = df_all["label"]

    # Restrict to loans approved before fiscal year 2006 (this filter is the
    # behavioral difference from load_sba_data_modified).
    processed_df = processed_df[processed_df["ApprovalFY"]<2006]

    # Drop rows whose RevLineCr did not match any of the four codes above.
    processed_df = processed_df[processed_df['RevLineCr'].notna()]

    return processed_df.astype("float64")

def load_sba_data_modified():
    """Load and preprocess the SBA loan dataset (modified variant).

    Same pipeline as load_sba_data — deterministic shuffle, integer-encoded
    ``RevLineCr``, binary ``Label`` (1 = no default) — but WITHOUT the
    ``ApprovalFY < 2006`` filter, so all fiscal years are retained.
    """
    # Define attributes of interest
    attrs = [
        'Zip', 'NAICS', 'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp',
        'NewExist', 'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural',
        'RevLineCr', 'ChgOffDate', 'DisbursementDate', 'DisbursementGross',
        'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'New', 'RealEstate', 'Portion',
        'Recession', 'daysterm', 'xx'
    ]
    sensitive_attrs = []  # just an example, pick what matters for fairness
    attrs_to_ignore = []  # IDs or very sparse high-cardinality

    # Path to raw SBA file
    this_files_directory = os.path.dirname(os.path.realpath(__file__))
    file_name = os.path.join(this_files_directory, "..", "raw_data", "SBAcase.11.13.17.csv")

    # Load file; NaNs become -1 so later numeric casts don't fail, and the
    # row order is shuffled reproducibly via RANDOM_SEED.
    df = pd.read_csv(file_name)
    df = df.fillna(-1)  # replace NaNs with sentinel
    df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)


    # Define target: label is 1 when the loan did NOT default.
    # NOTE(review): assumes `Default` is a 0/1 column — confirm against the raw CSV.
    y = 1 - df["Default"].values

    # Dicts for storage
    x_control = {}
    attrs_to_vals = {}

    # Route each attribute: sensitive ones into x_control, ignored ones
    # skipped, everything else into attrs_to_vals. With the empty lists
    # above, every attribute currently lands in attrs_to_vals.
    for k in attrs:
        if k in sensitive_attrs:
            x_control[k] = df[k].tolist()
        elif k in attrs_to_ignore:
            pass
        else:
            attrs_to_vals[k] = df[k].tolist()

    # Combine (all_attrs_to_vals aliases attrs_to_vals; the label column is
    # added alongside the feature columns).
    all_attrs_to_vals = attrs_to_vals
    for k in sensitive_attrs:
        all_attrs_to_vals[k] = x_control[k]
    all_attrs_to_vals["label"] = y

    df_all = pd.DataFrame.from_dict(all_attrs_to_vals)

    # num_feat is currently unused because the scaling below is disabled.
    _, num_feat = get_feat_types(df_all)

    # for key in num_feat:
    #     scaler = StandardScaler()
    #     df_all[key] = scaler.fit_transform(df_all[key].values.reshape(-1,1))

    # ---- Create processed dataframe with integer encodings ----
    processed_df = pd.DataFrame()

    # Numeric attributes: keep directly
    num_attrs = [
        'Zip', 'NAICS', 'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp',
        'NewExist', 'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural'
    ]
    for a in num_attrs:
        processed_df[a] = df_all[a]

    # RevLineCr: map "Y"/"N"/"T"/"0" to 1/2/3/4. Any other value (including
    # the -1 sentinel from fillna) is left as NaN and dropped below.
    processed_df.loc[df_all["RevLineCr"] == "Y", "RevLineCr"] = 1
    processed_df.loc[df_all["RevLineCr"] == "N", "RevLineCr"] = 2
    processed_df.loc[df_all["RevLineCr"] == "T", "RevLineCr"] = 3
    processed_df.loc[df_all["RevLineCr"] == "0", "RevLineCr"] = 4
    # processed_df.loc[df_all["RevLineCr"] == -1, "RevLineCr"] = 5
    # cant think of what to do, can just drop the Nas actaully.

    # processed_df['RevLineCr'] = pd.Categorical(processed_df['RevLineCr'])

    # Add recession, real estate, portion, etc. directly
    for a in ['ChgOffDate', 'DisbursementDate', 'DisbursementGross',
        'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'New', 'RealEstate', 'Portion',
        'Recession', 'daysterm', 'xx']:
        processed_df[a] = df_all[a]

    processed_df["Label"] = df_all["label"]

    # Drop rows whose RevLineCr did not match any of the four codes above.
    processed_df = processed_df[processed_df['RevLineCr'].notna()]

    return processed_df.astype("float64")
Loading