Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
4bb4e1d
fix: gravitational reproduce bug
zkhotanlou Sep 14, 2025
0566741
fix: claproar flaky test
zkhotanlou Sep 14, 2025
f58d042
fix: github workflow updates
zkhotanlou Sep 14, 2025
b0b85fb
adding reproduce for ROAR, some kinks seem to persist, need fixing
HashirA123 Oct 3, 2025
33c932d
Getting results with the Linear model
HashirA123 Oct 22, 2025
11bbbe2
Shifted approach to reproduction
HashirA123 Oct 24, 2025
2b80eed
Merge pull request #23 from zkhotanlou/zahra/fix-bug
zkhotanlou Oct 25, 2025
45ae92f
reduced duplication in loading german data
HashirA123 Oct 25, 2025
5863322
Reproduction with german dataset on ROAR with LR
HashirA123 Oct 25, 2025
e72b7df
Separate asserts for linear and mlp in reproduce
HashirA123 Oct 25, 2025
2223fa2
Changed a parameter to the loadData and loadModel
HashirA123 Oct 25, 2025
8c28db3
adding reproduce for ROAR, some kinks seem to persist, need fixing
HashirA123 Oct 3, 2025
2c806d2
Getting results with the Linear model
HashirA123 Oct 22, 2025
eff95e6
Shifted approach to reproduction
HashirA123 Oct 24, 2025
4ac5a78
reduced duplication in loading german data
HashirA123 Oct 25, 2025
488dfb9
Reproduction with german dataset on ROAR with LR
HashirA123 Oct 25, 2025
294ae01
Separate asserts for linear and mlp in reproduce
HashirA123 Oct 25, 2025
20431a0
Changed a parameter to the loadData and loadModel
HashirA123 Oct 25, 2025
f5bb515
Modified Linear model to be sklearn Linear
HashirA123 Oct 29, 2025
1857f39
resolved merge conflicts
HashirA123 Oct 29, 2025
451acdf
Fix formatting and imports
HashirA123 Nov 2, 2025
7a459ef
added results to results.csv
HashirA123 Nov 3, 2025
7417a2e
Ran precommit hooks
HashirA123 Nov 3, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions .github/workflows/pre-commit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,24 @@ on: [pull_request]

jobs:
pre-commit:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

steps:
- name: Check out the code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
- name: Set up Python 3.7
uses: actions/setup-python@v5
with:
python-version: '3.7'
cache: 'pip'

- name: Install dependencies
- name: Install project deps (same as local)
run: |
python -m pip install -U pip setuptools wheel
pip install -r requirements-dev.txt
pip install -e .
pip install pre-commit
pre-commit install-hooks

- name: Run pre-commit hooks
run: pre-commit run --all-files
run: pre-commit run --all-files --show-diff-on-failure
56 changes: 29 additions & 27 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,30 +25,32 @@ repos:
entry: python tools/clean_pytest_cache.py
language: system
types: [python]
- repo: local
hooks:
- id: check-new-methods-structure
name: Check new_methods Folder Structure and Run Unit Tests
entry: python tools/folder_structure_check.py
language: python
types: [python]
additional_dependencies:
- causalgraphicalmodels==0.0.4
- dice-ml==0.5
- ipython==7.16.0
- h5py==2.10.0
- keras==2.3.0
- lime==0.2.0.1
- matplotlib
- mip==1.12.0
- pandas
- protobuf<=3.21
- PySMT==0.9.5
- pytest==6.1.2
- pyyaml
- recourse==1.0.0
- scikit-learn==0.23.2
- tensorflow==1.14.0
- torch
- torchvision
- xgboost==1.4.2
# - repo: local
# hooks:
# - id: check-new-methods-structure
# name: Check new_methods Folder Structure and Run Unit Tests
# entry: python tools/folder_structure_check.py
# pass_filenames: false
# language: python
# types: [python]
# additional_dependencies:
# - .
# - causalgraphicalmodels==0.0.4
# - dice-ml==0.5
# - ipython==7.16.0
# - h5py==2.10.0
# - keras==2.3.0
# - lime==0.2.0.1
# - matplotlib
# - mip==1.12.0
# - pandas
# - protobuf<=3.21
# - PySMT==0.9.5
# - pytest==6.1.2
# - pyyaml
# - recourse==1.0.0
# - scikit-learn==0.23.2
# - tensorflow==1.14.0
# - torch
# - torchvision
# - xgboost==1.4.2
2 changes: 1 addition & 1 deletion assignment/assignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from methods.processing import create_hash_dataframe
from models.catalog import ModelCatalog
from models.negative_instances import predict_negative_instances
from tools.logging import log
from tools.log import log

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
warnings.simplefilter(action="ignore", category=FutureWarning)
Expand Down
Binary file not shown.
Binary file removed data/catalog/_data_main/_cached/german_non_hot
Binary file not shown.
Binary file modified data/catalog/_data_main/_cached/german_one_hot
Binary file not shown.
Binary file not shown.
Binary file added data/catalog/_data_main/_cached/sba_one_hot
Binary file not shown.
44 changes: 32 additions & 12 deletions data/catalog/_data_main/process_data/process_german_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,44 @@
np.random.seed(RANDOM_SEED)


def load_german_data():
def load_german_data(modified=False):
# input vars
raw_data_file = os.path.join(os.path.dirname(__file__), "german_raw.csv")
processed_file = os.path.join(os.path.dirname(__file__), "german_processed.csv")
this_files_directory = os.path.dirname(os.path.realpath(__file__))
if modified is False:
raw_data_file = os.path.join(
this_files_directory, "..", "raw_data", "german_v1.csv"
)
processed_file = os.path.join(
this_files_directory, "..", "raw_data", "german_v1_processed.csv"
)
else:
raw_data_file = os.path.join(
this_files_directory, "..", "raw_data", "german_v2.csv"
)
processed_file = os.path.join(
this_files_directory, "..", "raw_data", "german_v2_processed.csv"
)

# German Data Processing
raw_df = pd.read_csv(raw_data_file) # , index_col = 0)
processed_df = pd.DataFrame()

processed_df["GoodCustomer (label)"] = raw_df["GoodCustomer"]
processed_df["GoodCustomer (label)"] = (
processed_df["GoodCustomer (label)"] + 1
) / 2
processed_df.loc[raw_df["Gender"] == "Male", "Sex"] = 1
processed_df.loc[raw_df["Gender"] == "Female", "Sex"] = 0
processed_df["Age"] = raw_df["Age"]
processed_df["Credit"] = raw_df["Credit"]
processed_df["LoanDuration"] = raw_df["LoanDuration"]
# if modified == False:
# processed_df["GoodCustomer (label)"] = raw_df["GoodCustomer"]
# processed_df["GoodCustomer (label)"] = (
# processed_df["GoodCustomer (label)"] + 1
# ) / 2
# processed_df.loc[raw_df["Gender"] == "Male", "Sex"] = 1
# processed_df.loc[raw_df["Gender"] == "Female", "Sex"] = 0
# processed_df["Age"] = raw_df["Age"]
# processed_df["Credit"] = raw_df["Credit"]
# processed_df["LoanDuration"] = raw_df["LoanDuration"]
# else:
processed_df["GoodCustomer (label)"] = raw_df["credit_risk"]
processed_df["Sex"] = raw_df["personal_status_sex"]
processed_df["Age"] = raw_df["age"]
processed_df["Credit"] = raw_df["amount"]
processed_df["LoanDuration"] = raw_df["duration"]

# # order important, more balance can overwrite less balance!
# processed_df.loc[raw_df['CheckingAccountBalance_geq_0'] == 1, 'CheckingAccountBalance'] = 2
Expand Down
152 changes: 152 additions & 0 deletions data/catalog/_data_main/process_data/process_sba_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import os
from random import seed

import pandas as pd

RANDOM_SEED = 54321
seed(
RANDOM_SEED
) # set the random seed so that the random permutations can be reproduced again


def get_feat_types(df):
    """Split a dataframe's columns into categorical and numeric lists.

    A column is categorical when its dtype is ``object``; otherwise it
    counts as numeric only when it takes more than two distinct values,
    so binary (two-valued) columns land in neither list.

    :param df: input ``pd.DataFrame``.
    :return: tuple ``(cat_feat, num_feat)`` of column-name lists, kept
        in the dataframe's original column order.
    """
    categorical, numeric = [], []
    for column in df.columns:
        values = df[column]
        if values.dtype == object:
            categorical.append(column)
        elif len(set(values)) > 2:
            numeric.append(column)
    return categorical, numeric


def load_sba_data(modified=False):
    """Load and preprocess the SBA loan dataset from its raw CSV.

    Reads ``raw_data/SBAcase.11.13.17.csv`` (relative to this file),
    shuffles the rows deterministically with ``RANDOM_SEED``, encodes
    the string ``RevLineCr`` flag as small integers, and returns a
    float64 dataframe with a binary ``Label`` column (1 = loan was not
    a default, derived as ``1 - Default``).

    :param modified: when ``False`` (default), keep only loans with
        ``ApprovalFY`` before 2006; when ``True``, keep all years.
    :return: processed ``pd.DataFrame`` cast to ``float64``. Rows whose
        ``RevLineCr`` value is outside {"Y", "N", "T", "0"} receive no
        encoding (stay NaN) and are dropped before returning.
    """
    # Columns carried through to the processed dataframe.
    attrs = [
        "Zip",
        "NAICS",
        "ApprovalDate",
        "ApprovalFY",
        "Term",
        "NoEmp",
        "NewExist",
        "CreateJob",
        "RetainedJob",
        "FranchiseCode",
        "UrbanRural",
        "RevLineCr",
        "ChgOffDate",
        "DisbursementDate",
        "DisbursementGross",
        "ChgOffPrinGr",
        "GrAppv",
        "SBA_Appv",
        "New",
        "RealEstate",
        "Portion",
        "Recession",
        "daysterm",
        "xx",
    ]
    # Hooks for fairness experiments: columns listed here are tracked
    # separately / excluded. Both are currently empty.
    sensitive_attrs = []
    attrs_to_ignore = []

    # Path to the raw SBA file, relative to this module.
    this_files_directory = os.path.dirname(os.path.realpath(__file__))
    file_name = os.path.join(
        this_files_directory, "..", "raw_data", "SBAcase.11.13.17.csv"
    )

    # Load, fill missing values with a -1 sentinel, and shuffle
    # deterministically so downstream splits are reproducible.
    df = pd.read_csv(file_name)
    df = df.fillna(-1)
    df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

    # Target: 1 = no default, 0 = default.
    y = 1 - df["Default"].values

    # Partition columns into sensitive / ignored / regular buckets.
    x_control = {}
    attrs_to_vals = {}
    for k in attrs:
        if k in sensitive_attrs:
            x_control[k] = df[k].tolist()
        elif k in attrs_to_ignore:
            pass
        else:
            attrs_to_vals[k] = df[k].tolist()

    # Recombine buckets (sensitive columns re-appended) plus the label.
    all_attrs_to_vals = attrs_to_vals
    for k in sensitive_attrs:
        all_attrs_to_vals[k] = x_control[k]
    all_attrs_to_vals["label"] = y

    df_all = pd.DataFrame.from_dict(all_attrs_to_vals)

    # ---- Create processed dataframe with integer encodings ----
    processed_df = pd.DataFrame()

    # Numeric attributes: copied through unchanged.
    num_attrs = [
        "Zip",
        "NAICS",
        "ApprovalDate",
        "ApprovalFY",
        "Term",
        "NoEmp",
        "NewExist",
        "CreateJob",
        "RetainedJob",
        "FranchiseCode",
        "UrbanRural",
    ]
    for a in num_attrs:
        processed_df[a] = df_all[a]

    # RevLineCr is a string flag; map the known codes to 1-4. Any other
    # value (including the -1 missing-value sentinel) gets no encoding,
    # leaving NaN, and such rows are dropped below.
    revline_codes = {"Y": 1, "N": 2, "T": 3, "0": 4}
    for raw_value, code in revline_codes.items():
        processed_df.loc[df_all["RevLineCr"] == raw_value, "RevLineCr"] = code

    # Remaining attributes: copied through unchanged.
    for a in [
        "ChgOffDate",
        "DisbursementDate",
        "DisbursementGross",
        "ChgOffPrinGr",
        "GrAppv",
        "SBA_Appv",
        "New",
        "RealEstate",
        "Portion",
        "Recession",
        "daysterm",
        "xx",
    ]:
        processed_df[a] = df_all[a]

    processed_df["Label"] = df_all["label"]

    if modified is False:
        # Unmodified setting: restrict to pre-2006 fiscal-year approvals.
        processed_df = processed_df[processed_df["ApprovalFY"] < 2006]

    # Drop rows whose RevLineCr value could not be encoded above.
    processed_df = processed_df[processed_df["RevLineCr"].notna()]

    return processed_df.astype("float64")
Loading