aadhaar-biometric-forecast/training_model.py at main · nuemaan/aadhaar-biometric-forecast · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor

# ==========================================
# 1. LOAD & CLEAN DATA
# ==========================================
# Progress banner written to stdout when the loading stage starts.
print("--- 1. LOADING DATASETS ---")

def load_all_csvs(folder_path):
    """Read every ``*.csv`` file directly inside a folder into one DataFrame.

    Files are read in sorted path order so that the row order of the result
    is reproducible; ``glob.glob`` itself returns matches in arbitrary order.

    Args:
        folder_path: Directory to search (non-recursive) for ``*.csv`` files.

    Returns:
        A single DataFrame with the rows of all files stacked and a fresh
        0..n-1 index, or ``None`` when the folder contains no CSV files.
    """
    files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    if not files:
        return None
    # low_memory=False makes pandas parse each file in one pass rather than
    # in internal chunks.
    frames = [pd.read_csv(path, low_memory=False) for path in files]
    return pd.concat(frames, ignore_index=True)

df_bio = load_all_csvs('api_data_aadhar_biometric')
df_enrol = load_all_csvs('api_data_aadhar_enrolment')

# load_all_csvs returns None when a folder holds no CSV files. Stop here with
# a clear message instead of failing on the first column access below with
# "'NoneType' object is not subscriptable".
if df_bio is None:
    raise FileNotFoundError("No CSV files found in folder 'api_data_aadhar_biometric'")
if df_enrol is None:
    raise FileNotFoundError("No CSV files found in folder 'api_data_aadhar_enrolment'")

# Convert Dates & Filter Bad Data
# Dates are parsed day-first; values that cannot be parsed become NaT
# (errors='coerce') and those rows are then dropped.
df_bio['date'] = pd.to_datetime(df_bio['date'], dayfirst=True, errors='coerce')
df_enrol['date'] = pd.to_datetime(df_enrol['date'], dayfirst=True, errors='coerce')
df_bio.dropna(subset=['date'], inplace=True)
df_enrol.dropna(subset=['date'], inplace=True)

# Create 'Month-Year' for merging
df_bio['month_year'] = df_bio['date'].dt.to_period('M')
df_enrol['month_year'] = df_enrol['date'].dt.to_period('M')

# Group by State (Winner Strategy: State-Level is more accurate than District-Level)
# Each table is collapsed to one row per (state, month) by summing its counts.
bio_grouped = df_bio.groupby(['state', 'month_year'])[['bio_age_5_17']].sum().reset_index()
enrol_grouped = df_enrol.groupby(['state', 'month_year'])[['age_5_17', 'age_0_5']].sum().reset_index()

# Inner join: only (state, month) pairs present in both tables are kept.
data = pd.merge(bio_grouped, enrol_grouped, on=['state', 'month_year'], how='inner')
data['month'] = data['month_year'].dt.month
# Sort by state, then month, so consecutive rows of one state are in time order.
data = data.sort_values(by=['state', 'month_year'])

# ==========================================
# 2. FEATURE ENGINEERING (The "Secret Sauce")
# ==========================================
print("--- 2. ENGINEERING FEATURES ---")

# LAG-1 feature: for every row, the target value of the preceding row that
# belongs to the same state.
# NOTE(review): this equals "last month" only if each state's rows are in
# month order with no missing months — confirm against the input data.
previous_row_updates = data.groupby('state')['bio_age_5_17'].shift(1)
data = data.assign(updates_last_month=previous_row_updates)

# Discard every row that contains a missing value; the first row of each
# state has no preceding row, so its lag is NaN.
data = data.dropna()

print(f"✅ Training Data Ready: {len(data)} samples")

# ==========================================
# 3. TRAIN CHAMPION MODEL
# ==========================================
print("--- 3. TRAINING MODEL ---")

# Order the rows by month before the unshuffled split so that the held-out
# 20% is the most recent data. With rows ordered by state first, an
# unshuffled split would instead hold out whichever states sort last, which
# does not measure how well the model forecasts later months.
chronological = data.sort_values(by=['month_year', 'state'])

X = chronological[['month', 'age_5_17', 'age_0_5', 'updates_last_month']]
y = chronological['bio_age_5_17']

# shuffle=False keeps the row order: the first 80% of rows train the model and
# the last 20% evaluate it. No random_state is passed because nothing is
# shuffled.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Model: Histogram-based Gradient Boosting (The Winner)
model = HistGradientBoostingRegressor(
    max_iter=1000,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)
model.fit(X_train, y_train)

# Evaluate on the held-out rows only.
predictions = model.predict(X_test)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print(f"\n🏆 FINAL ACCURACY (R² Score): {r2:.4f}")
# int() truncates the mean absolute error to a whole number for display.
print(f"📉 Average Error: +/- {int(mae)} updates")

# ==========================================
# 4. SAVE VISUALIZATION
# ==========================================
# Draw actual vs. predicted values for the test rows on one axes and write
# the figure to a PNG in the current working directory.
fig, ax = plt.subplots(figsize=(12, 6))

# Re-index from 0 so both series share a plain positional x-axis.
results = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
results = results.reset_index(drop=True)

ax.plot(results['Actual'], label='Actual Demand', alpha=0.7)
ax.plot(results['Predicted'], label='AI Forecast', linestyle='--', linewidth=2)
ax.set_title(f"Aadhaar Biometric Forecast (Accuracy: {r2:.2f})")
ax.legend()
ax.grid(True, alpha=0.3)

fig.savefig("Champion_Model_Performance.png")
print("✅ Graph saved as 'Champion_Model_Performance.png'")