datakind · vishpillai123 · Dec 12, 2025 · Dec 12, 2025 · Dec 12, 2025 · Dec 12, 2025
diff --git a/.github/workflows/release-integration.yml b/.github/workflows/release-integration.yml
@@ -104,5 +104,5 @@ jobs:
             --var "databricks_institution_name=synthetic_integration" \
             --var "datakind_notification_email=${{ secrets.DATAKIND_EMAIL }}" \
             --var "DK_CC_EMAIL=${{ secrets.DATAKIND_EMAIL }}" \
-            --params "model_name=synthetic_integration_retention_2_year_time_first_within_cohort,model_type=h2o,config_file_name=${CONFIG_FILE},job_type=inference,db_run_id=${DB_RUN_ID_PREFIX}_inference"
+            --params "model_name=synthetic_integration_retention_2_year_time_first_within_cohort,config_file_name=${CONFIG_FILE},job_type=inference,db_run_id=${DB_RUN_ID_PREFIX}_inference"
 
diff --git a/.github/workflows/weekly-develop-integration.yml b/.github/workflows/weekly-develop-integration.yml
@@ -107,4 +107,4 @@ jobs:
             --var "databricks_institution_name=synthetic_integration" \
             --var "datakind_notification_email=${{ secrets.DATAKIND_EMAIL }}" \
             --var "DK_CC_EMAIL=${{ secrets.DATAKIND_EMAIL }}" \
-            --params "model_name=synthetic_integration_retention_2_year_time_first_within_cohort,model_type=h2o,config_file_name=${CONFIG_FILE},job_type=inference,db_run_id=${DB_RUN_ID_PREFIX}_inference"
+            --params "model_name=synthetic_integration_retention_2_year_time_first_within_cohort,config_file_name=${CONFIG_FILE},job_type=inference,db_run_id=${DB_RUN_ID_PREFIX}_inference"
diff --git a/configs/custom/config-TEMPLATE.toml b/configs/custom/config-TEMPLATE.toml
diff --git a/configs/custom_h2o/config-TEMPLATE.toml b/configs/custom_h2o/config-TEMPLATE.toml
@@ -72,7 +72,6 @@ predict_table_path = "CATALOG.INST_ID_gold.INST_ID_predictions"
 [model]
 experiment_id = "EXPERIMENT_ID"
 run_id = "RUN_ID"
-framework = "h2o"
 calibrate = false
 
 [preprocessing]

diff --git a/configs/pdp/config-CREDITS_EARNED_TEMPLATE.toml b/configs/pdp_h2o/config-CREDITS_EARNED_TEMPLATE.toml
@@ -19,7 +19,6 @@ raw_cohort = "FILE_NAME_COHORT.csv"
 [model]
 experiment_id = "EXPERIMENT_ID"
 run_id = "RUN_ID"
-framework = "sklearn"
 
 [preprocessing]
 splits = { train = 0.6, test = 0.2, validate = 0.2 }

diff --git a/configs/pdp/config-GRADUATION_TEMPLATE.toml b/configs/pdp_h2o/config-GRADUATION_TEMPLATE.toml
@@ -19,7 +19,6 @@ raw_cohort = "FILE_NAME_COHORT.csv"
 [model]
 experiment_id = "EXPERIMENT_ID"
 run_id = "RUN_ID"
-framework = "sklearn"
 
 [preprocessing]
 splits = { train = 0.6, test = 0.2, validate = 0.2 }

diff --git a/configs/pdp/config-RETENTION_TEMPLATE.toml b/configs/pdp_h2o/config-RETENTION_TEMPLATE.toml
@@ -19,18 +19,17 @@ raw_cohort = "FILE_NAME_COHORT.csv"
 [model]
 experiment_id = "EXPERIMENT_ID"
 run_id = "RUN_ID"
-framework = "sklearn"
+calibrate_underpred = false
 
 [preprocessing]
 splits = { train = 0.6, test = 0.2, validate = 0.2 }
 sample_class_weight = "balanced"
-# TODO: change to true if school wants otherwise
 include_pre_cohort_courses = false
 
 [preprocessing.features]
 min_passing_grade = 1.0
 min_num_credits_full_time = 12
 course_level_pattern = '^(?:[A-Z_-]+)?(?P<course_level>\d)\d{2}(?:-?[A-Z\d]+)?$'
 core_terms = ["FALL", "SPRING"]
 key_course_subject_areas = []
 key_course_ids = []
@@ -59,9 +58,9 @@ collinear_threshold = 10.0
 # force_include_cols = []
 
 [modeling.training]
-primary_metric = "log_loss"
+primary_metric = "logloss"
 timeout_minutes = 10
-# exclude_frameworks = ["xgboost", "lightgbm"]
+# exclude_frameworks = ["XGBoost", "GBM"]
 # exclude_cols = []
 
 [modeling.evaluation]
@@ -70,4 +69,5 @@ topn_runs_included = 5
 [inference]
 num_top_features = 5
 min_prob_pos_label = 0.5
+background_data_sample = 500
 cohort = ['fall 2024-25']
diff --git a/configs/pdp_h2o/config-TEMPLATE.toml b/configs/pdp_h2o/config-TEMPLATE.toml
@@ -36,9 +36,6 @@ variables:
   model_name:
     description: "Name of the model once it's registered and ready to be used for inference"
     default: ""
-  model_type:
-    description: "Type of model; sklearn vs. h2o"
-    default: "h2o"
 
 run_as:
   service_principal_name: ${var.ds_run_as}