Skip to content

Commit fb4656f

Browse files
committed
add config-driven scripts/data_preprocessing script and move feature definitions to config.pipeline.yaml
1 parent 9c18a15 commit fb4656f

File tree

2 files changed

+137
-64
lines changed

2 files changed

+137
-64
lines changed

FAIR_universe_Higgs_tautau/config.pipeline.yaml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,44 @@ data_loader:
3131
output:
3232
dir: "./saved_datasets"
3333

34+
# Section for Data Preprocessing
# Feature lists are grouped by the minimum jet multiplicity required for the
# feature to be physically defined; the preprocessing script imputes features
# below that jet count with per-sample medians.
data_preprocessing:
  features:

    # Features defined for every event, regardless of jet count.
    no_jets:
      - PRI_lep_pt
      - PRI_lep_eta
      - PRI_lep_phi
      - PRI_had_pt
      - PRI_had_eta
      - PRI_had_phi
      - PRI_met
      - PRI_met_phi
      - DER_mass_transverse_met_lep
      - DER_mass_vis
      - DER_pt_h
      - DER_deltar_had_lep
      - DER_pt_tot
      - DER_sum_pt
      - DER_pt_ratio_lep_had
      - DER_met_phi_centrality

    # Features requiring at least one jet (leading-jet kinematics).
    one_jet:
      - PRI_jet_leading_pt
      - PRI_jet_leading_eta
      - PRI_jet_leading_phi
      - PRI_jet_all_pt

    # Features requiring at least two jets (subleading-jet and dijet quantities).
    two_jets:
      - PRI_jet_subleading_pt
      - PRI_jet_subleading_eta
      - PRI_jet_subleading_phi
      - DER_deltaeta_jet_jet
      - DER_mass_jet_jet
      - DER_prodeta_jet_jet
      - DER_lep_eta_centrality

    # Jet-multiplicity branch itself (used to build masks / categories).
    n_jets:
      - PRI_n_jets
74+
Lines changed: 96 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,18 @@
1-
import os, sys, importlib
1+
import os, sys
22
import argparse
33
import numpy as np
44
import pandas as pd
55
import matplotlib.pyplot as plt
66
import mplhep as hep
77
import yaml
8-
import uproot
9-
10-
from utils import plot_kinematic_features
8+
import logging
9+
import warnings
1110

11+
# Add source path for utils
1212
sys.path.append('../src')
1313
import nsbi_common_utils
14-
from nsbi_common_utils import configuration
1514
from nsbi_common_utils import datasets
1615

17-
import logging
18-
import warnings
1916
# Suppress warnings
2017
warnings.simplefilter(action='ignore', category=FutureWarning)
2118

@@ -25,73 +22,107 @@
2522

2623
hep.style.use(hep.style.ATLAS)
2724

25+
def load_config(path):
    """Load a YAML configuration file.

    Parameters
    ----------
    path : str
        Filesystem path to the YAML configuration file.

    Returns
    -------
    dict
        Parsed configuration contents.

    Raises
    ------
    FileNotFoundError
        If *path* does not exist.
    yaml.YAMLError
        If the file is not valid YAML.
    """
    # safe_load refuses arbitrary Python object construction, unlike yaml.load.
    with open(path, "r") as f:
        return yaml.safe_load(f)
28+
2829
def parse_args():
    """Parse command-line arguments for the preprocessing script.

    Returns
    -------
    argparse.Namespace
        Parsed arguments; ``config`` holds the configuration file path
        (defaults to ``config.pipeline.yaml``).
    """
    parser = argparse.ArgumentParser(description="Process HiggsML data features.")

    parser.add_argument(
        "--config",
        type=str,
        default="config.pipeline.yaml",
        help="Path to configuration file."
    )

    return parser.parse_args()
4040

41-
def ds_helper(cfg_path, branches):
    '''
    Uses nsbi_common_utils.datasets to load data.

    Parameters
    ----------
    cfg_path : str
        Path to the datasets configuration file consumed by
        ``nsbi_common_utils.datasets.datasets``.
    branches : list of str
        Branch names to load from the input ntuples.

    Returns
    -------
    nsbi_common_utils.datasets.datasets
        Configured helper object for loading/saving datasets.
    '''
    # Thin wrapper so main() only deals with a config path and branch list.
    datasets_helper = nsbi_common_utils.datasets.datasets(
        config_path=cfg_path,
        branches_to_load=branches
    )
    return datasets_helper
4850

4951
def process_data(df, input_features_by_jet, branches):
5052
"""Filters specific processes and balances the dataset."""
5153

5254
median_feature = {}
5355

54-
for sample, sample_dataset in df["Nominal"].items():
55-
56+
# 1. Calculate Medians from Nominal samples
57+
# Assuming df structure is Dict[Region, Dict[Sample, DataFrame]]
58+
# or Dict[Systematic, Dict[Sample, DataFrame]] based on nsbi loader
59+
60+
# We look for "Nominal" key usually used in nsbi utils
61+
if "Nominal" in df:
62+
nominal_data = df["Nominal"]
63+
else:
64+
# Fallback if the top level keys are not systematics but regions/samples directly
65+
# This depends on how load_systematics=True structures the output in nsbi_common_utils
66+
# Assuming standard structure:
67+
nominal_data = df.get("Nominal", df)
68+
69+
for sample, sample_dataset in nominal_data.items():
5670
median_feature[sample] = {}
5771

5872
for nJets, feat_list in input_features_by_jet.items():
5973
for feature in feat_list:
60-
61-
median_feature[sample][feature] = np.median(sample_dataset.loc[sample_dataset['PRI_n_jets'] >= nJets, feature])
62-
63-
logger.info(f"extracting additional branches from the engineered features")
74+
# Calculate median for valid jet counts
75+
vals = sample_dataset.loc[sample_dataset['PRI_n_jets'] >= nJets, feature]
76+
if len(vals) > 0:
77+
median_feature[sample][feature] = np.median(vals)
78+
else:
79+
median_feature[sample][feature] = 0.0
80+
81+
logger.info(f"Extracting additional branches from the engineered features")
6482
branches_to_add = []
6583

84+
# 2. Apply Engineering to all datasets (Systematics/Regions)
6685
for region, sample_datasets in df.items():
6786

6887
for sample, sample_dataset in sample_datasets.items():
6988

89+
# --- Categorical Jet Masks ---
7090
sample_dataset['njet_0'] = (sample_dataset['PRI_n_jets'] == 0).astype(int)
7191
sample_dataset['njet_1'] = (sample_dataset['PRI_n_jets'] == 1).astype(int)
7292
sample_dataset['njet_2'] = (sample_dataset['PRI_n_jets'] >= 2).astype(int)
7393

74-
branches_to_add += ['njet_0', 'njet_1', 'njet_2']
94+
for m in ['njet_0', 'njet_1', 'njet_2']:
95+
if m not in branches_to_add: branches_to_add.append(m)
7596

97+
# --- Per-Jet Masks and Imputation ---
7698
for i, feat_list in input_features_by_jet.items():
77-
mask_i = (sample_dataset['PRI_n_jets'] >= i).astype(float)
78-
sample_dataset[f'jet{i}_mask'] = mask_i
99+
# Create mask
100+
mask_col = f'jet{i}_mask'
101+
sample_dataset[mask_col] = (sample_dataset['PRI_n_jets'] >= i).astype(float)
79102

80-
branches_to_add += [f'jet{i}_mask']
103+
if mask_col not in branches_to_add: branches_to_add.append(mask_col)
81104

105+
# Impute
82106
for feat in feat_list:
83-
sample_dataset[feat] = sample_dataset[feat].where(sample_dataset['PRI_n_jets'] >= i, median_feature[sample][feat])
107+
# Use median from Nominal sample if available, else 0
108+
med_val = median_feature.get(sample, {}).get(feat, 0)
109+
sample_dataset[feat] = sample_dataset[feat].where(
110+
sample_dataset['PRI_n_jets'] >= i, med_val
111+
)
84112

85-
for feat in branches.copy():
113+
# --- Log Transformations ---
114+
for feat in branches:
115+
if feat not in sample_dataset.columns: continue
86116

87117
kin = sample_dataset[feat].to_numpy()
88-
89-
if (np.amin(kin) > 0.0) and (np.amax(kin)>100):
90-
log_feat = 'log_'+feat
91-
sample_dataset[log_feat] = np.log(kin+10.0)
118+
if len(kin) == 0: continue
119+
120+
if (np.amin(kin) > 0.0) and (np.amax(kin) > 100):
121+
log_feat = 'log_' + feat
122+
sample_dataset[log_feat] = np.log(kin + 10.0)
92123

93124
if log_feat not in branches_to_add:
94-
branches_to_add += [log_feat]
125+
branches_to_add.append(log_feat)
95126

96127
df[region][sample] = sample_dataset
97128

@@ -101,47 +132,48 @@ def process_data(df, input_features_by_jet, branches):
101132
def main():
102133
args = parse_args()
103134

104-
# Specify branches to load from the ROOT ntuples
105-
input_features_noJets = ['PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_had_pt', 'PRI_had_eta',
106-
'PRI_had_phi', 'PRI_met', 'PRI_met_phi', 'DER_mass_transverse_met_lep',
107-
'DER_mass_vis', 'DER_pt_h', 'DER_deltar_had_lep', 'DER_pt_tot', 'DER_sum_pt',
108-
'DER_pt_ratio_lep_had', 'DER_met_phi_centrality']
109-
110-
input_features_1Jets = ['PRI_jet_leading_pt', 'PRI_jet_leading_eta',
111-
'PRI_jet_leading_phi',
112-
'PRI_jet_all_pt']
113-
114-
input_features_2Jets = ['PRI_jet_subleading_pt',
115-
'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet',
116-
'DER_prodeta_jet_jet',
117-
'DER_lep_eta_centrality']
118-
119-
input_features_nJets = ['PRI_n_jets']
120-
121-
branches_to_load = input_features_noJets \
122-
+ input_features_1Jets \
123-
+ input_features_2Jets \
124-
+ input_features_nJets
135+
cfg_full = load_config(args.config)
136+
137+
if "data_preprocessing" not in cfg_full:
138+
raise KeyError("Config file missing 'data_preprocessing' section.")
139+
140+
feats = cfg_full["data_preprocessing"]["features"]
141+
142+
input_features_noJets = feats["no_jets"]
143+
input_features_1Jets = feats["one_jet"]
144+
input_features_2Jets = feats["two_jets"]
145+
input_features_nJets = feats["n_jets"]
146+
147+
branches_to_load = (input_features_noJets +
148+
input_features_1Jets +
149+
input_features_2Jets +
150+
input_features_nJets)
151+
125152
input_features_by_jet = {
126-
1 : input_features_1Jets,
127-
2 : input_features_2Jets
153+
1 : input_features_1Jets,
154+
2 : input_features_2Jets
128155
}
129156

130-
# Execution Flow
131157
try:
132158
logger.info(f"Loading and converting the dataset to Pandas DataFrame for processing...")
133-
datasets_helper = ds_helper(args.config, branches_to_load)
134-
datasets_all = datasets_helper.load_datasets_from_config(load_systematics = True)
159+
160+
datasets_helper = ds_helper(cfg_full['config_path'], branches_to_load)
161+
162+
datasets_all = datasets_helper.load_datasets_from_config(load_systematics=True)
135163

136-
datasets_all, add_branches = process_data(datasets_all, input_features_by_jet,
137-
branches=branches_to_load)
164+
datasets_all, add_branches = process_data(
165+
datasets_all,
166+
input_features_by_jet,
167+
branches=branches_to_load
168+
)
138169

139-
logger.info(f"adding additional branches to the DataFrame")
170+
logger.info(f"Adding additional branches to the DataFrame: {len(add_branches)} new features")
140171
datasets_helper.add_appended_branches(add_branches)
141172

142-
datasets_helper.save_datasets(datasets_all,
143-
save_systematics = True)
144-
173+
datasets_helper.save_datasets(
174+
datasets_all,
175+
save_systematics=True
176+
)
145177

146178
logger.info("Data Preprocessing workflow completed successfully.")
147179

@@ -150,4 +182,4 @@ def main():
150182
raise
151183

152184
if __name__ == "__main__":
153-
main()
185+
main()

0 commit comments

Comments
 (0)