-import os, sys, importlib
+import os, sys
 import argparse
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import mplhep as hep
 import yaml
-import uproot
-
-from utils import plot_kinematic_features
+import logging
+import warnings
 
+# Add source path for nsbi_common_utils imports
 sys.path.append('../src')
 import nsbi_common_utils
-from nsbi_common_utils import configuration
 from nsbi_common_utils import datasets
 
-import logging
-import warnings
 # Suppress warnings
 warnings.simplefilter(action='ignore', category=FutureWarning)
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 hep.style.use(hep.style.ATLAS)
 
+def load_config(path):
+    # safe_load parses plain YAML without executing arbitrary tags
+    with open(path, "r") as f:
+        return yaml.safe_load(f)
+
 def parse_args():
-    parser = argparse.ArgumentParser(description="Download and process HiggsML data for analysis.")
+    parser = argparse.ArgumentParser(description="Process HiggsML data features.")
 
     parser.add_argument(
         "--config",
         type=str,
-        default='./config.yml',
-        help="config file path"
+        default="config.pipeline.yaml",
+        help="Path to configuration file."
     )
 
-
     return parser.parse_args()
 
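+# Typical invocation (the script filename here is illustrative):
+#   python preprocess_data.py --config config.pipeline.yaml
+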
-def ds_helper(cfg, branches):
+def ds_helper(cfg_path, branches):
     '''
     Uses nsbi_common_utils.datasets to load data.
     '''
-    datasets_helper = nsbi_common_utils.datasets.datasets(config_path=cfg,
-                                                          branches_to_load=branches)
+    datasets_helper = nsbi_common_utils.datasets.datasets(
+        config_path=cfg_path,
+        branches_to_load=branches
+    )
     return datasets_helper
 
 def process_data(df, input_features_by_jet, branches):
-    """Filters specific processes and balances the dataset."""
+    """Adds jet-multiplicity flags, imputes undefined per-jet features with
+    Nominal medians, and appends log-transformed copies of wide-range branches."""
 
     median_feature = {}
 
-    for sample, sample_dataset in df["Nominal"].items():
-
+    # 1. Compute per-sample medians from the Nominal datasets.
+    # df is expected to map systematic (or region) -> sample -> DataFrame;
+    # if no "Nominal" key exists, treat df itself as the sample mapping.
+    nominal_data = df.get("Nominal", df)
+
+    for sample, sample_dataset in nominal_data.items():
         median_feature[sample] = {}
 
         for nJets, feat_list in input_features_by_jet.items():
             for feature in feat_list:
-
-                median_feature[sample][feature] = np.median(sample_dataset.loc[sample_dataset['PRI_n_jets'] >= nJets, feature])
-
-    logger.info(f"extracting additional branches from the engineered features")
+                # Median over events with at least nJets jets; fall back to 0.0
+                # when no event passes the selection.
+                vals = sample_dataset.loc[sample_dataset['PRI_n_jets'] >= nJets, feature]
+                if len(vals) > 0:
+                    median_feature[sample][feature] = np.median(vals)
+                else:
+                    median_feature[sample][feature] = 0.0
+
+    logger.info("Extracting additional branches for the engineered features")
     branches_to_add = []
 
+    # 2. Apply the feature engineering to every systematic/region and sample
     for region, sample_datasets in df.items():
 
         for sample, sample_dataset in sample_datasets.items():
 
+            # --- Categorical jet-multiplicity flags (0, 1, >=2 jets) ---
             sample_dataset['njet_0'] = (sample_dataset['PRI_n_jets'] == 0).astype(int)
             sample_dataset['njet_1'] = (sample_dataset['PRI_n_jets'] == 1).astype(int)
             sample_dataset['njet_2'] = (sample_dataset['PRI_n_jets'] >= 2).astype(int)
 
-            branches_to_add += ['njet_0', 'njet_1', 'njet_2']
+            for m in ['njet_0', 'njet_1', 'njet_2']:
+                if m not in branches_to_add:
+                    branches_to_add.append(m)
 
+            # --- Per-jet masks and median imputation ---
             for i, feat_list in input_features_by_jet.items():
-                mask_i = (sample_dataset['PRI_n_jets'] >= i).astype(float)
-                sample_dataset[f'jet{i}_mask'] = mask_i
+                # Float mask (1.0/0.0) flagging events with at least i jets
+                mask_col = f'jet{i}_mask'
+                sample_dataset[mask_col] = (sample_dataset['PRI_n_jets'] >= i).astype(float)
 
-                branches_to_add += [f'jet{i}_mask']
+                if mask_col not in branches_to_add:
+                    branches_to_add.append(mask_col)
 
+                # Impute features that are undefined below the jet-count threshold,
+                # e.g. a dijet feature only exists for >=2-jet events
                 for feat in feat_list:
-                    sample_dataset[feat] = sample_dataset[feat].where(sample_dataset['PRI_n_jets'] >= i, median_feature[sample][feat])
+                    # Use the median from the Nominal sample if available, else 0.0
+                    med_val = median_feature.get(sample, {}).get(feat, 0.0)
+                    sample_dataset[feat] = sample_dataset[feat].where(
+                        sample_dataset['PRI_n_jets'] >= i, med_val
+                    )
 
-        for feat in branches.copy():
+            # --- Log transformations ---
+            # Strictly positive features with a large dynamic range (max > 100)
+            # get an additional log-scale copy; the +10.0 offset keeps the log
+            # argument well above zero.
+            for feat in branches:
+                if feat not in sample_dataset.columns:
+                    continue
 
                 kin = sample_dataset[feat].to_numpy()
-
-            if (np.amin(kin) > 0.0) and (np.amax(kin) > 100):
-                log_feat = 'log_' + feat
-                sample_dataset[log_feat] = np.log(kin + 10.0)
-
-            if log_feat not in branches_to_add:
-                branches_to_add += [log_feat]
+                if len(kin) == 0:
+                    continue
+
+                if (np.amin(kin) > 0.0) and (np.amax(kin) > 100):
+                    log_feat = 'log_' + feat
+                    sample_dataset[log_feat] = np.log(kin + 10.0)
+
+                    # Register inside the conditional so log_feat is always defined
+                    if log_feat not in branches_to_add:
+                        branches_to_add.append(log_feat)
 
             df[region][sample] = sample_dataset
 
@@ -101,47 +132,48 @@ def process_data(df, input_features_by_jet, branches):
 def main():
     args = parse_args()
 
-    # Specify branches to load from the ROOT ntuples
-    input_features_noJets = ['PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_had_pt', 'PRI_had_eta',
-                             'PRI_had_phi', 'PRI_met', 'PRI_met_phi', 'DER_mass_transverse_met_lep',
-                             'DER_mass_vis', 'DER_pt_h', 'DER_deltar_had_lep', 'DER_pt_tot', 'DER_sum_pt',
-                             'DER_pt_ratio_lep_had', 'DER_met_phi_centrality']
-
-    input_features_1Jets = ['PRI_jet_leading_pt', 'PRI_jet_leading_eta',
-                            'PRI_jet_leading_phi',
-                            'PRI_jet_all_pt']
-
-    input_features_2Jets = ['PRI_jet_subleading_pt',
-                            'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet',
-                            'DER_prodeta_jet_jet',
-                            'DER_lep_eta_centrality']
-
-    input_features_nJets = ['PRI_n_jets']
-
-    branches_to_load = input_features_noJets \
-                       + input_features_1Jets \
-                       + input_features_2Jets \
-                       + input_features_nJets
+    cfg_full = load_config(args.config)
+
+    if "data_preprocessing" not in cfg_full:
+        raise KeyError("Config file missing 'data_preprocessing' section.")
+
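+    # Illustrative shape of the expected config section (a sketch; the real
+    # feature lists live in the pipeline YAML):
+    #   data_preprocessing:
+    #     features:
+    #       no_jets:  [PRI_lep_pt, PRI_lep_eta, ...]
+    #       one_jet:  [PRI_jet_leading_pt, ...]
+    #       two_jets: [PRI_jet_subleading_pt, ...]
+    #       n_jets:   [PRI_n_jets]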
+    feats = cfg_full["data_preprocessing"]["features"]
+
+    input_features_noJets = feats["no_jets"]
+    input_features_1Jets = feats["one_jet"]
+    input_features_2Jets = feats["two_jets"]
+    input_features_nJets = feats["n_jets"]
+
+    branches_to_load = (input_features_noJets +
+                        input_features_1Jets +
+                        input_features_2Jets +
+                        input_features_nJets)
+
     input_features_by_jet = {
-            1: input_features_1Jets,
-            2: input_features_2Jets
+        1: input_features_1Jets,
+        2: input_features_2Jets
     }
 
-    # Execution Flow
     try:
         logger.info(f"Loading and converting the dataset to Pandas DataFrame for processing...")
-        datasets_helper = ds_helper(args.config, branches_to_load)
-        datasets_all = datasets_helper.load_datasets_from_config(load_systematics=True)
+
+        datasets_helper = ds_helper(cfg_full['config_path'], branches_to_load)
+
+        datasets_all = datasets_helper.load_datasets_from_config(load_systematics=True)
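+        # load_systematics=True is assumed to return the nested
+        # systematic -> sample -> DataFrame mapping that process_data expects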
 
-        datasets_all, add_branches = process_data(datasets_all, input_features_by_jet,
-                                                  branches=branches_to_load)
+        datasets_all, add_branches = process_data(
+            datasets_all,
+            input_features_by_jet,
+            branches=branches_to_load
+        )
 
-        logger.info(f"adding additional branches to the DataFrame")
+        logger.info(f"Adding additional branches to the DataFrame: {len(add_branches)} new features")
         datasets_helper.add_appended_branches(add_branches)
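+        # add_appended_branches presumably registers the engineered columns so
+        # that save_datasets below writes them out alongside the originals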
 
-        datasets_helper.save_datasets(datasets_all,
-                                      save_systematics=True)
-
+        datasets_helper.save_datasets(
+            datasets_all,
+            save_systematics=True
+        )
 
         logger.info("Data Preprocessing workflow completed successfully.")
 
@@ -150,4 +182,4 @@ def main():
         raise
 
 if __name__ == "__main__":
-    main()
+    main()