Skip to content

Commit fb4656f

Browse files
committed
add config-driven scripts/data_preprocessing script and move feature definitions to config.pipeline.yaml
1 parent 9c18a15 commit fb4656f

File tree

2 files changed

+137
-64
lines changed

2 files changed

+137
-64
lines changed

FAIR_universe_Higgs_tautau/config.pipeline.yaml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,44 @@ data_loader:
3131
output:
3232
dir: "./saved_datasets"
3333

34+
# Section for Data Preprocessing
# Feature lists are grouped by the minimum jet multiplicity required for the
# feature to be physically defined; the preprocessing script imputes features
# below that jet count with per-sample medians.
data_preprocessing:
  features:

    # Features defined for every event, regardless of jet count.
    no_jets:
      - PRI_lep_pt
      - PRI_lep_eta
      - PRI_lep_phi
      - PRI_had_pt
      - PRI_had_eta
      - PRI_had_phi
      - PRI_met
      - PRI_met_phi
      - DER_mass_transverse_met_lep
      - DER_mass_vis
      - DER_pt_h
      - DER_deltar_had_lep
      - DER_pt_tot
      - DER_sum_pt
      - DER_pt_ratio_lep_had
      - DER_met_phi_centrality

    # Features requiring at least one jet (leading-jet kinematics).
    one_jet:
      - PRI_jet_leading_pt
      - PRI_jet_leading_eta
      - PRI_jet_leading_phi
      - PRI_jet_all_pt

    # Features requiring at least two jets (subleading-jet and dijet quantities).
    two_jets:
      - PRI_jet_subleading_pt
      - PRI_jet_subleading_eta
      - PRI_jet_subleading_phi
      - DER_deltaeta_jet_jet
      - DER_mass_jet_jet
      - DER_prodeta_jet_jet
      - DER_lep_eta_centrality

    # Jet-multiplicity branch itself (used to build masks / categories).
    n_jets:
      - PRI_n_jets
74+
Lines changed: 96 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,18 @@
1-
import os, sys, importlib
1+
import os, sys
22
import argparse
33
import numpy as np
44
import pandas as pd
55
import matplotlib.pyplot as plt
66
import mplhep as hep
77
import yaml
8-
import uproot
9-
10-
from utils import plot_kinematic_features
8+
import logging
9+
import warnings
1110

11+
# Add source path for utils
1212
sys.path.append('../src')
1313
import nsbi_common_utils
14-
from nsbi_common_utils import configuration
1514
from nsbi_common_utils import datasets
1615

17-
import logging
18-
import warnings
1916
# Suppress warnings
2017
warnings.simplefilter(action='ignore', category=FutureWarning)
2118

@@ -25,73 +22,107 @@
2522

2623
hep.style.use(hep.style.ATLAS)
2724

25+
def load_config(path):
    """Load a YAML configuration file.

    Parameters
    ----------
    path : str
        Filesystem path to the YAML configuration file.

    Returns
    -------
    dict
        Parsed configuration contents.

    Raises
    ------
    FileNotFoundError
        If *path* does not exist.
    yaml.YAMLError
        If the file is not valid YAML.
    """
    # safe_load refuses arbitrary Python object construction, unlike yaml.load.
    with open(path, "r") as f:
        return yaml.safe_load(f)
28+
2829
def parse_args():
    """Parse command-line arguments for the preprocessing script.

    Returns
    -------
    argparse.Namespace
        Parsed arguments; ``config`` holds the configuration file path
        (defaults to ``config.pipeline.yaml``).
    """
    parser = argparse.ArgumentParser(description="Process HiggsML data features.")

    parser.add_argument(
        "--config",
        type=str,
        default="config.pipeline.yaml",
        help="Path to configuration file."
    )

    return parser.parse_args()
4040

41-
def ds_helper(cfg_path, branches):
    '''
    Uses nsbi_common_utils.datasets to load data.

    Parameters
    ----------
    cfg_path : str
        Path to the datasets configuration file consumed by
        ``nsbi_common_utils.datasets.datasets``.
    branches : list of str
        Branch names to load from the input ntuples.

    Returns
    -------
    nsbi_common_utils.datasets.datasets
        Configured helper object for loading/saving datasets.
    '''
    # Thin wrapper so main() only deals with a config path and branch list.
    datasets_helper = nsbi_common_utils.datasets.datasets(
        config_path=cfg_path,
        branches_to_load=branches
    )
    return datasets_helper
4850

4951
def process_data(df, input_features_by_jet, branches):
5052
"""Filters specific processes and balances the dataset."""
5153

5254
median_feature = {}
5355

54-
for sample, sample_dataset in df["Nominal"].items():
55-
56+
# 1. Calculate Medians from Nominal samples
57+
# Assuming df structure is Dict[Region, Dict[Sample, DataFrame]]
58+
# or Dict[Systematic, Dict[Sample, DataFrame]] based on nsbi loader
59+
60+
# We look for "Nominal" key usually used in nsbi utils
61+
if "Nominal" in df:
62+
nominal_data = df["Nominal"]
63+
else:
64+
# Fallback if the top level keys are not systematics but regions/samples directly
65+
# This depends on how load_systematics=True structures the output in nsbi_common_utils
66+
# Assuming standard structure:
67+
nominal_data = df.get("Nominal", df)
68+
69+
for sample, sample_dataset in nominal_data.items():
5670
median_feature[sample] = {}
5771

5872
for nJets, feat_list in input_features_by_jet.items():
5973
for feature in feat_list:
60-
61-
median_feature[sample][feature] = np.median(sample_dataset.loc[sample_dataset['PRI_n_jets'] >= nJets, feature])
62-
63-
logger.info(f"extracting additional branches from the engineered features")
74+
# Calculate median for valid jet counts
75+
vals = sample_dataset.loc[sample_dataset['PRI_n_jets'] >= nJets, feature]
76+
if len(vals) > 0:
77+
median_feature[sample][feature] = np.median(vals)
78+
else:
79+
median_feature[sample][feature] = 0.0
80+
81+
logger.info(f"Extracting additional branches from the engineered features")
6482
branches_to_add = []
6583

84+
# 2. Apply Engineering to all datasets (Systematics/Regions)
6685
for region, sample_datasets in df.items():
6786

6887
for sample, sample_dataset in sample_datasets.items():
6988

89+
# --- Categorical Jet Masks ---
7090
sample_dataset['njet_0'] = (sample_dataset['PRI_n_jets'] == 0).astype(int)
7191
sample_dataset['njet_1'] = (sample_dataset['PRI_n_jets'] == 1).astype(int)
7292
sample_dataset['njet_2'] = (sample_dataset['PRI_n_jets'] >= 2).astype(int)
7393

74-
branches_to_add += ['njet_0', 'njet_1', 'njet_2']
94+
for m in ['njet_0', 'njet_1', 'njet_2']:
95+
if m not in branches_to_add: branches_to_add.append(m)
7596

97+
# --- Per-Jet Masks and Imputation ---
7698
for i, feat_list in input_features_by_jet.items():
77-
mask_i = (sample_dataset['PRI_n_jets'] >= i).astype(float)
78-
sample_dataset[f'jet{i}_mask'] = mask_i
99+
# Create mask
100+
mask_col = f'jet{i}_mask'
101+
sample_dataset[mask_col] = (sample_dataset['PRI_n_jets'] >= i).astype(float)
79102

80-
branches_to_add += [f'jet{i}_mask']
103+
if mask_col not in branches_to_add: branches_to_add.append(mask_col)
81104

105+
# Impute
82106
for feat in feat_list:
83-
sample_dataset[feat] = sample_dataset[feat].where(sample_dataset['PRI_n_jets'] >= i, median_feature[sample][feat])
107+
# Use median from Nominal sample if available, else 0
108+
med_val = median_feature.get(sample, {}).get(feat, 0)
109+
sample_dataset[feat] = sample_dataset[feat].where(
110+
sample_dataset['PRI_n_jets'] >= i, med_val
111+
)
84112

85-
for feat in branches.copy():
113+
# --- Log Transformations ---
114+
for feat in branches:
115+
if feat not in sample_dataset.columns: continue
86116

87117
kin = sample_dataset[feat].to_numpy()
88-
89-
if (np.amin(kin) > 0.0) and (np.amax(kin)>100):
90-
log_feat = 'log_'+feat
91-
sample_dataset[log_feat] = np.log(kin+10.0)
118+
if len(kin) == 0: continue
119+
120+
if (np.amin(kin) > 0.0) and (np.amax(kin) > 100):
121+
log_feat = 'log_' + feat
122+
sample_dataset[log_feat] = np.log(kin + 10.0)
92123

93124
if log_feat not in branches_to_add:
94-
branches_to_add += [log_feat]
125+
branches_to_add.append(log_feat)
95126

96127
df[region][sample] = sample_dataset
97128

@@ -101,47 +132,48 @@ def process_data(df, input_features_by_jet, branches):
101132
def main():
102133
args = parse_args()
103134

104-
# Specify branches to load from the ROOT ntuples
105-
input_features_noJets = ['PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_had_pt', 'PRI_had_eta',
106-
'PRI_had_phi', 'PRI_met', 'PRI_met_phi', 'DER_mass_transverse_met_lep',
107-
'DER_mass_vis', 'DER_pt_h', 'DER_deltar_had_lep', 'DER_pt_tot', 'DER_sum_pt',
108-
'DER_pt_ratio_lep_had', 'DER_met_phi_centrality']
109-
110-
input_features_1Jets = ['PRI_jet_leading_pt', 'PRI_jet_leading_eta',
111-
'PRI_jet_leading_phi',
112-
'PRI_jet_all_pt']
113-
114-
input_features_2Jets = ['PRI_jet_subleading_pt',
115-
'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet',
116-
'DER_prodeta_jet_jet',
117-
'DER_lep_eta_centrality']
118-
119-
input_features_nJets = ['PRI_n_jets']
120-
121-
branches_to_load = input_features_noJets \
122-
+ input_features_1Jets \
123-
+ input_features_2Jets \
124-
+ input_features_nJets
135+
cfg_full = load_config(args.config)
136+
137+
if "data_preprocessing" not in cfg_full:
138+
raise KeyError("Config file missing 'data_preprocessing' section.")
139+
140+
feats = cfg_full["data_preprocessing"]["features"]
141+
142+
input_features_noJets = feats["no_jets"]
143+
input_features_1Jets = feats["one_jet"]
144+
input_features_2Jets = feats["two_jets"]
145+
input_features_nJets = feats["n_jets"]
146+
147+
branches_to_load = (input_features_noJets +
148+
input_features_1Jets +
149+
input_features_2Jets +
150+
input_features_nJets)
151+
125152
input_features_by_jet = {
126-
1 : input_features_1Jets,
127-
2 : input_features_2Jets
153+
1 : input_features_1Jets,
154+
2 : input_features_2Jets
128155
}
129156

130-
# Execution Flow
131157
try:
132158
logger.info(f"Loading and converting the dataset to Pandas DataFrame for processing...")
133-
datasets_helper = ds_helper(args.config, branches_to_load)
134-
datasets_all = datasets_helper.load_datasets_from_config(load_systematics = True)
159+
160+
datasets_helper = ds_helper(cfg_full['config_path'], branches_to_load)
161+
162+
datasets_all = datasets_helper.load_datasets_from_config(load_systematics=True)
135163

136-
datasets_all, add_branches = process_data(datasets_all, input_features_by_jet,
137-
branches=branches_to_load)
164+
datasets_all, add_branches = process_data(
165+
datasets_all,
166+
input_features_by_jet,
167+
branches=branches_to_load
168+
)
138169

139-
logger.info(f"adding additional branches to the DataFrame")
170+
logger.info(f"Adding additional branches to the DataFrame: {len(add_branches)} new features")
140171
datasets_helper.add_appended_branches(add_branches)
141172

142-
datasets_helper.save_datasets(datasets_all,
143-
save_systematics = True)
144-
173+
datasets_helper.save_datasets(
174+
datasets_all,
175+
save_systematics=True
176+
)
145177

146178
logger.info("Data Preprocessing workflow completed successfully.")
147179

@@ -150,4 +182,4 @@ def main():
150182
raise
151183

152184
if __name__ == "__main__":
153-
main()
185+
main()

0 commit comments

Comments
 (0)