Skip to content

Commit a9ad8c5

Browse files
Valerii Kholoimov and vepadulano
authored and committed
Introduce statistical inference step in CMS ttbar analysis
And also add a github action to validate results of the analysis and fitting
1 parent 7073818 commit a9ad8c5

File tree

20 files changed

+2193
-68
lines changed

20 files changed

+2193
-68
lines changed
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
# CI workflow: validate the CMS Open Data ttbar analysis (histograms + fit results).
# Reconstructed as valid YAML — the scraped diff view had stripped all indentation
# and interleaved diff line-number artifacts; commands are preserved verbatim.
name: Validate CMS Open Data ttbar analysis

# Run on pushes and pull requests targeting the main branch.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  run-cms-open-data-ttbar-analysis:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      # Install build/runtime prerequisites, Python packages, and a prebuilt
      # ROOT binary release (sourced via thisroot.sh).
      - name: Set up ROOT environment
        run: |
          sudo apt-get update
          sudo apt-get install -y dpkg-dev cmake g++ gcc binutils libx11-dev libncurses5-dev libssl-dev libxpm-dev \
            libxft-dev libxml2-dev libz-dev libxext-dev python3-dev git libtbb-dev libgif-dev xrootd-client python3
          pip install numpy plotting distributed tqdm uproot
          wget https://root.cern/download/root_v6.32.04.Linux-ubuntu22.04-x86_64-gcc11.4.tar.gz
          tar -xzvf root_v6.32.04.Linux-ubuntu22.04-x86_64-gcc11.4.tar.gz
          source root/bin/thisroot.sh
          echo "ROOT is set up"

      # The validate script prints pass/fail lines that the next two steps grep for.
      - name: Run Analysis
        run: |
          source root/bin/thisroot.sh
          cd analyses/cms-open-data-ttbar/
          ./validate | tee output.txt

      - name: Compare histograms validation output with expected
        id: histograms
        run: |
          cd analyses/cms-open-data-ttbar/
          if grep -q "Test failed: Histograms validation output does not match expected result." output.txt; then
            echo "Histograms validation failed."
            echo "RESULT_HISTOGRAMS=fail" >> $GITHUB_ENV
            exit 1
          else
            echo "Histograms validation passed."
            echo "RESULT_HISTOGRAMS=pass" >> $GITHUB_ENV
          fi

      - name: Run validation sequences for fitResults
        id: fitresults
        run: |
          cd analyses/cms-open-data-ttbar/
          if grep -q "Test failed: fitResults validation output does not match expected result." output.txt; then
            echo "fitResults validation failed."
            echo "RESULT_FITRESULTS=fail" >> $GITHUB_ENV
            exit 1
          else
            echo "fitResults validation passed."
            echo "RESULT_FITRESULTS=pass" >> $GITHUB_ENV
          fi
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Validating 'histograms.root' against reference 'reference/histos_1_file_per_process.json'...
2+
All good!

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,6 @@
77
data.json
88
__pycache__
99
tags
10+
analyses/cms-open-data-ttbar/*.root
11+
analyses/cms-open-data-ttbar/*.table
12+
analyses/cms-open-data-ttbar/statistical_data/*

analyses/cms-open-data-ttbar/analysis.py

Lines changed: 67 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,12 @@
44
from time import time
55
from typing import Tuple
66

7-
from distributed import Client, get_worker, LocalCluster, SSHCluster
87
import ml
9-
from plotting import save_ml_plots, save_plots
108
import ROOT
11-
from utils import (
12-
AGCInput,
13-
AGCResult,
14-
postprocess_results,
15-
retrieve_inputs,
16-
save_histos,
17-
)
9+
from distributed import Client, LocalCluster, SSHCluster, get_worker
10+
from plotting import save_ml_plots, save_plots
11+
from statistical import fit_histograms
12+
from utils import AGCInput, AGCResult, postprocess_results, retrieve_inputs, save_histos
1813

1914
# Using https://atlas-groupdata.web.cern.ch/atlas-groupdata/dev/AnalysisTop/TopDataPreparation/XSection-MC15-13TeV.data
2015
# as a reference. Values are in pb.
@@ -90,7 +85,24 @@ def parse_args() -> argparse.Namespace:
9085
"--hosts",
9186
help="A comma-separated list of worker node hostnames. Only required if --scheduler=dask-ssh, ignored otherwise.",
9287
)
93-
p.add_argument("-v", "--verbose", help="Turn on verbose execution logs.", action="store_true")
88+
p.add_argument(
89+
"-v",
90+
"--verbose",
91+
help="Turn on verbose execution logs.",
92+
action="store_true",
93+
)
94+
95+
p.add_argument(
96+
"--statistical-validation",
97+
help = argparse.SUPPRESS,
98+
action="store_true",
99+
)
100+
101+
p.add_argument(
102+
"--no-fitting",
103+
help="Do not run statistical validation part of the analysis.",
104+
action="store_true",
105+
)
94106

95107
return p.parse_args()
96108

@@ -109,7 +121,11 @@ def create_dask_client(scheduler: str, ncores: int, hosts: str, scheduler_addres
109121
sshc = SSHCluster(
110122
workers,
111123
connect_options={"known_hosts": None},
112-
worker_options={"nprocs": ncores, "nthreads": 1, "memory_limit": "32GB"},
124+
worker_options={
125+
"nprocs": ncores,
126+
"nthreads": 1,
127+
"memory_limit": "32GB",
128+
},
113129
)
114130
return Client(sshc)
115131

@@ -128,7 +144,10 @@ def define_trijet_mass(df: ROOT.RDataFrame) -> ROOT.RDataFrame:
128144
df = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) > 1")
129145

130146
# Build four-momentum vectors for each jet
131-
df = df.Define("Jet_p4", "ConstructP4(Jet_pt_cut, Jet_eta_cut, Jet_phi_cut, Jet_mass_cut)")
147+
df = df.Define(
148+
"Jet_p4",
149+
"ConstructP4(Jet_pt_cut, Jet_eta_cut, Jet_phi_cut, Jet_mass_cut)",
150+
)
132151

133152
# Build trijet combinations
134153
df = df.Define("Trijet_idx", "Combinations(Jet_pt_cut, 3)")
@@ -186,7 +205,7 @@ def book_histos(
186205
# pt_res_up(jet_pt) - jet resolution systematic
187206
df = df.Vary(
188207
"Jet_pt",
189-
"ROOT::RVec<ROOT::RVecF>{Jet_pt*pt_scale_up(), Jet_pt*jet_pt_resolution(Jet_pt.size())}",
208+
"ROOT::RVec<ROOT::RVecF>{Jet_pt*pt_scale_up(), Jet_pt*jet_pt_resolution(Jet_pt)}",
190209
["pt_scale_up", "pt_res_up"],
191210
)
192211

@@ -240,8 +259,7 @@ def book_histos(
240259
# Only one b-tagged region required
241260
# The observable is the total transverse momentum
242261
# fmt: off
243-
df4j1b = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) == 1")\
244-
.Define("HT", "Sum(Jet_pt_cut)")
262+
df4j1b = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) == 1").Define("HT", "Sum(Jet_pt_cut)")
245263
# fmt: on
246264

247265
# Define trijet_mass observable for the 4j2b region (this one is more complicated)
@@ -251,20 +269,34 @@ def book_histos(
251269
results = []
252270
for df, observable, region in zip([df4j1b, df4j2b], ["HT", "Trijet_mass"], ["4j1b", "4j2b"]):
253271
histo_model = ROOT.RDF.TH1DModel(
254-
name=f"{region}_{process}_{variation}", title=process, nbinsx=25, xlow=50, xup=550
272+
name=f"{region}_{process}_{variation}",
273+
title=process,
274+
nbinsx=25,
275+
xlow=50,
276+
xup=550,
255277
)
256278
nominal_histo = df.Histo1D(histo_model, observable, "Weights")
257279

258280
if variation == "nominal":
259281
results.append(
260282
AGCResult(
261-
nominal_histo, region, process, variation, nominal_histo, should_vary=True
283+
nominal_histo,
284+
region,
285+
process,
286+
variation,
287+
nominal_histo,
288+
should_vary=True,
262289
)
263290
)
264291
else:
265292
results.append(
266293
AGCResult(
267-
nominal_histo, region, process, variation, nominal_histo, should_vary=False
294+
nominal_histo,
295+
region,
296+
process,
297+
variation,
298+
nominal_histo,
299+
should_vary=False,
268300
)
269301
)
270302
print(f"Booked histogram {histo_model.fName}")
@@ -292,7 +324,12 @@ def book_histos(
292324
if variation == "nominal":
293325
ml_results.append(
294326
AGCResult(
295-
nominal_histo, feature.name, process, variation, nominal_histo, should_vary=True
327+
nominal_histo,
328+
feature.name,
329+
process,
330+
variation,
331+
nominal_histo,
332+
should_vary=True,
296333
)
297334
)
298335
else:
@@ -382,7 +419,10 @@ def ml_init():
382419
with create_dask_client(args.scheduler, args.ncores, args.hosts, scheduler_address) as client:
383420
for input in inputs:
384421
df = ROOT.RDF.Experimental.Distributed.Dask.RDataFrame(
385-
"Events", input.paths, daskclient=client, npartitions=args.npartitions
422+
"Events",
423+
input.paths,
424+
daskclient=client,
425+
npartitions=args.npartitions,
386426
)
387427
df._headnode.backend.distribute_unique_paths(
388428
[
@@ -426,6 +466,10 @@ def main() -> None:
426466
# To only change the verbosity in a given scope, use ROOT.Experimental.RLogScopedVerbosity.
427467
ROOT.Detail.RDF.RDFLogChannel().SetVerbosity(ROOT.Experimental.ELogLevel.kInfo)
428468

469+
if args.statistical_validation:
470+
fit_histograms(filename=args.output)
471+
return
472+
429473
inputs: list[AGCInput] = retrieve_inputs(
430474
args.n_max_files_per_sample, args.remote_data_prefix, args.data_cache
431475
)
@@ -457,6 +501,9 @@ def main() -> None:
457501
save_histos([r.histo for r in ml_results], output_fname=output_fname)
458502
print(f"Result histograms from ML inference step saved in file {output_fname}")
459503

504+
if not args.no_fitting:
505+
fit_histograms(filename=args.output)
506+
460507

461508
if __name__ == "__main__":
462509
main()

analyses/cms-open-data-ttbar/helpers.h

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,18 @@
1010
#include "TRandom3.h"
1111
#include <Math/Vector4D.h>
1212

13-
// functions creating systematic variations
14-
inline double random_gaus()
15-
{
16-
thread_local std::random_device rd{};
17-
thread_local std::mt19937 gen{rd()};
18-
thread_local std::normal_distribution<double> d{1, 0.05};
19-
return d(gen);
20-
}
21-
22-
inline ROOT::RVecF jet_pt_resolution(std::size_t size)
13+
inline ROOT::RVecF jet_pt_resolution(const ROOT::RVecF &jet_pt)
2314
{
2415
// normal distribution with 5% variations, shape matches jets
25-
ROOT::RVecF res(size);
26-
std::generate(std::begin(res), std::end(res), []()
27-
{ return random_gaus(); });
16+
ROOT::RVecF res(jet_pt.size());
17+
// We need some pseudo-randomness that is thread-safe and does not depend on an RNG. We use the fact that jet_pt is in GeV and extract pseudo-random bits from its sub-GeV (MeV-level) digits.
18+
// We then use the gaussian quantile to compute the resolution according to the input mean and sigma, using the random bits from the floating-point values.
19+
double mean = 1.;
20+
double sigma = 0.05;
21+
for (std::size_t i = 0; i < jet_pt.size(); ++i) {
22+
res[i] = mean + ROOT::Math::gaussian_quantile(static_cast<double>(0.001 * (static_cast<int>(jet_pt[i] * 1000) % 1000)) + 0.0005, sigma);
23+
}
24+
2825
return res;
2926
}
3027

analyses/cms-open-data-ttbar/ml.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from typing import Tuple
44

55
import ROOT
6-
76
from distributed import get_worker
87

98
# histogram bin lower limit to use for each ML input feature
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Validating 'histograms.root' against reference 'reference/histos_1_file_per_process.json'...
2+
All good!
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)