61 changes: 61 additions & 0 deletions .github/workflows/fitting_check.yml
@@ -0,0 +1,61 @@
name: Validate CMS Open Data ttbar analysis

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  run-cms-open-data-ttbar-analysis:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up ROOT environment
        run: |
          sudo apt-get update
          sudo apt-get install -y dpkg-dev cmake g++ gcc binutils libx11-dev libncurses5-dev libssl-dev libxpm-dev \
            libxft-dev libxml2-dev libz-dev libxext-dev python3-dev git libtbb-dev libgif-dev xrootd-client python3
          pip install numpy plotting distributed tqdm uproot
          wget https://root.cern/download/root_v6.32.04.Linux-ubuntu22.04-x86_64-gcc11.4.tar.gz
          tar -xzvf root_v6.32.04.Linux-ubuntu22.04-x86_64-gcc11.4.tar.gz
          source root/bin/thisroot.sh
          echo "ROOT is set up"

      - name: Run Analysis
        run: |
          source root/bin/thisroot.sh
          cd analyses/cms-open-data-ttbar/
          ./validate | tee output.txt

      - name: Compare histograms validation output with expected
        id: histograms
        run: |
          cd analyses/cms-open-data-ttbar/
          if grep -q "Test failed: Histograms validation output does not match expected result." output.txt; then
            echo "Histograms validation failed."
            echo "RESULT_HISTOGRAMS=fail" >> $GITHUB_ENV
            exit 1
          else
            echo "Histograms validation passed."
            echo "RESULT_HISTOGRAMS=pass" >> $GITHUB_ENV
          fi

      - name: Run validation sequences for fitResults
        id: fitresults
        run: |
          cd analyses/cms-open-data-ttbar/
          if grep -q "Test failed: fitResults validation output does not match expected result." output.txt; then
            echo "fitResults validation failed."
            echo "RESULT_FITRESULTS=fail" >> $GITHUB_ENV
            exit 1
          else
            echo "fitResults validation passed."
            echo "RESULT_FITRESULTS=pass" >> $GITHUB_ENV
          fi
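Both comparison steps only pattern-match the ./validate output captured by tee. The same contract, written as a standalone Python sketch (not part of the PR; the path and marker strings are taken from the workflow above):

from pathlib import Path

output = Path("analyses/cms-open-data-ttbar/output.txt").read_text()
for name in ("Histograms", "fitResults"):
    # The ./validate script is expected to print this exact line on failure.
    marker = f"Test failed: {name} validation output does not match expected result."
    print(f"{name} validation:", "fail" if marker in output else "pass")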
@@ -0,0 +1,2 @@
Validating 'histograms.root' against reference 'reference/histos_1_file_per_process.json'...
All good!
3 changes: 3 additions & 0 deletions .gitignore
@@ -7,3 +7,6 @@
data.json
__pycache__
tags
analyses/cms-open-data-ttbar/*.root
analyses/cms-open-data-ttbar/*.table
analyses/cms-open-data-ttbar/statistical_data/*
87 changes: 67 additions & 20 deletions analyses/cms-open-data-ttbar/analysis.py
@@ -4,17 +4,12 @@
from time import time
from typing import Tuple

from distributed import Client, get_worker, LocalCluster, SSHCluster
import ml
from plotting import save_ml_plots, save_plots
import ROOT
from utils import (
    AGCInput,
    AGCResult,
    postprocess_results,
    retrieve_inputs,
    save_histos,
)
from distributed import Client, LocalCluster, SSHCluster, get_worker
from plotting import save_ml_plots, save_plots
from statistical import fit_histograms
from utils import AGCInput, AGCResult, postprocess_results, retrieve_inputs, save_histos

# Using https://atlas-groupdata.web.cern.ch/atlas-groupdata/dev/AnalysisTop/TopDataPreparation/XSection-MC15-13TeV.data
# as a reference. Values are in pb.
@@ -90,7 +85,24 @@ def parse_args() -> argparse.Namespace:
"--hosts",
help="A comma-separated list of worker node hostnames. Only required if --scheduler=dask-ssh, ignored otherwise.",
)
p.add_argument("-v", "--verbose", help="Turn on verbose execution logs.", action="store_true")
p.add_argument(
"-v",
"--verbose",
help="Turn on verbose execution logs.",
action="store_true",
)

p.add_argument(
"--statistical-validation",
help = argparse.SUPPRESS,
action="store_true",
)

p.add_argument(
"--no-fitting",
help="Do not run statistical validation part of the analysis.",
action="store_true",
)

return p.parse_args()
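The hidden --statistical-validation flag uses help=argparse.SUPPRESS, which keeps the option functional while omitting it from the generated help text. A minimal standalone illustration (not part of the PR):

import argparse

p = argparse.ArgumentParser()
p.add_argument("--visible", help="Shown by --help.", action="store_true")
p.add_argument("--hidden", help=argparse.SUPPRESS, action="store_true")

print(p.parse_args(["--hidden"]).hidden)  # True: the flag still parses
p.print_help()                            # only --visible is listed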

@@ -109,7 +121,11 @@ def create_dask_client(scheduler: str, ncores: int, hosts: str, scheduler_address
        sshc = SSHCluster(
            workers,
            connect_options={"known_hosts": None},
            worker_options={"nprocs": ncores, "nthreads": 1, "memory_limit": "32GB"},
            worker_options={
                "nprocs": ncores,
                "nthreads": 1,
                "memory_limit": "32GB",
            },
        )
        return Client(sshc)
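For comparison with the dask-ssh branch above, the local fallback can be sketched like this (an illustration only; LocalCluster is already imported at the top of the file):

from distributed import Client, LocalCluster

def create_local_client(ncores: int) -> Client:
    # One single-threaded worker process per requested core, mirroring the
    # nprocs/nthreads settings used for the SSH cluster.
    lc = LocalCluster(n_workers=ncores, threads_per_worker=1, processes=True)
    return Client(lc)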

@@ -128,7 +144,10 @@ def define_trijet_mass(df: ROOT.RDataFrame) -> ROOT.RDataFrame:
df = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) > 1")

# Build four-momentum vectors for each jet
df = df.Define("Jet_p4", "ConstructP4(Jet_pt_cut, Jet_eta_cut, Jet_phi_cut, Jet_mass_cut)")
df = df.Define(
"Jet_p4",
"ConstructP4(Jet_pt_cut, Jet_eta_cut, Jet_phi_cut, Jet_mass_cut)",
)

# Build trijet combinations
df = df.Define("Trijet_idx", "Combinations(Jet_pt_cut, 3)")
@@ -186,7 +205,7 @@ def book_histos(
    # pt_res_up(jet_pt) - jet resolution systematic
    df = df.Vary(
        "Jet_pt",
        "ROOT::RVec<ROOT::RVecF>{Jet_pt*pt_scale_up(), Jet_pt*jet_pt_resolution(Jet_pt.size())}",
        "ROOT::RVec<ROOT::RVecF>{Jet_pt*pt_scale_up(), Jet_pt*jet_pt_resolution(Jet_pt)}",
        ["pt_scale_up", "pt_res_up"],
    )
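The Vary call registers both systematic variations in a single pass over the data; varied results are materialized later with VariationsFor. A minimal self-contained example of the same RDataFrame API (not part of the PR; assumes a working ROOT installation):

import ROOT

df = ROOT.RDataFrame(5).Define("pt", "float(rdfentry_ + 10)")
# The expression returns one varied value per variation tag.
df = df.Vary("pt", "ROOT::RVecF{pt * 1.03f, pt * 0.97f}", ["up", "down"])

nominal = df.Histo1D("pt")
histos = ROOT.RDF.Experimental.VariationsFor(nominal)
for key in ("nominal", "pt:up", "pt:down"):
    print(key, histos[key].GetMean())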

@@ -240,8 +259,7 @@ def book_histos(
    # Only one b-tagged region required
    # The observable is the total transverse momentum
    # fmt: off
    df4j1b = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) == 1")\
              .Define("HT", "Sum(Jet_pt_cut)")
    df4j1b = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) == 1").Define("HT", "Sum(Jet_pt_cut)")
    # fmt: on

    # Define trijet_mass observable for the 4j2b region (this one is more complicated)
@@ -251,20 +269,34 @@
    results = []
    for df, observable, region in zip([df4j1b, df4j2b], ["HT", "Trijet_mass"], ["4j1b", "4j2b"]):
        histo_model = ROOT.RDF.TH1DModel(
            name=f"{region}_{process}_{variation}", title=process, nbinsx=25, xlow=50, xup=550
            name=f"{region}_{process}_{variation}",
            title=process,
            nbinsx=25,
            xlow=50,
            xup=550,
        )
        nominal_histo = df.Histo1D(histo_model, observable, "Weights")

        if variation == "nominal":
            results.append(
                AGCResult(
                    nominal_histo, region, process, variation, nominal_histo, should_vary=True
                    nominal_histo,
                    region,
                    process,
                    variation,
                    nominal_histo,
                    should_vary=True,
                )
            )
        else:
            results.append(
                AGCResult(
                    nominal_histo, region, process, variation, nominal_histo, should_vary=False
                    nominal_histo,
                    region,
                    process,
                    variation,
                    nominal_histo,
                    should_vary=False,
                )
            )
        print(f"Booked histogram {histo_model.fName}")
@@ -292,7 +324,12 @@ def book_histos(
if variation == "nominal":
ml_results.append(
AGCResult(
nominal_histo, feature.name, process, variation, nominal_histo, should_vary=True
nominal_histo,
feature.name,
process,
variation,
nominal_histo,
should_vary=True,
)
)
else:
@@ -382,7 +419,10 @@ def ml_init():
    with create_dask_client(args.scheduler, args.ncores, args.hosts, scheduler_address) as client:
        for input in inputs:
            df = ROOT.RDF.Experimental.Distributed.Dask.RDataFrame(
                "Events", input.paths, daskclient=client, npartitions=args.npartitions
                "Events",
                input.paths,
                daskclient=client,
                npartitions=args.npartitions,
            )
            df._headnode.backend.distribute_unique_paths(
                [
@@ -426,6 +466,10 @@ def main() -> None:
    # To only change the verbosity in a given scope, use ROOT.Experimental.RLogScopedVerbosity.
    ROOT.Detail.RDF.RDFLogChannel().SetVerbosity(ROOT.Experimental.ELogLevel.kInfo)

    if args.statistical_validation:
        fit_histograms(filename=args.output)
        return

    inputs: list[AGCInput] = retrieve_inputs(
        args.n_max_files_per_sample, args.remote_data_prefix, args.data_cache
    )
@@ -457,6 +501,9 @@
        save_histos([r.histo for r in ml_results], output_fname=output_fname)
        print(f"Result histograms from ML inference step saved in file {output_fname}")

    if not args.no_fitting:
        fit_histograms(filename=args.output)


if __name__ == "__main__":
    main()
23 changes: 10 additions & 13 deletions analyses/cms-open-data-ttbar/helpers.h
@@ -10,21 +10,18 @@
#include "TRandom3.h"
#include <Math/Vector4D.h>

// functions creating systematic variations
inline double random_gaus()
{
   thread_local std::random_device rd{};
   thread_local std::mt19937 gen{rd()};
   thread_local std::normal_distribution<double> d{1, 0.05};
   return d(gen);
}

inline ROOT::RVecF jet_pt_resolution(std::size_t size)
inline ROOT::RVecF jet_pt_resolution(const ROOT::RVecF &jet_pt)
{
   // normal distribution with 5% variations, shape matches jets
   ROOT::RVecF res(size);
   std::generate(std::begin(res), std::end(res), []()
                 { return random_gaus(); });
   ROOT::RVecF res(jet_pt.size());
   // We need pseudo-randomness that is thread-safe and does not depend on shared RNG state.
   // We use the fact that jet_pt is in GeV: its sub-GeV digits (at MeV precision) serve as a
   // deterministic uniform value in (0, 1) for each jet. The Gaussian quantile then maps that
   // uniform value to a resolution factor distributed according to the given mean and sigma.
   double mean = 1.;
   double sigma = 0.05;
   for (std::size_t i = 0; i < jet_pt.size(); ++i) {
      res[i] = mean + ROOT::Math::gaussian_quantile(static_cast<double>(0.001 * (static_cast<int>(jet_pt[i] * 1000) % 1000)) + 0.0005, sigma);
   }

   return res;
}
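The smearing is therefore fully deterministic for a given input: the MeV-scale digits of each pt act as a uniform draw in (0, 1), and the Gaussian quantile maps that onto a normal(mean=1, sigma=0.05) factor. The same numbers can be reproduced from Python via PyROOT's binding of ROOT::Math::gaussian_quantile (a sketch, not part of the PR):

import ROOT

def jet_pt_resolution_py(jet_pt, mean=1.0, sigma=0.05):
    # int(pt * 1000) % 1000 extracts the sub-GeV digits; scaled into (0, 1)
    # they feed the quantile function as a deterministic uniform value.
    return [
        mean + ROOT.Math.gaussian_quantile(0.001 * (int(pt * 1000) % 1000) + 0.0005, sigma)
        for pt in jet_pt
    ]

print(jet_pt_resolution_py([45.123, 73.907, 120.004]))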

1 change: 0 additions & 1 deletion analyses/cms-open-data-ttbar/ml.py
@@ -3,7 +3,6 @@
from typing import Tuple

import ROOT

from distributed import get_worker

# histogram bin lower limit to use for each ML input feature
2 changes: 2 additions & 0 deletions analyses/cms-open-data-ttbar/output_text.txt
@@ -0,0 +1,2 @@
Validating 'histograms.root' against reference 'reference/histos_1_file_per_process.json'...
All good!
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,88 @@
import argparse

import ROOT

# Create an argument parser
parser = argparse.ArgumentParser(description="Run the fitting part of the analysis.")

# Add argument for the number of files per sample (--n-files-per-sample)
parser.add_argument("--n-files-per-sample", type=int, required=True, help="Maximum number of files per sample.")


def get_fit_result(file_path, fit_result_name):
    """Open the ROOT file and retrieve the RooFitResult object."""
    tfile = ROOT.TFile.Open(file_path)
    if not tfile or tfile.IsZombie():
        raise IOError(f"Could not open ROOT file '{file_path}'")
    fit_result = tfile.Get(fit_result_name)
    if not fit_result:
        raise ValueError(f"Fit result '{fit_result_name}' not found in {file_path}")
    return fit_result


def compare_fit_results(result1, result2):
    """Compare the parameter values of two RooFitResults."""
    params1 = result1.floatParsFinal()
    params2 = result2.floatParsFinal()

    # Check for the same number of parameters
    if params1.getSize() != params2.getSize():
        print(f"Number of parameters differ: {params1.getSize()} != {params2.getSize()}")
        print("ERROR: Comparison failed.")
        return

    print("Comparing parameters...")

    ERROR = False

    # Loop over parameters in the first result and compare with the second
    for i in range(params1.getSize()):
        par1 = params1[i]
        # Find the corresponding parameter by name in result2
        par2 = params2.find(par1.GetName())

        if not par2:
            print(f"Parameter '{par1.GetName()}' not found in the second fit result.")
            ERROR = True
            continue

        # Compare values and print differences
        if abs(par1.getVal() - par2.getVal()) < 1e-6:
            print(f"Parameter '{par1.GetName()}' matches: {par1.getVal()}")
        else:
            print(f"Parameter '{par1.GetName()}' differs: {par1.getVal()} != {par2.getVal()}")
            ERROR = True

        # Compare the parameter errors too
        if abs(par1.getError() - par2.getError()) > 1e-6:
            print(f"Parameter '{par1.GetName()}' error differs: {par1.getError()} != {par2.getError()}")
            ERROR = True

    if ERROR:
        print("ERROR: Comparison failed.")


args = parser.parse_args()

number_of_files = args.n_files_per_sample

# Replace these with the paths to your .root files and fit result names
file1 = "./fitResults.root"
file2 = f"./reference/fitResults/fitResults_{number_of_files}_file.root"
fit_result_name_1 = "fitResult" # Fit result in first file
fit_result_name_2 = "fitResult" # Fit result in second file

# Load the fit results from the two files
fit_result_1 = get_fit_result(file1, fit_result_name_1)
fit_result_2 = get_fit_result(file2, fit_result_name_2)

# Compare the fit results
compare_fit_results(fit_result_1, fit_result_2)
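A usage sketch (the comparison script's filename is not shown in this diff; validate_fit_results.py below is a placeholder name):

import subprocess

# Compare the freshly produced fitResults.root against the reference for a
# run that used one file per sample.
subprocess.run(
    ["python", "validate_fit_results.py", "--n-files-per-sample", "1"],
    check=True,
)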