Skip to content

[DF] Spawning TBB tasks within RDF event loop is racy #15079

Open
@martamaja10

Description

@martamaja10

Check duplicate issues.

  • Checked for duplicates

Description

Moved from JIRA: ROOT-10269

First reported on the forum here.

The issue has been mitigated in master and 6.18/02 by disabling parallel TTree::Fill (and therefore the primary cause of nested parallelism during an RDF event loop).
This does not protect against users spawning their own TBB tasks within an RDF event loop. The proper fix is to fully support nested parallelism in RDF, which requires more work. The goal is simply to make nested parallelism in RDF safe.

This is a matter of correctness, even though typically users do not spawn TBB tasks from within the RDF event loop.

Reproducer

Reproducer without jitting, input files are here:

#include <ROOT/RDataFrame.hxx>
#include <TROOT.h>
#include <vector>
#include <string>

int main()
{
   ROOT::EnableImplicitMT();
   
   for (UInt_t i = 0 ; i < 100; i++){
   	std::vector<std::string> input_files;
   	input_files.emplace_back("test1.root");
   	input_files.emplace_back("test2.root");	
      std::vector<std::string> columnList =  {"mcChannelNumber",  "DeltaPhiJJ",  "DeltaPhiMetJJ",  "DeltaPhiMetJJ_corr",  "DeltaPhiMin3",  "DeltaRJJ",  "DeltaRJJ_corr",  "DeltaR_ratio",  "EleWeight",  "EleWeightTrig_e24_lhmedium_L1EM20VH_OR_e60_lhmedium_OR_e120_lhloose_OR_e26_lhtight_nod0_ivarloose_OR_e60_lhmedium_nod0_OR_e140_lhloose_nod0",  "Electron_charge",  "Electron_eta",  "Electron_m",  "Electron_passOR",  "Electron_phi",  "Electron_pt",  "Electron_signal",  "FatJet_eta",  "FatJet_m",  "FatJet_n_matchedasstrkjets",  "FatJet_phi",  "FatJet_pt",  "ForwardJet_eta",  "ForwardJet_m",  "ForwardJet_phi",  "ForwardJet_pt",  "GenWeight",  "GenWeightMCSampleMerging",  "HtRatioMerged",  "HtRatioResolved",  "IsMETTrigPassed",  "IsSingleElecTrigMatched",  "IsSingleElecTrigPassed",  "IsSingleMuonTrigMatched",  "IsSingleMuonTrigPassed",  "JetWeight",  "JetWeightBTag",  "JetWeightJVT",  "Jet_BLight1BPt",  "Jet_BLight1TruthLabel",  "Jet_BLight1isBjet",  "Jet_BLight2BPt",  "Jet_BLight2TruthLabel",  "Jet_BLight2isBjet",  "Jet_HadronConeExclTruthLabelID",  "Jet_MV2c10",  "Jet_bjet",  "Jet_eta",  "Jet_m",  "Jet_n_MuonInJet",  "Jet_passOR",  "Jet_phi",  "Jet_pt",  "MET_TriggerSF",  "METsig",  "MetTST_met",  "MetTST_phi",  "MetTST_sumet",  "MetTST_OverSqrtHT",  "MetTST_OverSqrtSumET",  "MetTST_Significance",  "MetTST_Significance_Rho",  "MetTST_Significance_VarL" ,  "MetTST_Significance_noPUJets_noSoftTerm",  "MetTST_Significance_noPUJets_noSoftTerm_Rho",  "MetTST_Significance_noPUJets_noSoftTerm_VarL",  "MetTST_Significance_noPUJets_noSoftTerm_muInvis",  "MetTST_Significance_noPUJets_noSoftTerm_muInvis_Rho",  "MetTST_Significance_noPUJets_noSoftTerm_muInvis_VarL",  "MetTSTmuInvis_met",  "MetTSTmuInvis_phi",  "MetTSTmuInvis_sumet",  "MetTrack_met",  "MetTrack_phi",  "MetTrack_sumet",  "MuoWeight",  "MuoWeightTrigHLT_mu20_iloose_L1MU15_OR_HLT_mu40",  "MuoWeightTrigHLT_mu24_ivarmedium_OR_HLT_mu40",  "MuoWeightTrigHLT_mu24_ivarmedium_OR_HLT_mu50",  
"MuoWeightTrigHLT_mu26_ivarmedium_OR_HLT_mu50",  "Muon_charge",  "Muon_eta",  "Muon_m",  "Muon_passOR",  "Muon_phi",  "Muon_pt",  "Muon_signal",  "N_BJets_04",  "N_BTags_associated_02",  "N_BTags_not_associated_02",  "N_BaselineElectrons",  "N_BaselineMuons",  "N_ForwardJets04",  "N_Jets04",  "N_Jets10",  "N_SignalElectrons",  "N_SignalMuons",  "N_SignalTaus",  "N_TausExtended_Merged",  "N_TausExtended_Resolved",  "N_associated_Jets02",  "N_not_associated_Jets02",  "N_not_associated_Taus",  "RandomLumiBlockNumber",  "RandomRunNumber",  "SUSYFinalState",  "TauWeight",  "TrackJetWeight",  "TrackJet_1TruthLabel",  "TrackJet_1isBjet",  "TrackJet_1passOR",  "TrackJet_2TruthLabel",  "TrackJet_2isBjet",  "TrackJet_2passOR",  "TrackJet_HadronConeExclTruthLabelID",  "TrackJet_MV2c10",  "TrackJet_bjet",  "TrackJet_eta",  "TrackJet_isAssociated",  "TrackJet_m",  "TrackJet_passDRcut",  "TrackJet_phi",  "TrackJet_pt",  "TrigHLT_e120_lhloose",  "TrigHLT_e140_lhloose_nod0",  "TrigHLT_e24_lhmedium_L1EM20VH",  "TrigHLT_e24_lhtight_nod0_ivarloose",  "TrigHLT_e26_lhtight_nod0_ivarloose",  "TrigHLT_e300_etcut",  "TrigHLT_e60_lhmedium",  "TrigHLT_e60_lhmedium_nod0",  "TrigHLT_e60_medium",  "TrigHLT_mu20_iloose_L1MU15",  "TrigHLT_mu24_iloose",  "TrigHLT_mu24_iloose_L1MU15",  "TrigHLT_mu24_imedium",  "TrigHLT_mu24_ivarloose",  "TrigHLT_mu24_ivarloose_L1MU15",  "TrigHLT_mu24_ivarmedium",  "TrigHLT_mu26_imedium",  "TrigHLT_mu26_ivarmedium",  "TrigHLT_mu40",  "TrigHLT_mu50",  "TrigHLT_mu60_0eta105_msonly",  "TrigHLT_xe110_mht_L1XE50",  "TrigHLT_xe110_pufit_L1XE50",  "TrigHLT_xe110_pufit_L1XE55",  "TrigHLT_xe110_pufit_xe70_L1XE50",  "TrigHLT_xe70_mht",  "TrigHLT_xe90_mht_L1XE50",  "TrigMatchHLT_e120_lhloose",  "TrigMatchHLT_e140_lhloose_nod0",  "TrigMatchHLT_e24_lhmedium_L1EM20VH",  "TrigMatchHLT_e24_lhtight_nod0_ivarloose",  "TrigMatchHLT_e26_lhtight_nod0_ivarloose",  "TrigMatchHLT_e300_etcut",  "TrigMatchHLT_e60_lhmedium",  "TrigMatchHLT_e60_lhmedium_nod0",  "TrigMatchHLT_e60_medium",  
"TrigMatchHLT_mu20_iloose_L1MU15",  "TrigMatchHLT_mu24_iloose",  "TrigMatchHLT_mu24_iloose_L1MU15",  "TrigMatchHLT_mu24_imedium",  "TrigMatchHLT_mu24_ivarloose",  "TrigMatchHLT_mu24_ivarloose_L1MU15",  "TrigMatchHLT_mu24_ivarmedium",  "TrigMatchHLT_mu26_imedium",  "TrigMatchHLT_mu26_ivarmedium",  "TrigMatchHLT_mu40",  "TrigMatchHLT_mu50",  "TrigMatchHLT_mu60_0eta105_msonly",  "TrigMatching",  "TruthMET_met",  "TruthMET_phi",  "TruthMET_sumet",  "Vtx_n",  "XbbScoreHiggs",  "XbbScoreQCD",  "XbbScoreTop",  "actualInteractionsPerCrossing",  "averageInteractionsPerCrossing",  "bcid",  "corr_avgIntPerX",  "eventNumber",  "isOppositeCharge",  "lumiBlock",  "mT_METclosestBJet",  "mT_METfarestBJet",  "m_J",  "m_jj",  "m_jj_corr",  "m_ll",  "muWeight",  "mu_density",  "pt_ll",  "runNumber",  "sigjet012ptsum"};

   	ROOT::RDataFrame("MonoH_Nominal",input_files).
         Snapshot<Int_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Double_t, Double_t,std::vector<float>,std::vector<float>,std::vector<float>,std::vector<char>,std::vector<float>,std::vector<float>,std::vector<char>,std::vector<float>,std::vector<float>,std::vector<int>,std::vector<float>,std::vector<float>,std::vector<float>,std::vector<float>,std::vector<float>,std::vector<float>, Double_t, Double_t, Float_t, Float_t, Char_t, Char_t, Char_t, Char_t, Char_t, Double_t, Double_t, Double_t, Float_t, Int_t, Bool_t, Float_t, Int_t, Bool_t,std::vector<int>,std::vector<double>,std::vector<char>,std::vector<float>,std::vector<float>,std::vector<int>,std::vector<char>,std::vector<float>,std::vector<float>, Double_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Double_t, Double_t, Double_t, Double_t, Double_t,std::vector<float>,std::vector<float>,std::vector<float>,std::vector<char>,std::vector<float>,std::vector<float>,std::vector<char>, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, UInt_t, UInt_t, Int_t, Double_t, Double_t, Int_t, Bool_t, Bool_t, Int_t, Bool_t, Bool_t,std::vector<int>,std::vector<double>,std::vector<char>,std::vector<float>,std::vector<char>,std::vector<float>,std::vector<char>,std::vector<float>,std::vector<float>, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Float_t, Float_t, Float_t, Int_t, Float_t, Float_t, Float_t, Float_t, Float_t, UInt_t, Float_t, ULong64_t, Bool_t, UInt_t, 
Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Double_t, Float_t, Float_t, UInt_t, Float_t>
            ("test", "test_out.root", columnList)
         .GetValue();
   }
}

Looking at the stack traces, it is clear that the problem is interleaved execution of RDataFrame TBB tasks: an RDF task starts and takes a slot number, then ROOT::Internal::TBranchIMTHelper::Wait triggers the start of an inner RDF task that takes a second slot number without returning the first one. This leads to an error in builds with asserts enabled, and to undefined behavior in builds without asserts. Attached is the stack trace of the relevant thread (not the one that errors out, but the one that steals the last slot number, so that another thread doesn't find it and errors out).

ROOT version

any

Installation method

any

Operating system

any

Additional context

No response

Metadata

Metadata

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions