Description
Check duplicate issues.
- Checked for duplicates
Description
Moved from JIRA: ROOT-10269
First reported on the forum here.
The issue has been mitigated in master and 6.18/02 by disabling parallel TTree::Fill (and therefore the primary cause of nested parallelism during an RDF event loop).
This does not protect against users spawning their own TBB tasks within an RDF event loop. The proper fix is to fully support nested parallelism in RDF, which requires more work. The goal is simply to make nested parallelism in RDF safe.
This is a matter of correctness, even though typically users do not spawn TBB tasks from within the RDF event loop.
Reproducer
Reproducer without jitting (input files are here):
#include <ROOT/RDataFrame.hxx>
#include <TROOT.h>
#include <vector>
#include <string>
int main()
{
ROOT::EnableImplicitMT();
for (UInt_t i = 0 ; i < 100; i++){
std::vector<std::string> input_files;
input_files.emplace_back("test1.root");
input_files.emplace_back("test2.root");
std::vector<std::string> columnList = {"mcChannelNumber", "DeltaPhiJJ", "DeltaPhiMetJJ", "DeltaPhiMetJJ_corr", "DeltaPhiMin3", "DeltaRJJ", "DeltaRJJ_corr", "DeltaR_ratio", "EleWeight", "EleWeightTrig_e24_lhmedium_L1EM20VH_OR_e60_lhmedium_OR_e120_lhloose_OR_e26_lhtight_nod0_ivarloose_OR_e60_lhmedium_nod0_OR_e140_lhloose_nod0", "Electron_charge", "Electron_eta", "Electron_m", "Electron_passOR", "Electron_phi", "Electron_pt", "Electron_signal", "FatJet_eta", "FatJet_m", "FatJet_n_matchedasstrkjets", "FatJet_phi", "FatJet_pt", "ForwardJet_eta", "ForwardJet_m", "ForwardJet_phi", "ForwardJet_pt", "GenWeight", "GenWeightMCSampleMerging", "HtRatioMerged", "HtRatioResolved", "IsMETTrigPassed", "IsSingleElecTrigMatched", "IsSingleElecTrigPassed", "IsSingleMuonTrigMatched", "IsSingleMuonTrigPassed", "JetWeight", "JetWeightBTag", "JetWeightJVT", "Jet_BLight1BPt", "Jet_BLight1TruthLabel", "Jet_BLight1isBjet", "Jet_BLight2BPt", "Jet_BLight2TruthLabel", "Jet_BLight2isBjet", "Jet_HadronConeExclTruthLabelID", "Jet_MV2c10", "Jet_bjet", "Jet_eta", "Jet_m", "Jet_n_MuonInJet", "Jet_passOR", "Jet_phi", "Jet_pt", "MET_TriggerSF", "METsig", "MetTST_met", "MetTST_phi", "MetTST_sumet", "MetTST_OverSqrtHT", "MetTST_OverSqrtSumET", "MetTST_Significance", "MetTST_Significance_Rho", "MetTST_Significance_VarL" , "MetTST_Significance_noPUJets_noSoftTerm", "MetTST_Significance_noPUJets_noSoftTerm_Rho", "MetTST_Significance_noPUJets_noSoftTerm_VarL", "MetTST_Significance_noPUJets_noSoftTerm_muInvis", "MetTST_Significance_noPUJets_noSoftTerm_muInvis_Rho", "MetTST_Significance_noPUJets_noSoftTerm_muInvis_VarL", "MetTSTmuInvis_met", "MetTSTmuInvis_phi", "MetTSTmuInvis_sumet", "MetTrack_met", "MetTrack_phi", "MetTrack_sumet", "MuoWeight", "MuoWeightTrigHLT_mu20_iloose_L1MU15_OR_HLT_mu40", "MuoWeightTrigHLT_mu24_ivarmedium_OR_HLT_mu40", "MuoWeightTrigHLT_mu24_ivarmedium_OR_HLT_mu50", "MuoWeightTrigHLT_mu26_ivarmedium_OR_HLT_mu50", "Muon_charge", "Muon_eta", "Muon_m", "Muon_passOR", "Muon_phi", 
"Muon_pt", "Muon_signal", "N_BJets_04", "N_BTags_associated_02", "N_BTags_not_associated_02", "N_BaselineElectrons", "N_BaselineMuons", "N_ForwardJets04", "N_Jets04", "N_Jets10", "N_SignalElectrons", "N_SignalMuons", "N_SignalTaus", "N_TausExtended_Merged", "N_TausExtended_Resolved", "N_associated_Jets02", "N_not_associated_Jets02", "N_not_associated_Taus", "RandomLumiBlockNumber", "RandomRunNumber", "SUSYFinalState", "TauWeight", "TrackJetWeight", "TrackJet_1TruthLabel", "TrackJet_1isBjet", "TrackJet_1passOR", "TrackJet_2TruthLabel", "TrackJet_2isBjet", "TrackJet_2passOR", "TrackJet_HadronConeExclTruthLabelID", "TrackJet_MV2c10", "TrackJet_bjet", "TrackJet_eta", "TrackJet_isAssociated", "TrackJet_m", "TrackJet_passDRcut", "TrackJet_phi", "TrackJet_pt", "TrigHLT_e120_lhloose", "TrigHLT_e140_lhloose_nod0", "TrigHLT_e24_lhmedium_L1EM20VH", "TrigHLT_e24_lhtight_nod0_ivarloose", "TrigHLT_e26_lhtight_nod0_ivarloose", "TrigHLT_e300_etcut", "TrigHLT_e60_lhmedium", "TrigHLT_e60_lhmedium_nod0", "TrigHLT_e60_medium", "TrigHLT_mu20_iloose_L1MU15", "TrigHLT_mu24_iloose", "TrigHLT_mu24_iloose_L1MU15", "TrigHLT_mu24_imedium", "TrigHLT_mu24_ivarloose", "TrigHLT_mu24_ivarloose_L1MU15", "TrigHLT_mu24_ivarmedium", "TrigHLT_mu26_imedium", "TrigHLT_mu26_ivarmedium", "TrigHLT_mu40", "TrigHLT_mu50", "TrigHLT_mu60_0eta105_msonly", "TrigHLT_xe110_mht_L1XE50", "TrigHLT_xe110_pufit_L1XE50", "TrigHLT_xe110_pufit_L1XE55", "TrigHLT_xe110_pufit_xe70_L1XE50", "TrigHLT_xe70_mht", "TrigHLT_xe90_mht_L1XE50", "TrigMatchHLT_e120_lhloose", "TrigMatchHLT_e140_lhloose_nod0", "TrigMatchHLT_e24_lhmedium_L1EM20VH", "TrigMatchHLT_e24_lhtight_nod0_ivarloose", "TrigMatchHLT_e26_lhtight_nod0_ivarloose", "TrigMatchHLT_e300_etcut", "TrigMatchHLT_e60_lhmedium", "TrigMatchHLT_e60_lhmedium_nod0", "TrigMatchHLT_e60_medium", "TrigMatchHLT_mu20_iloose_L1MU15", "TrigMatchHLT_mu24_iloose", "TrigMatchHLT_mu24_iloose_L1MU15", "TrigMatchHLT_mu24_imedium", "TrigMatchHLT_mu24_ivarloose", "TrigMatchHLT_mu24_ivarloose_L1MU15", 
"TrigMatchHLT_mu24_ivarmedium", "TrigMatchHLT_mu26_imedium", "TrigMatchHLT_mu26_ivarmedium", "TrigMatchHLT_mu40", "TrigMatchHLT_mu50", "TrigMatchHLT_mu60_0eta105_msonly", "TrigMatching", "TruthMET_met", "TruthMET_phi", "TruthMET_sumet", "Vtx_n", "XbbScoreHiggs", "XbbScoreQCD", "XbbScoreTop", "actualInteractionsPerCrossing", "averageInteractionsPerCrossing", "bcid", "corr_avgIntPerX", "eventNumber", "isOppositeCharge", "lumiBlock", "mT_METclosestBJet", "mT_METfarestBJet", "m_J", "m_jj", "m_jj_corr", "m_ll", "muWeight", "mu_density", "pt_ll", "runNumber", "sigjet012ptsum"};
ROOT::RDataFrame("MonoH_Nominal",input_files).
Snapshot<Int_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Double_t, Double_t,std::vector<float>,std::vector<float>,std::vector<float>,std::vector<char>,std::vector<float>,std::vector<float>,std::vector<char>,std::vector<float>,std::vector<float>,std::vector<int>,std::vector<float>,std::vector<float>,std::vector<float>,std::vector<float>,std::vector<float>,std::vector<float>, Double_t, Double_t, Float_t, Float_t, Char_t, Char_t, Char_t, Char_t, Char_t, Double_t, Double_t, Double_t, Float_t, Int_t, Bool_t, Float_t, Int_t, Bool_t,std::vector<int>,std::vector<double>,std::vector<char>,std::vector<float>,std::vector<float>,std::vector<int>,std::vector<char>,std::vector<float>,std::vector<float>, Double_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Float_t, Double_t, Double_t, Double_t, Double_t, Double_t,std::vector<float>,std::vector<float>,std::vector<float>,std::vector<char>,std::vector<float>,std::vector<float>,std::vector<char>, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, Int_t, UInt_t, UInt_t, Int_t, Double_t, Double_t, Int_t, Bool_t, Bool_t, Int_t, Bool_t, Bool_t,std::vector<int>,std::vector<double>,std::vector<char>,std::vector<float>,std::vector<char>,std::vector<float>,std::vector<char>,std::vector<float>,std::vector<float>, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Char_t, Float_t, Float_t, Float_t, Int_t, Float_t, Float_t, Float_t, Float_t, Float_t, UInt_t, Float_t, ULong64_t, Bool_t, UInt_t, Float_t, 
Float_t, Float_t, Float_t, Float_t, Float_t, Double_t, Float_t, Float_t, UInt_t, Float_t>
("test", "test_out.root", columnList)
.GetValue();
}
}
Looking at the stacktraces, it's clear that the problem is interleaved execution of RDataFrame TBB tasks: an RDF task starts and takes a slot number, then ROOT::Internal::TBranchIMTHelper::Wait triggers the start of an inner RDF task that takes a second slot number without returning the first one. This leads to an error in builds with asserts enabled, and to undefined behavior in builds without asserts. Attached is the stacktrace of the relevant thread (not the one that errors out, but the one that steals the last slot number, so that another thread doesn't find it and errors out).
ROOT version
any
Installation method
any
Operating system
any
Additional context
No response