Skip to content

Commit f5d9d72

Browse files
committed
[df] Preserve schema of empty output TTree in Snapshot
Previously, the behaviour of Snapshot when no entries passed the selections in the event loop depended on the execution mode:
* In single-threaded mode, Snapshot created an output file with an empty TTree inside, without the user-requested branches.
* In multi-threaded mode, Snapshot created an output file without any TTree inside.

This commit aligns the behaviour of the two execution modes. Furthermore, it ensures that the user-requested dataset schema is preserved even when the output TTree is empty, i.e. all the user-requested branches will be present in the output TTree.
1 parent 324b361 commit f5d9d72

8 files changed

+455
-16
lines changed

Diff for: tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx

+60-9
Original file line numberDiff line numberDiff line change
@@ -1503,6 +1503,10 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, const std::string &i
15031503
}
15041504
}
15051505

1506+
void SetEmptyBranchesHelper(TTree *inputTree, TTree &outputTree, RBranchSet &outputBranches,
1507+
const std::string &inputBranchName, const std::string &outputBranchName,
1508+
const std::type_info &typeInfo, int basketSize);
1509+
15061510
/// Ensure that the TTree with the resulting snapshot can be written to the target TFile. This means checking that the
15071511
/// TFile can be opened in the mode specified in `opts`, deleting any existing TTrees in case
15081512
/// `opts.fOverwriteIfExists = true`, or throwing an error otherwise.
@@ -1618,6 +1622,18 @@ public:
16181622
(void)expander; // avoid unused variable warnings for older compilers such as gcc 4.9
16191623
}
16201624

1625+
template <std::size_t... S>
1626+
void SetEmptyBranches(TTree *inputTree, TTree &outputTree, std::index_sequence<S...>)
1627+
{
1628+
RBranchSet outputBranches{};
1629+
// We use the expander trick rather than a fold expression to avoid hitting clang's bracket depth limit
1630+
int expander[] = {(SetEmptyBranchesHelper(inputTree, outputTree, outputBranches, fInputBranchNames[S],
1631+
fOutputBranchNames[S], typeid(ColTypes), fOptions.fBasketSize),
1632+
0)...,
1633+
0};
1634+
(void)expander;
1635+
}
1636+
16211637
void Initialize()
16221638
{
16231639
fOutputFile.reset(
@@ -1648,6 +1664,12 @@ public:
16481664
assert(fOutputTree != nullptr);
16491665
assert(fOutputFile != nullptr);
16501666

1667+
// There were no entries to fill the TTree with (either the input TTree was empty or no event passed after
1668+
// filtering). We have already created an empty TTree, now also create the branches to preserve the schema
1669+
if (fOutputTree->GetEntries() == 0) {
1670+
using ind_t = std::index_sequence_for<ColTypes...>;
1671+
SetEmptyBranches(fInputTree, *fOutputTree, ind_t{});
1672+
}
16511673
// use AutoSave to flush TTree contents because TTree::Write writes in gDirectory, not in fDirectory
16521674
fOutputTree->AutoSave("flushbaskets");
16531675
// must destroy the TTree first, otherwise TFile will delete it too leading to a double delete
@@ -1715,6 +1737,7 @@ class R__CLING_PTRCHECK(off) SnapshotTTreeHelperMT : public RActionImpl<Snapshot
17151737
std::vector<bool> fIsDefine;
17161738
ROOT::Detail::RDF::RLoopManager *fOutputLoopManager;
17171739
ROOT::Detail::RDF::RLoopManager *fInputLoopManager;
1740+
TFile *fOutputFile; // Non-owning view on the output file
17181741

17191742
public:
17201743
using ColumnTypes_t = TypeList<ColTypes...>;
@@ -1847,39 +1870,67 @@ public:
18471870
(void)expander; // avoid unused parameter warnings (gcc 12.1)
18481871
}
18491872

1873+
template <std::size_t... S>
1874+
void SetEmptyBranches(TTree *inputTree, TTree &outputTree, std::index_sequence<S...>)
1875+
{
1876+
RBranchSet outputBranches{};
1877+
// We use the expander trick rather than a fold expression to avoid hitting clang's bracket depth limit
1878+
int expander[] = {(SetEmptyBranchesHelper(inputTree, outputTree, outputBranches, fInputBranchNames[S],
1879+
fOutputBranchNames[S], typeid(ColTypes), fOptions.fBasketSize),
1880+
0)...,
1881+
0};
1882+
(void)expander;
1883+
}
1884+
18501885
void Initialize()
18511886
{
18521887
const auto cs = ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel);
1853-
auto out_file = TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(), cs);
1854-
if(!out_file)
1888+
auto outFile = std::unique_ptr<TFile>{
1889+
TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(), cs)};
1890+
if (!outFile)
18551891
throw std::runtime_error("Snapshot: could not create output file " + fFileName);
1856-
fMerger = std::make_unique<ROOT::TBufferMerger>(std::unique_ptr<TFile>(out_file));
1892+
fOutputFile = outFile.get();
1893+
fMerger = std::make_unique<ROOT::TBufferMerger>(std::move(outFile));
18571894
}
18581895

18591896
void Finalize()
18601897
{
18611898
assert(std::any_of(fOutputFiles.begin(), fOutputFiles.end(), [](const auto &ptr) { return ptr != nullptr; }));
18621899

1863-
auto fileWritten = false;
18641900
for (auto &file : fOutputFiles) {
18651901
if (file) {
18661902
file->Write();
18671903
file->Close();
1868-
fileWritten = true;
18691904
}
18701905
}
18711906

1872-
if (!fileWritten) {
1873-
Warning("Snapshot",
1874-
"No input entries (input TTree was empty or no entry passed the Filters). Output TTree is empty.");
1907+
// If there were no entries to fill the TTree with (either the input TTree was empty or no event passed after
1908+
// filtering), create an empty TTree in the output file and create the branches to preserve the schema
1909+
auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
1910+
assert(fOutputFile && "Missing output file in Snapshot finalization.");
1911+
if (!fOutputFile->Get(fullTreeName.c_str())) {
1912+
1913+
// First find in which directory we need to write the output TTree
1914+
TDirectory *treeDirectory = fOutputFile;
1915+
if (!fDirName.empty()) {
1916+
treeDirectory = fOutputFile->mkdir(fDirName.c_str(), "", true);
1917+
}
1918+
::TDirectory::TContext c{treeDirectory};
1919+
1920+
// Create the output TTree and create the user-requested branches
1921+
auto outTree =
1922+
std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/treeDirectory);
1923+
using ind_t = std::index_sequence_for<ColTypes...>;
1924+
SetEmptyBranches(fInputLoopManager->GetTree(), *outTree, ind_t{});
1925+
1926+
fOutputFile->Write();
18751927
}
18761928

18771929
// flush all buffers to disk by destroying the TBufferMerger
18781930
fOutputFiles.clear();
18791931
fMerger.reset();
18801932

18811933
// Now connect the data source to the loop manager so it can be used for further processing
1882-
auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
18831934
fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
18841935
}
18851936

Diff for: tree/dataframe/inc/ROOT/RDF/RInterface.hxx

+5-6
Original file line numberDiff line numberDiff line change
@@ -1270,14 +1270,13 @@ public:
12701270
/// sub-directory `subdir` of file `f.root` (creating file and sub-directory as needed).
12711271
///
12721272
/// \attention In multi-thread runs (i.e. when EnableImplicitMT() has been called) threads will loop over clusters of
1273-
/// entries in an undefined order, so Snapshot will produce outputs in which (clusters of) entries will be shuffled with
1274-
/// respect to the input TTree. Using such "shuffled" TTrees as friends of the original trees would result in wrong
1275-
/// associations between entries in the main TTree and entries in the "shuffled" friend. Since v6.22, ROOT will
1273+
/// entries in an undefined order, so Snapshot will produce outputs in which (clusters of) entries will be shuffled
1274+
/// with respect to the input TTree. Using such "shuffled" TTrees as friends of the original trees would result in
1275+
/// wrong associations between entries in the main TTree and entries in the "shuffled" friend. Since v6.22, ROOT will
12761276
/// error out if such a "shuffled" TTree is used in a friendship.
12771277
///
1278-
/// \note In case no events are written out (e.g. because no event passes all filters) the behavior of Snapshot in
1279-
/// single-thread and multi-thread runs is different: in single-thread runs, Snapshot will write out a TTree with
1280-
/// the specified name and zero entries; in multi-thread runs, no TTree object will be written out to disk.
1278+
/// \note In case no events are written out (e.g. because no event passes all filters), Snapshot will still write the
1279+
/// requested output TTree to the file, including all the requested branches, so that the dataset schema is preserved.
12811280
///
12821281
/// \note Snapshot will refuse to process columns with names of the form `#columnname`. These are special columns
12831282
/// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are

Diff for: tree/dataframe/src/RDFActionHelpers.cxx

+97
Original file line numberDiff line numberDiff line change
@@ -297,3 +297,100 @@ void EnsureValidSnapshotRNTupleOutput(const RSnapshotOptions &opts, const std::s
297297
} // end NS RDF
298298
} // end NS Internal
299299
} // end NS ROOT
300+
301+
namespace {
302+
void CreateCStyleArrayBranch(TTree *inputTree, TTree &outputTree, ROOT::Internal::RDF::RBranchSet &outputBranches,
303+
const std::string &inputBranchName, const std::string &outputBranchName, int basketSize)
304+
{
305+
TBranch *inputBranch = nullptr;
306+
if (inputTree) {
307+
inputBranch = inputTree->GetBranch(inputBranchName.c_str());
308+
if (!inputBranch) // try harder
309+
inputBranch = inputTree->FindBranch(inputBranchName.c_str());
310+
}
311+
if (!inputBranch)
312+
return;
313+
const auto STLKind = TClassEdit::IsSTLCont(inputBranch->GetClassName());
314+
if (STLKind == ROOT::ESTLType::kSTLvector || STLKind == ROOT::ESTLType::kROOTRVec)
315+
return;
316+
// must construct the leaflist for the output branch and create the branch in the output tree
317+
const auto *leaf = static_cast<TLeaf *>(inputBranch->GetListOfLeaves()->UncheckedAt(0));
318+
if (!leaf)
319+
return;
320+
const auto bname = leaf->GetName();
321+
auto *sizeLeaf = leaf->GetLeafCount();
322+
const auto sizeLeafName = sizeLeaf ? std::string(sizeLeaf->GetName()) : std::to_string(leaf->GetLenStatic());
323+
324+
// We proceed only if branch is a fixed-or-variable-sized array
325+
if (sizeLeaf || leaf->GetLenStatic() > 1) {
326+
if (sizeLeaf && !outputBranches.Get(sizeLeafName)) {
327+
// The output array branch `bname` has dynamic size stored in leaf `sizeLeafName`, but that leaf has not been
328+
// added to the output tree yet. However, the size leaf has to be available for the creation of the array
329+
// branch to be successful. So we create the size leaf here.
330+
const auto sizeTypeStr = ROOT::Internal::RDF::TypeName2ROOTTypeName(sizeLeaf->GetTypeName());
331+
// Use the custom basket size if one was set (> 0), otherwise reuse the basket size of the input size-leaf branch.
332+
const auto bufSize = (basketSize > 0) ? basketSize : sizeLeaf->GetBranch()->GetBasketSize();
333+
auto *outputBranch = outputTree.Branch(sizeLeafName.c_str(), static_cast<void *>(nullptr),
334+
(sizeLeafName + '/' + sizeTypeStr).c_str(), bufSize);
335+
outputBranches.Insert(sizeLeafName, outputBranch);
336+
}
337+
338+
const auto btype = leaf->GetTypeName();
339+
const auto rootbtype = ROOT::Internal::RDF::TypeName2ROOTTypeName(btype);
340+
if (rootbtype == ' ') {
341+
Warning("Snapshot",
342+
"RDataFrame::Snapshot: could not correctly construct a leaflist for C-style array in column %s. The "
343+
"leaf is of type '%s'. This column will not be written out.",
344+
bname, btype);
345+
return;
346+
}
347+
348+
const auto leaflist = std::string(bname) + "[" + sizeLeafName + "]/" + rootbtype;
349+
// Use the custom basket size if one was set (> 0), otherwise reuse the basket size of the input branch.
350+
const auto bufSize = (basketSize > 0) ? basketSize : inputBranch->GetBasketSize();
351+
auto *outputBranch =
352+
outputTree.Branch(outputBranchName.c_str(), static_cast<void *>(nullptr), leaflist.c_str(), bufSize);
353+
outputBranch->SetTitle(inputBranch->GetTitle());
354+
outputBranches.Insert(outputBranchName, outputBranch);
355+
}
356+
}
357+
} // namespace
358+
359+
void ROOT::Internal::RDF::SetEmptyBranchesHelper(TTree *inputTree, TTree &outputTree,
360+
ROOT::Internal::RDF::RBranchSet &outputBranches,
361+
const std::string &inputBranchName,
362+
const std::string &outputBranchName, const std::type_info &typeInfo,
363+
int basketSize)
364+
{
365+
const auto bufSize = (basketSize > 0) ? basketSize : 32000;
366+
auto *classPtr = TClass::GetClass(typeInfo);
367+
if (!classPtr) {
368+
// Case of a leaflist of fundamental type, logic taken from
369+
// TTree::BranchImpRef(const char* branchname, TClass* ptrClass, EDataType datatype, void* addobj, Int_t bufsize,
370+
// Int_t splitlevel)
371+
auto typeName = ROOT::Internal::RDF::TypeID2TypeName(typeInfo);
372+
auto rootTypeChar = ROOT::Internal::RDF::TypeName2ROOTTypeName(typeName);
373+
if (rootTypeChar == ' ') {
374+
Warning(
375+
"Snapshot",
376+
"RDataFrame::Snapshot: could not correctly construct a leaflist for fundamental type in column %s. This "
377+
"column will not be written out.",
378+
outputBranchName.c_str());
379+
return;
380+
}
381+
std::string leafList{outputBranchName + '/' + rootTypeChar};
382+
auto *outputBranch =
383+
outputTree.Branch(outputBranchName.c_str(), static_cast<void *>(nullptr), leafList.c_str(), bufSize);
384+
outputBranches.Insert(outputBranchName, outputBranch);
385+
return;
386+
}
387+
388+
// Find if there is an input branch, check for cases where we need a leaflist (e.g. C-style arrays)
389+
CreateCStyleArrayBranch(inputTree, outputTree, outputBranches, inputBranchName, outputBranchName, basketSize);
390+
391+
// General case
392+
if (!outputBranches.Get(outputBranchName)) {
393+
auto *outputBranch = outputTree.Branch(outputBranchName.c_str(), classPtr->GetName(), nullptr, bufSize);
394+
outputBranches.Insert(outputBranchName, outputBranch);
395+
}
396+
}

Diff for: tree/dataframe/test/CMakeLists.txt

+8-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,14 @@ if(MSVC)
4848
endif()
4949

5050
if(NOT (MSVC OR (APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES arm64)) OR win_broken_tests OR M1_BROKEN_TESTS)
51-
ROOT_ADD_GTEST(dataframe_snapshot dataframe_snapshot.cxx LIBRARIES ROOTDataFrame)
51+
ROOT_ADD_GTEST(dataframe_snapshot dataframe_snapshot.cxx LIBRARIES ROOTDataFrame GenVector)
52+
endif()
53+
54+
if(NOT MSVC OR win_broken_tests)
55+
ROOT_ADD_GTEST(dataframe_snapshot_emptyoutput dataframe_snapshot_emptyoutput.cxx LIBRARIES ROOTDataFrame GenVector)
56+
ROOT_GENERATE_DICTIONARY(DummyDict ${CMAKE_CURRENT_SOURCE_DIR}/DummyHeader.hxx
57+
MODULE dataframe_snapshot_emptyoutput LINKDEF DummyHeaderLinkDef.hxx OPTIONS -inlineInputHeader
58+
DEPENDENCIES ROOTVecOps GenVector)
5259
endif()
5360

5461
ROOT_ADD_GTEST(dataframe_datasetspec dataframe_datasetspec.cxx LIBRARIES ROOTDataFrame)

Diff for: tree/dataframe/test/DummyHeader.hxx

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#ifndef RDF_TEST_DUMMYHEADER
2+
#define RDF_TEST_DUMMYHEADER
3+
4+
#include <ROOT/RVec.hxx>
5+
#include <Math/Vector4D.h>
6+
7+
#endif // RDF_TEST_DUMMYHEADER

Diff for: tree/dataframe/test/DummyHeaderLinkDef.hxx

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#ifdef __CLING__
2+
3+
#pragma link C++ class ROOT::RVec < ROOT::Math::PtEtaPhiMVector> + ;
4+
5+
#endif

Diff for: tree/dataframe/test/dataframe_snapshot.cxx

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#include <TTreeReader.h>
1616
#include <TTreeReaderArray.h>
1717

18+
#include "DummyHeader.hxx"
19+
1820
using namespace ROOT; // RDataFrame
1921
using namespace ROOT::RDF; // RInterface
2022
using namespace ROOT::VecOps; // RVec

0 commit comments

Comments
 (0)