Open
Description
I have a file with tons of histograms, and I need to read almost all of them.
The first step is to select only the histograms. uproot is somewhat faster here, probably because I don't need explicit recursion. Maybe you can help me find a better way to implement
ROOT.TClass(key.GetClassName()).InheritsFrom(ROOT.TH1.Class())
By the way, my actual problem is that uproot is much slower when looping over all the histograms.
The input ROOT file is here: https://cernbox.cern.ch/s/vXi8Dx9cVWx8Net
uproot code:
import time
from collections.abc import Iterable
import uproot
#import logging
#logging.basicConfig(level=0)
def list_object(root_file, selection, strip_cycle=True) -> list[str]:
    """Return the keys of the objects in a ROOT file accepted by ``selection``.

    Args:
        root_file: Either a path (``str``), which is opened with
            ``uproot.open``, or an already opened object exposing
            ``classnames()``.
        selection: Callable ``(key, classname) -> bool``; an object is kept
            when it returns a true value.
        strip_cycle: When true, everything from the first ``";"`` onwards
            (the cycle suffix) is removed from each returned key.

    Returns:
        The selected keys, in the order reported by ``classnames()``.
    """

    def collect(directory) -> list[str]:
        # Build the whole list eagerly so nothing refers to the file once
        # it has been closed.
        keys = [
            key
            for key, classname in directory.classnames().items()
            if selection(key, classname)
        ]
        if strip_cycle:
            keys = [key.split(";")[0] for key in keys]
        return keys

    if isinstance(root_file, str):
        # The file is opened here, so it is also closed here; an object
        # passed in by the caller is left open for the caller to manage.
        with uproot.open(root_file) as opened_file:
            return collect(opened_file)
    return collect(root_file)
def select_histo(key: str, classname: str) -> bool:
    """Tell whether ``classname`` names a histogram-like ROOT class.

    The decision is made on the class name prefix only. ``TH3`` is included
    so that the selection agrees with the ROOT-based variant of this
    benchmark, which accepts every class inheriting from ``TH1``.

    Args:
        key: Key of the object inside the file; not used for the decision,
            but part of the ``selection`` callback signature.
        classname: ROOT class name of the object, e.g. ``"TH1F"``.

    Returns:
        ``True`` for names starting with ``TH1``, ``TH2``, ``TH3`` or
        ``TProfile``; ``False`` otherwise.
    """
    # str.startswith accepts a tuple, so one call covers every prefix.
    return classname.startswith(("TH1", "TH2", "TH3", "TProfile"))
def list_histograms(root_file, strip_cycle=True) -> list[str]:
    """List the histogram keys found in ``root_file``.

    Args:
        root_file: Value forwarded to ``list_object``; any falsy value
            (``None``, ``""``, ...) short-circuits to an empty result.
        strip_cycle: Forwarded unchanged to ``list_object``.

    Returns:
        The keys accepted by ``select_histo``, or ``[]`` when ``root_file``
        is falsy.
    """
    if root_file:
        return list_object(root_file, select_histo, strip_cycle=strip_cycle)
    return []
def iter_common_histograms(root_file, common_histograms: Iterable[str]):
    """Yield ``(name, histogram)`` pairs read from one ROOT file.

    The file is opened once and closed when the generator is exhausted,
    closed early by the consumer, or terminated by an exception.

    Args:
        root_file: Value passed to ``uproot.open``.
        common_histograms: Names of the objects to read, in the order they
            should be yielded.

    Yields:
        Tuples ``(histogram_name, h)`` where ``h`` is what ``f.get`` returned
        for that name.

    Raises:
        ValueError: If ``f.get`` returns ``None`` for a requested name.
    """
    # NOTE(review): yielded objects are assumed to stay usable after the
    # file is closed (i.e. fully read by ``get``) - confirm for your types.
    with uproot.open(root_file) as f:
        for histogram_name in common_histograms:
            h = f.get(histogram_name)
            if h is None:
                raise ValueError(f"Histogram {histogram_name} not found in {f}.")
            yield histogram_name, h
# Benchmark driver (uproot): time the listing step and the read loop separately.
fn = "NTUP_PHYSVAL.43258939._000001.pool.root.1"

# Step 1: collect the histogram names and report how many were found.
start_time = time.perf_counter()
hnames = list_histograms(fn)
print(len(hnames))
elapsed = time.perf_counter() - start_time
print(elapsed)

# Step 2: read every listed histogram once; the loop body is intentionally
# empty because only the read time is of interest.
start_time = time.perf_counter()
for h in iter_common_histograms(fn, hnames):
    pass
elapsed = time.perf_counter() - start_time
print(elapsed)
ROOT code
import time
import ROOT
def list_object(root_file: str | ROOT.TFile, selection) -> list[str]:
    """Recursively collect the paths of the objects accepted by ``selection``.

    Args:
        root_file: Either a path (``str``), which is opened with
            ``ROOT.TFile`` and closed again before returning, or an already
            opened file/directory, which is left open.
        selection: Callable taking a key and returning a true value for
            objects that should be listed. It is only consulted for keys
            that are not directories.

    Returns:
        ``/``-joined paths of the selected objects, relative to the top
        directory. Empty when the file is null or a zombie.
    """
    opened_here = isinstance(root_file, str)
    if opened_here:
        root_file = ROOT.TFile(root_file)
    if not root_file or root_file.IsZombie():
        return []

    # Loop-invariant: look the TDirectory class up once.
    directory_class = ROOT.TDirectory.Class()
    obj_paths = []

    def find_objects(directory, path=""):
        for key in directory.GetListOfKeys():
            obj_name = key.GetName()
            obj_path = f"{path}/{obj_name}" if path else obj_name
            # GetClass looks up the existing TClass instead of constructing
            # a new one per key; it is falsy for unknown class names.
            key_class = ROOT.TClass.GetClass(key.GetClassName())
            if key_class and key_class.InheritsFrom(directory_class):
                # Recurse without rebinding the ``directory`` being iterated.
                find_objects(key.ReadObj(), obj_path)
            elif selection(key):
                obj_paths.append(obj_path)

    try:
        find_objects(root_file)
    finally:
        # Only close what was opened here; the result is plain strings, so
        # nothing refers to the file afterwards.
        if opened_here:
            root_file.Close()
    return obj_paths
def select_histo(key: ROOT.TKey) -> bool:
    """Tell whether the object behind ``key`` is of a class deriving from TH1.

    Only the class name stored in the key is inspected; the object itself is
    not read.

    Args:
        key: Key whose ``GetClassName()`` is checked.

    Returns:
        ``True`` when the class is known and inherits from ``ROOT.TH1``,
        ``False`` otherwise (including unknown class names).
    """
    # TClass.GetClass returns the existing TClass (falsy when the name is
    # unknown) rather than constructing a new TClass for every key.
    key_class = ROOT.TClass.GetClass(key.GetClassName())
    return bool(key_class) and bool(key_class.InheritsFrom(ROOT.TH1.Class()))
def list_histograms(root_file: str | ROOT.TFile) -> list[str]:
    """List the paths of the objects in ``root_file`` accepted by ``select_histo``.

    Args:
        root_file: Forwarded unchanged to ``list_object``.

    Returns:
        Whatever ``list_object`` returns for the ``select_histo`` selection.
    """
    histogram_paths = list_object(root_file, select_histo)
    return histogram_paths
def iter_common_histograms(root_file, common_histograms: list[str]):
    """Yield ``(name, histogram)`` pairs read from one ROOT file.

    Each histogram is detached from the file with ``SetDirectory(0)`` before
    it is yielded and ``Delete()``-d as soon as the consumer asks for the
    next item, so a yielded histogram must not be used after the iteration
    has moved on. The file is closed when the generator finishes, is closed
    early, or raises.

    Args:
        root_file: Value passed to ``ROOT.TFile.Open``.
        common_histograms: Names of the objects to read, in the order they
            should be yielded.

    Yields:
        Tuples ``(histogram_name, h)``.

    Raises:
        OSError: If the file cannot be opened (null or zombie file).
        ValueError: If ``f.Get`` returns a falsy object for a requested name.
    """
    f = ROOT.TFile.Open(root_file)
    if not f or f.IsZombie():
        raise OSError(f"Could not open ROOT file {root_file}.")
    try:
        for histogram_name in common_histograms:
            h = f.Get(histogram_name)
            if not h:
                raise ValueError(f"Histogram {histogram_name} not found in {f.GetName()}.")
            h.SetDirectory(0)
            yield histogram_name, h
            # Deliberately NOT in a ``finally``: if the consumer breaks out
            # of the loop it may still be holding the last histogram.
            h.Delete()
    finally:
        # Histograms were detached via SetDirectory(0) before being yielded.
        f.Close()
# Benchmark driver (ROOT): time the listing step and the read loop separately.
fn = "NTUP_PHYSVAL.43258939._000001.pool.root.1"

# Step 1: collect the histogram paths and report how many were found.
start_time = time.perf_counter()
hnames = list_histograms(fn)
print(len(hnames))
elapsed = time.perf_counter() - start_time
print(elapsed)

# Step 2: read every listed histogram once; the loop body is intentionally
# empty because only the read time is of interest.
start_time = time.perf_counter()
for h in iter_common_histograms(fn, hnames):
    pass
elapsed = time.perf_counter() - start_time
print(elapsed)
Results uproot
16816
0.43960302799951023
16.232253045000107
Results ROOT
16816
0.6675683060002484
2.048757989000478
iter_common_histograms
seems rather pointless here, but in my real use case it is more complicated.
If I uncomment the logging lines I see tons of
DEBUG:fsspec.local:open file: /home/turra/backend-validation/samples/6c/f1/valid1.801168.Py8EG_A14NNPDF23LO_jj_JZ3.merge.NTUP_PHYSVAL.e8514_s4479_s4377_r16304_p6223_p6224_p6225_tid43258939_00/NTUP_PHYSVAL.43258939._000001.pool.root.1