Skip to content

uproot 8 times slower to open many histograms #1417

Open
@wiso

Description

@wiso

I have a file with tons of histograms, and I need to read almost all of them.
The first step is to select only the histograms. uproot is somewhat faster here, probably because I don't need recursion. Maybe you can help me find a better way to implement

ROOT.TClass(key.GetClassName()).InheritsFrom(ROOT.TH1.Class())

Anyway, my actual problem is that when looping over all the histograms, uproot is much slower.
The input ROOT file is here: https://cernbox.cern.ch/s/vXi8Dx9cVWx8Net

uproot code:

import time
from collections.abc import Iterable
import uproot
#import logging
#logging.basicConfig(level=0)

def list_object(root_file, selection, strip_cycle=True) -> list[str]:
    """Return the keys of ``root_file`` accepted by ``selection``.

    Args:
        root_file: Either a path (``str``), which is opened with
            ``uproot.open`` and closed again before returning, or an
            already-open object exposing ``classnames()``; the latter is
            left open for the caller.
        selection: Callable invoked as ``selection(key, classname)``; keys
            for which it returns a truthy value are kept.
        strip_cycle: When true, everything from the first ``;`` onwards is
            removed from each returned key.

    Returns:
        The selected keys, in the iteration order of ``classnames()``.
    """
    if isinstance(root_file, str):
        # A file opened here is owned here: close it once the key listing
        # has been materialised instead of leaking the handle.
        with uproot.open(root_file) as opened_file:
            return list_object(opened_file, selection, strip_cycle=strip_cycle)

    keys = [
        key
        for key, classname in root_file.classnames().items()
        if selection(key, classname)
    ]
    if strip_cycle:
        keys = [key.split(";")[0] for key in keys]
    return keys


def select_histo(key: str, classname: str) -> bool:
    """Tell whether ``classname`` names a histogram-like class.

    A class qualifies when its name begins with ``TH1``, ``TH2`` or
    ``TProfile``. ``key`` is accepted to match the selection-callback
    signature but is not consulted.
    """
    histogram_prefixes = ("TH1", "TH2", "TProfile")
    return classname.startswith(histogram_prefixes)


def list_histograms(root_file, strip_cycle=True) -> list[str]:
    """List the histogram keys of ``root_file``.

    A falsy ``root_file`` (``None``, empty string, ...) yields an empty list
    without touching any file; otherwise the work is delegated to
    ``list_object`` with ``select_histo`` as the filter.
    """
    if root_file:
        return list_object(root_file, select_histo, strip_cycle=strip_cycle)
    return []

def iter_common_histograms(root_file, common_histograms:Iterable[str]):
    """Yield ``(name, object)`` for every requested name in ``root_file``.

    The file is opened once with ``uproot.open`` and closed when the
    generator is exhausted, closed, or garbage-collected.

    Args:
        root_file: Path handed to ``uproot.open``.
        common_histograms: Names to look up, in the order they should be
            yielded.

    Yields:
        Tuples ``(histogram_name, h)`` where ``h`` is whatever
        ``f.get(histogram_name)`` returned.

    Raises:
        ValueError: If a lookup returns ``None``.
    """
    # NOTE(review): assumes the yielded objects are fully read at lookup time
    # and remain usable after the file is closed -- confirm for non-histogram
    # objects before reusing this for other types.
    with uproot.open(root_file) as f:
        for histogram_name in common_histograms:
            h = f.get(histogram_name)
            if h is None:
                raise ValueError(f"Histogram {histogram_name} not found in {f}.")
            yield histogram_name, h


# Input file used for the benchmark.
fn = "NTUP_PHYSVAL.43258939._000001.pool.root.1"
# Timing 1: enumerate the histogram names, then print how many were found
# and the elapsed wall-clock seconds.
start_time = time.perf_counter()
hnames = list_histograms(fn)
print(len(hnames))
print(time.perf_counter() - start_time)

# Timing 2: read every listed histogram once. The loop body is empty on
# purpose so that only the lookup cost is measured; each `h` is the
# (name, object) tuple yielded by iter_common_histograms.
start_time = time.perf_counter()
for h in iter_common_histograms(fn, hnames):
    pass
print(time.perf_counter() - start_time)

ROOT code

import time
import ROOT

def list_object(root_file: str | ROOT.TFile, selection) -> list[str]:
    """Collect the paths of all keys in ``root_file`` accepted by ``selection``.

    Args:
        root_file: A path, which is opened with ``ROOT.TFile``, or an
            already-open file object.
        selection: Callable invoked with each non-directory key; keys for
            which it returns a truthy value are recorded.

    Returns:
        Slash-joined paths of the selected keys, in traversal order. An
        empty list is returned when the file is falsy or reports
        ``IsZombie()``.
    """
    source = ROOT.TFile(root_file) if isinstance(root_file, str) else root_file
    if not source or source.IsZombie():
        return []

    matches = []

    def walk(folder, prefix=""):
        # Depth-first traversal: descend into anything whose class inherits
        # from TDirectory, offer everything else to the selection callback.
        for entry in folder.GetListOfKeys():
            name = entry.GetName()
            full_path = f"{prefix}/{name}" if prefix else name

            entry_class = ROOT.TClass(entry.GetClassName())
            if entry_class.InheritsFrom(ROOT.TDirectory.Class()):
                walk(entry.ReadObj(), full_path)
                continue
            if selection(entry):
                matches.append(full_path)

    walk(source)

    return matches


def select_histo(key: ROOT.TKey) -> bool:
    """Tell whether the class named by ``key`` inherits from ``ROOT.TH1``."""
    key_class = ROOT.TClass(key.GetClassName())
    histogram_base = ROOT.TH1.Class()
    return key_class.InheritsFrom(histogram_base)


def list_histograms(root_file: str | ROOT.TFile) -> list[str]:
    """List the paths of all keys in ``root_file`` accepted by ``select_histo``."""
    histogram_paths = list_object(root_file, selection=select_histo)
    return histogram_paths

def iter_common_histograms(root_file, common_histograms: list[str]):
    """Yield ``(name, histogram)`` for every requested name in ``root_file``.

    The file is opened once with ``ROOT.TFile.Open``. Each object is fetched
    with ``Get``, has ``SetDirectory(0)`` called on it, and is handed to the
    consumer; ``Delete()`` is called on it when the generator is resumed
    for the next item.

    Raises:
        ValueError: If ``Get`` returns a falsy object for a name.
    """
    source = ROOT.TFile.Open(root_file)

    for name in common_histograms:
        hist = source.Get(name)
        if not hist:
            raise ValueError(f"Histogram {name} not found in {source.GetName()}.")
        # NOTE(review): presumably detaches the histogram from the file's
        # ownership so it can be deleted explicitly below -- confirm.
        hist.SetDirectory(0)
        yield name, hist
        # Runs only once the consumer asks for the next item.
        hist.Delete()


# Input file used for the benchmark.
fn = "NTUP_PHYSVAL.43258939._000001.pool.root.1"
# Timing 1: enumerate the histogram paths, then print how many were found
# and the elapsed wall-clock seconds.
start_time = time.perf_counter()
hnames = list_histograms(fn)
print(len(hnames))
print(time.perf_counter() - start_time)
# Timing 2: fetch every listed histogram once. The loop body is empty on
# purpose so that only the read cost is measured; each `h` is the
# (name, histogram) tuple yielded by iter_common_histograms.
start_time = time.perf_counter()
for h in iter_common_histograms(fn, hnames):
    pass
print(time.perf_counter() - start_time)

Results uproot

16816
0.43960302799951023
16.232253045000107

Results ROOT

16816
0.6675683060002484
2.048757989000478

iter_common_histograms may look rather pointless here, but in my real use case it is more complicated.
If I uncomment the logging lines, I see a huge number of messages like this:

DEBUG:fsspec.local:open file: /home/turra/backend-validation/samples/6c/f1/valid1.801168.Py8EG_A14NNPDF23LO_jj_JZ3.merge.NTUP_PHYSVAL.e8514_s4479_s4377_r16304_p6223_p6224_p6225_tid43258939_00/NTUP_PHYSVAL.43258939._000001.pool.root.1

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug (unverified)The problem described would be a bug, but needs to be triaged

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions