Description
Describe the bug
When an IntCategory axis with growth=False and overflow=False is filled with values that fall outside the defined bins (which should be a valid combination, and is useful in certain contexts to 'ignore' invalid values without explicit masking of them), the result can be 0 for the entire histogram, at least when a StrCategory axis and Weight storage are used. This directly appears with using named versions of these axes types in scikit-hep/hist, and produces a stranger error in dask_histogram.
Steps to reproduce
This also includes the reproducer for hist and dask_histogram, to demonstrate some of the discrepancies
import awkward as ak
import dask_awkward as dak
import numpy as np
import hist
from hist.hist import Hist as CHist
from hist.dask import Hist as DHist
import boost_histogram as bh
from boost_histogram import Histogram as BHist
import dask
include_out_of_bounds_bin = False
mpc = ak.from_iter([-1, 93, 185, 74, 48, 67, 231, 128, 170, 213, 120, 17]*1000)
ptc = ak.Array(np.random.normal(loc=200, scale=40, size=len(mpc)))
model_pointlazy = dak.from_awkward(mpc, npartitions=33)
ptlazy = dak.from_awkward(ptc, npartitions=33)
wgtlazy = ak.ones_like(model_pointlazy, dtype=float)
if include_out_of_bounds_bin:
axvals = [-1, 93, 185, 74, 48, 67, 231, 128, 170, 213, 120, 17]
else:
axvals = [93, 185, 74, 48, 67, 231, 128, 170, 213, 120, 17]
common = [
hist.axis.StrCategory(['nominal'], name='systematic', label='Systematic'),
#hist.axis.Regular(100, 20, 520, name='pt', label=r'1st Jet $p_T$ [GeV]'),
hist.storage.Weight(),
]
axesstor = [
hist.axis.IntCategory(axvals,
name="model_point",
label="{'uncategorized': -1, 'TChiZH_200_74': 93, 'TChiZH_250_124': 185, 'TChiZH_300_150': 74, 'TChiZH_400_200': 48, 'TChiZH_450_300': 67, 'TChiZH_600_1': 231, 'TChiZH_600_400': 128, 'TChiZH_750_450': 170, 'TChiZH_800_200': 213, 'TChiZH_900_1': 120, 'TChiZH_1500_550': 17}::Gen Model",
growth=False,
flow=True),
] + common
axesstornoflow = [
hist.axis.IntCategory(axvals,
name="model_point",
label="{'uncategorized': -1, 'TChiZH_200_74': 93, 'TChiZH_250_124': 185, 'TChiZH_300_150': 74, 'TChiZH_400_200': 48, 'TChiZH_450_300': 67, 'TChiZH_600_1': 231, 'TChiZH_600_400': 128, 'TChiZH_750_450': 170, 'TChiZH_800_200': 213, 'TChiZH_900_1': 120, 'TChiZH_1500_550': 17}::Gen Model",
growth=False,
flow=False),
] + common
bint = bh.axis.IntCategory(axvals, growth=False, overflow=True)
bintnoflow = bh.axis.IntCategory(axvals, growth=False, overflow=False)
bstr = bh.axis.StrCategory(['nominal'])
bstor = bh.storage.Weight()
bhistunmasked, bhistmasked = BHist(bint, bstr, storage=bstor), BHist(bint, bstr, storage=bstor)
bhistunmaskednoflow, bhistmaskednoflow = BHist(bintnoflow, bstr, storage=bstor), BHist(bintnoflow, bstr, storage=bstor)
chistunmasked, chistmasked = CHist(*axesstor), CHist(*axesstor)
chistunmaskednoflow, chistmaskednoflow = CHist(*axesstornoflow), CHist(*axesstornoflow)
dhistunmasked, dhistmasked = DHist(*axesstor), DHist(*axesstor)
dhistunmaskednoflow, dhistmaskednoflow = DHist(*axesstornoflow), DHist(*axesstornoflow)
msklazy = (model_pointlazy > -1)
dhistmasked.fill(**{
"model_point": model_pointlazy[msklazy],
"systematic": "nominal",
"weight": wgtlazy[msklazy],
})
dhistunmasked.fill(**{
"model_point": model_pointlazy,
"systematic": "nominal",
"weight": wgtlazy,
})
dhistmaskednoflow.fill(**{
"model_point": model_pointlazy[msklazy],
"systematic": "nominal",
"weight": wgtlazy[msklazy],
})
dhistunmaskednoflow.fill(**{
"model_point": model_pointlazy,
"systematic": "nominal",
"weight": wgtlazy,
})
tocompute = {"dhistmasked": dhistmasked, "dhistunmasked": dhistunmasked,
"dhistmaskednoflow": dhistmaskednoflow, "dhistunmaskednoflow": dhistunmaskednoflow,
"pt": ptlazy, "model_point": model_pointlazy, "wgt": wgtlazy}
comped = dask.compute(tocompute)[0]
model_point, pt, wgt = comped["model_point"], comped["pt"], comped["wgt"]
msk = (model_point > -1)
chistmasked.fill(**{
"model_point": model_point[msk],
"systematic": "nominal",
"weight": wgt[msk],
})
chistunmasked.fill(**{
"model_point": model_point,
"systematic": "nominal",
"weight": wgt,
})
chistmaskednoflow.fill(**{
"model_point": model_point[msk],
"systematic": "nominal",
"weight": wgt[msk],
})
chistunmaskednoflow.fill(**{
"model_point": model_point,
"systematic": "nominal",
"weight": wgt,
})
bhistmasked.fill(
model_point[msk],
"nominal",
weight=wgt[msk],
)
bhistunmasked.fill(
model_point,
"nominal",
weight=wgt,
)
bhistmaskednoflow.fill(
model_point[msk],
"nominal",
weight=wgt[msk],
)
bhistunmaskednoflow.fill(
model_point,
"nominal",
weight=wgt,
)
print_flow = False
print("histogram | BoostHistogram | Hist | DaskHistogram")
for key in ["histmasked", "histmaskednoflow", "histunmasked", "histunmaskednoflow"]:
ckey = f"c{key}"
dkey = f"d{key}"
bkey = f"b{key}"
b = eval(bkey)
c = eval(ckey)
d = comped[dkey]
print(key, b.sum().value, c.sum(flow=print_flow).value, d.sum(flow=print_flow).value)
This produces the following output, where the unmasked fill into the histogram without an overflow on the IntCategory produces 0 sum for (boost-)hist; and a junk-like value in DaskHistogram (presumably the partitioned fill plays an additional role here)
histogram | BoostHistogram | Hist | DaskHistogram
histmasked 11000.0 11000.0 11000.0
histmaskednoflow 11000.0 11000.0 11000.0
histunmasked 11000.0 11000.0 11000.0
histunmaskednoflow 0.0 0.0 7337.0
This bug was reproduced in 1.4.1 of boost-histogram, on an M1 Max chip, using MacOS 14.4; it was originally discovered on AlmaLinux8 (on Fermilab's LPC cluster) using a coffea container. Other principle libraries used are a mix of development versions corresponding mostly to the latest git version from approximately a month ago, and can be provided if necessary, but given the reproduction across different environments, it seems to be a relatively 'stable' bug