-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathunfold.py
More file actions
322 lines (275 loc) · 11.7 KB
/
unfold.py
File metadata and controls
322 lines (275 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
from __future__ import print_function, division
import numpy as np
import scipy.interpolate as sci
import scipy.optimize as sco
class Blobel():
"""
Unfolding method after Blobel.
Used names in this class:
- f0(x): True distribution before detection MC simulation. Used to build
the response matrix A by mapping f0(x) to g(y).
- g(y): Measured MC distribution after running the simulation on f0(x).
Used to build the response matrix A by mapping f0(x) to g(y).
- f(x): Unfolded target function of the true distribution. f(x) is build
from the basis functions used to describe f0(x) and the optimal
coefficients aj calculated by the likelihodd fit.
Parameters
----------
bins_meas : array-like
Array which sets the binning for all functions dependent on
the measured varible y (measured MC). The first and last
values define the range of y in which the unfolding operates.
bins_unfold : array-like
Array which sets the binning of the target function f(x)
AFTER the unfolding. The binning is applied to the spline
representation of f(x) to get single points with error
estimation. The first and last values define the range
of x in which the target function is defined.
t : array-like
Array containing the inner knots for the (cubic) bspline
basis functions which are used to represent f0(x). The
spline representation is used to discretize the true MC
f0(x) to build the response matrix Aij. The necessary outer
knots are created by repeating the outermost knots
spline_degree times.
"""
def __init__(self, bins_meas, bins_unfold, t):
self.bins_meas = bins_meas
self.bins_unfold = bins_unfold
# Resulting number of bins
self.n_bins_meas = len(self.bins_meas) - 1
self.n_bins_unfold = len(self.bins_unfold) - 1
# Use cubic splines to represent f0(x) and f(x)
self.spline_deg = 3
## Add necessary outer knots by repeating the first and last knot spline_deg times. Used from splines/bsplines.py.
# Switch the outer knots creation technique
extend = True
pre = np.zeros(self.spline_deg)
post = np.zeros(self.spline_deg)
if extend:
# Extend internal knot t1 at t0 and tn-1 at tn, with increasing distance t1-t0, tn-tn-1 times spline_deg
pre = [(t[0] - (i+1)*(t[1]-t[0])) for i in range(self.spline_deg) ][::-1]
post = [(t[-1] + (i+1)*(t[-1]-t[-2])) for i in range(self.spline_deg)]
else:
# Just repeat the outermost knots p times each
pre.fill(t[0])
post.fill(t[-1])
# Save all knots in one knot array
self.spline_knots = np.concatenate((pre, t, post))
# Number of splines (= number of coefficients aj) used to represent f0(x) and f(x)
self.n_splines = len(self.spline_knots) - self.spline_deg - 1
def build_response_matrix(self, mc_meas, mc_truth):
"""
Builds the response matrix Aij.
i is the number of bins_meas and
j is the number of spline_knots. To get Aij we use the MC truth
and measured MC to track how the BASIS FUNCTIONS pj(x) are mapped to
g(y). This is equivalent to setting all aj = 1. Therefore, the resulting
response matrix is independent of the basis coefficients aj which
represent the true MC distribution f0(x).
The aj are later determined in a likelihood fit to get the best
possible mapping from g(y) to f(x) via A.
Parameters
----------
mc_truth : array-like
True data before the MC detection simulation. f0(x) is
the assumed distribution of the true data.
mc_meas : array-like
MC data after the detection simulation resulting from
mc_truth. g(y) is the measured distribution obtained
from simulation based on the true distribution f0(x) and
describes the measured values obtained from the detector.
Returns
-------
A : array-like
Response matrix, describing the mapping from the basis functions
pj(x) of the true distribution f0(x) = sum_j aj*pj(x) to the
measured distribution g(y). The mapping function is:
vec{g}(y) = mat{A} * vec{p}
or in coordinate form:
gi(y) = sum_j (Aij * pj)
"""
## Init response matrix Aij
# Number of rows is determined by the binning in y of the measured distribution
rows = len(self.bins_meas) - 1
# Each coloum j represents one basis function pj(x), so the number of splines is the number of columns
cols = self.n_splines
self.A = np.zeros([rows, cols])
## Calculate the spline representation of the MC truth f0(x).
# Every entry Aij of A is the probability that the component fj of the distribution of the true variable x is mapped to the component gi(y) of the distribution of the measured variable y.
# Now f0(x) is described by a sum of basis functions pj. Therefore every column Aj is a histogram of mc_meas in y with binning bins_meas, as if f0(x) would be described by only one single basis function pj. So we and up with as many columns as there are basis function used to describe f0(x).
# To get the correct representation, set the histogram weights proportional to the basis functions pj. Because sum_j pj = 1, if we add up all hisotgrams Aj unnormalized, we would get the original distribution mc_meas. But as Aij are probabilities we normalize A so that the row entries add up to one in every row. This way there are noc missing entries in gi = sum_j Aij fj.
# The correct weights (spline coefficients) aj, which really describe f0(x) are found later with the maximum likelihood fit. Here we only create a histogram for every basis function to describe the mapping probabilities. This also makes sure, that the unfolding procedure is independent of the MC truth used to build the response matrix.
self.spline_coeff = np.zeros(cols)
for j in range(self.n_splines):
# Enable only the jth basis spline
self.spline_coeff.fill(0)
self.spline_coeff[j] = 1
# Set the hist weights for the histogram filled in Aj(y) proportional to pj(x). ext=1 cuts off to 0 outside the knots.
tck = (self.spline_knots, self.spline_coeff, self.spline_deg)
basis_hist_weights = sci.splev(mc_truth, tck, ext=1)
# Bin the measured MC with basis_hist_weights according to f0(x)=pj(x). The first and last entry in bins_meas define the borders, points outside are ignored by numpy.histogram.
hist, _ = np.histogram(
mc_meas,
bins=self.bins_meas,
density=False,
weights=basis_hist_weights
)
# Fill the created histogram to the jth column of A. In every column j of A is the binned distribution of g(y) weighted with pj(x).
self.A[:, j] = hist
# Normalization constant to normalize g(y) to 1 to represent a pdf. So every entry in A has to be divided by this number.
# norm = np.dot(
# np.histogram(mc_meas, self.bins_meas, density=False)[0],
# np.diff(self.bins_meas)
# )
# self.A /= norm
# Normalization ansatz by Max Noethe:
rowsums = self.A.sum(axis=1)[:,np.newaxis]
# binwidth = np.diff(self.bins_meas)[:,np.newaxis]
self.A = self.A / rowsums
# self.A = self.A * binwidth
return self.A
def fit_basis_coefficents(self, mc_meas):
"""
Returns the optimal fitting spline coefficents which represent the
unfolded distribution f(x).
The response matrix was created before with the assumption that
aj = 1 for every j. Therefore we get the response matrix A decoupled
from the coefficients aj. The aj are now chosen via a likelihood fit
to describe best the distribution g(y) given by the matrix mapping
equation:
vec{g}(y) = mat{A} * vec{a}
or in coordinate form:
gi(y) = sum_j (Aij * aj)
The whole information about the true MC distribution f0(x) is already
included in the response matrix A at this stage.
Parameters
----------
mc_meas : array-like
MC data after the detection simulation resulting from
mc_truth. g(y) is the measured distribution obtained
from simulation based on the true distribution f0(x) and
describes the measured values obtained from the detector.
Returns
-------
tck : tuple
Tuple of (spline_knots, spline_true_coeff, spline_degree) from
the fit describing optimally the mapping from f(x) to g(y). Use
scipy.splev(x, tck, ext=1)
to obtain the unfolded true function f(x).
"""
# Get the binned measured values gi(y) from the measured MC distribtuion
g_meas, _ = np.histogram(
mc_meas,
bins=self.bins_meas,
density=False
)
# Fit coefficients a, starting from aj = 1 for every j. Coefficients must be positive.
# bounds = [[0, None] for i in range(self.n_splines)]
x0 = np.ones_like(self.spline_coeff)
opt_res = sco.minimize(
# self.chisquared,
self.negllh,
x0=x0,
args=(g_meas),
# jac=False,
# bounds=bounds,
method="Nelder-Mead"
)
# Get the basis function coefficients from the fit result
spline_true_coeff = opt_res.x
# Return tck tuple for scipy.interpolate.splev. The tuple tck completely describes the unfolded function f(x) by its spline representation.
return (self.spline_knots, spline_true_coeff, self.spline_deg)
def chisquared(self, a, g_meas):
"""
Returns the chi-square function value for given values
of the basis function coefficients aj and the measured
values g_meas.
The fitted values g_fitted are calculated by the
matrix equation:
vec{g}_fitted(a) = mat{A} * vec{a}
Parameters
----------
a : array-like
Coefficent array at which the chi-square is evaluated.
g_meas : array-like
Contains measured MC values.
Returns
-------
chi2 : float
Single value of the chi-square function for the given parameters.
"""
# Only for unbounded methods: push up the negllh for parameters a<0 to effectively restrict to positive values of a
if np.any(a < 0):
return np.inf
# Calculate the value of g(y) for the current coefficents
g_fitted = np.dot(self.A, a)
# chi-square value to be minimized
return np.sum((g_meas - g_fitted)**2)
def negllh(self, a, g_meas):
"""
Returns the negative log-likelihood function value for given
values of the basis function coefficients aj and the measured
values g_meas.
The fitted values g_fitted are calculated by the
matrix equation:
vec{g}_fitted(a) = mat{A} * vec{a}
Parameters
----------
a : array-like
Coefficent array at which the chi-square is evaluated.
g_meas : array-like
Contains measured MC values.
Returns
-------
negllh : float
Single value of the neg log-likelihood function for the given
parameters.
"""
# Only for unbounded methods: push up the negllh for parameters a<0 to effectively restrict to positive values of a
if np.any(a < 0):
return np.inf
# Calculate the value of g(y) for the current coefficents
g_fitted = np.dot(self.A, a)
## Catch log(0) cases
# Init with 0. If none of the below is true this is either the case lim x->0 (x*log x) = 0 or 0*log(non-zero) = 0
measxlog = np.zeros_like(g_fitted)
for i in range(len(g_fitted)):
# Case: non-zero * log(0) = -inf -> return +inf for negllh
if (g_meas[i]>0) & (g_fitted[i]<=0):
return np.inf
# Case: if both values are non-zero/finite then just calculate normally
if (g_meas[i]>0) & (g_fitted[i]>0):
measxlog[i] = g_meas[i] * np.log(g_fitted[i])
return np.sum( g_fitted - measxlog )
def __str__(self):
"""
Returns a string containing pre-formatted information about the
unfolding parameters given at class creation.
"""
return "\n" \
"## Unfolding parameters: \n" \
"# Measured g(y) binning:\n" \
" {}\n" \
"# Resulting num of y bins: {}\n" \
"# Unfolded f(x) binning:\n" \
" {}\n" \
"# Resulting num of x bins: {}\n" \
"# Spline knots to represent f0(x):\n" \
" Inner: {}\n" \
" Outer: {}\n" \
"# Spline degree: {}\n" \
"# Resulting num of splines: {}\n" \
"".format(
self.bins_meas,
self.n_bins_meas,
self.bins_unfold,
self.n_bins_unfold,
self.spline_knots[self.spline_deg:-self.spline_deg],
np.concatenate((
self.spline_knots[:self.spline_deg],
self.spline_knots[-self.spline_deg:]
)),
self.spline_deg,
self.n_splines,
)