Skip to content

Commit 3eb8ab6

Browse files
committed
Copy of code from Jive
- Add pit - Add pit-tests Resolves #142
1 parent edf760d commit 3eb8ab6

File tree

6 files changed

+1151
-0
lines changed

6 files changed

+1151
-0
lines changed

src/scores/probability/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@
1111
crps_cdf_brier_decomposition,
1212
crps_for_ensemble,
1313
)
14+
from scores.probability.pit import pit
1415
from scores.probability.roc_impl import roc_curve_data

src/scores/probability/functions.py

+216
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from scores.probability.checks import (
1313
cdf_values_within_bounds,
1414
check_nan_decreasing_inputs,
15+
coords_increasing,
1516
)
1617
from scores.typing import XarrayLike
1718

@@ -397,3 +398,218 @@ def cdf_envelope(
397398
result.loc["lower"] = np.where(~np.isnan(cdf), cdf_lower, np.nan)
398399

399400
return result
401+
402+
403+
def check_cdf(
404+
cdf: xr.DataArray,
405+
threshold_dim: str,
406+
dims: Optional[Iterable[str]],
407+
varname_cdf: str = "cdf",
408+
varname_threshold_dim: str = "threshold_dim",
409+
varname_dims: str = "dims",
410+
):
411+
"""
412+
Performs various checks on a CDF values of a real-valued random variable
413+
in an array `cdf` and its dimensions. Does not check that `cdf` is nondecreasing
414+
along the `threshold_dim` dimension;
415+
use the function `nan_decreasing_cdfs` instead.
416+
417+
Args:
418+
cdf: array of CDF values to check.
419+
threshold_dim: name of the threshold dimension in `cdf`.
420+
dims: if not `None`, a subset of dimensions of `cdf`, not including
421+
`threshold_dim`.
422+
varname_cdf: the variable name for `cdf` used in the error message.
423+
varname_threshold_dim: the variable name for `threshold_dim` used in the error message.
424+
varname_dims: the variable name for `dims` used in the error message.
425+
426+
Raises:
427+
ValueError: if `threshold_dim` is not a dimension of `cdf`.
428+
ValueError: if `threshold_dim` is in `dims`.
429+
ValueError: if `dims` is not a subset of `cdf.dims`.
430+
ValueError: if `cdf` has non-NaN values outside the closed interval [0,1].
431+
ValueError: if `cdf[threshold_dim]` coordinates are not increasing.
432+
"""
433+
if threshold_dim not in cdf.dims:
434+
raise ValueError(f"`{varname_threshold_dim}` is not a dimension of `{varname_cdf}`")
435+
436+
if dims is not None:
437+
if threshold_dim in dims:
438+
raise ValueError(f"`{varname_threshold_dim}` is in `{varname_dims}`")
439+
if not set(dims).issubset(cdf.dims):
440+
raise ValueError(f"`{varname_dims}` is not a subset of dimensions of `{varname_cdf}`")
441+
442+
if not cdf_values_within_bounds(cdf):
443+
raise ValueError(f"values of `{varname_cdf}` must be in the closed interval [0,1]")
444+
445+
if not coords_increasing(cdf, threshold_dim):
446+
raise ValueError(f"coordinates along `{varname_threshold_dim}` of `{varname_cdf}` must be increasing")
447+
448+
449+
def check_cdf_support(
450+
cdf: xr.DataArray,
451+
threshold_dim: str = "threshold",
452+
lower_supp: float = 0,
453+
upper_supp: float = np.inf,
454+
varname_cdf: str = "cdf",
455+
varname_threshold_dim: str = "threshold_dim",
456+
):
457+
"""
458+
Checks that the CDF of a real-valued random variable is supported on
459+
the closed interval [lower_supp, upper_supp], i.e., that
460+
461+
- cdf(x) = 0 when x < `lower_support`
462+
- cdf(x) = 1 when x > `upper_support`
463+
464+
for x in `cdf[threshold_dim].values`.
465+
466+
NaNs are ignored. Tests for "equality" uses `np.allclose`.
467+
468+
Args:
469+
cdf: array of CDF values, with threshold dimension `threshold_dim`.
470+
threshold_dim: name of the threshold dimension in `cdf`.
471+
lower_supp: left endpoint of the interval of support. Can be `-np.inf`.
472+
upper_supp: right endpoint of the interval of support. Can be `np.inf`.
473+
varname_cdf: the variable name for `cdf` used in the error message.
474+
varname_threshold_dim: the variable name for `cdf` used in the error message.
475+
476+
Returns:
477+
True if all checks pass.
478+
479+
Raises:
480+
ValueError: if `lower_support` > `upper_support`.
481+
ValueError: if cdf(x) != 0 when x < `lower_support`.
482+
ValueError: if cdf(x) != 1 when x > `upper_support`.
483+
"""
484+
if upper_supp < lower_supp:
485+
raise ValueError("`upper_supp < lower_supp`")
486+
487+
upper = cdf.where(cdf[threshold_dim] > upper_supp).values
488+
lower = cdf.where(cdf[threshold_dim] < lower_supp).values
489+
490+
if not np.allclose(upper[~np.isnan(upper)], 1):
491+
raise ValueError(f"`{varname_cdf}` is not 1 when `{varname_cdf}[{varname_threshold_dim}] > {upper_supp}`")
492+
493+
if not np.allclose(lower[~np.isnan(lower)], 0):
494+
raise ValueError(f"`{varname_cdf}` is not 0 when `{varname_cdf}[{varname_threshold_dim}] < {lower_supp}`")
495+
496+
497+
def expectedvalue_from_cdf(
498+
cdf: xr.DataArray,
499+
threshold_dim: str = "threshold",
500+
nonnegative_support: bool = True,
501+
) -> xr.DataArray:
502+
"""
503+
Returns the expected value E(Y) of a real-valued random variable Y with
504+
distribution given by `cdf`.
505+
506+
The current implementation assumes that Y >= 0, i.e., the CDF has nonnnegative support.
507+
This assumption may be relaxed in a future implementation.
508+
509+
The calculation is based on the formula
510+
511+
E(Y) = integral(1 - cdf(x), x >=0),
512+
513+
where it is assumed that the CDF is piecewise linear between CDF values given by the
514+
array `cdf` when x >= 0.
515+
516+
Args:
517+
cdf: array of CDF values, with threshold dimension `threshold_dim`.
518+
threshold_dim: name of the threshold dimension in `cdf`.
519+
nonnegative_support: `True` if the support of the probability measure
520+
described by `cdf` is contained in the half line [0, inf).
521+
522+
Returns:
523+
xr.DataArray containing the expected value of each CDF in `cdf`, with
524+
`threshold_dim` collapsed.
525+
526+
Raises:
527+
ValueError: if `nonnegative_support = False`.
528+
ValueError: if `threshold_dim` is not a dimension of `cdf`.
529+
ValueError: if `cdf` has non-NaN values outside the closed interval [0,1].
530+
ValueError: if `cdf[threshold_dim]` coordinates are not increasing.
531+
ValueError: if `cdf` is not 0 when `cdf[threshold_dim] < 0` and
532+
`nonnegative_support = True`.
533+
"""
534+
if nonnegative_support:
535+
check_cdf_support(cdf, threshold_dim, 0, np.inf)
536+
else:
537+
raise ValueError("This function currently only handles the case when the cdf has nonnegative support")
538+
539+
check_cdf(cdf, threshold_dim, None, "cdf", "threshold_dim", "dims")
540+
541+
# only select CDF values where 0 <= threshold_dim.
542+
integrand = 1 - cdf.sel({threshold_dim: slice(0, np.inf)})
543+
544+
return integrand.integrate(threshold_dim)
545+
546+
547+
def variance_from_cdf(cdf: xr.DataArray, threshold_dim: str = "threshold") -> xr.DataArray:
548+
"""
549+
Returns the variance Var(Y) of a random variable Y with distribution given by `cdf`,
550+
under the assumption that Y >= 0. The calculation is based on the formula
551+
552+
Var(Y) = 2 * integral(x * (1 - cdf(x)), x >= 0) + E(Y)^2,
553+
554+
where it is assumed that the CDF is piecewise linear between CDF values given by the
555+
array `cdf` when x >= 0.
556+
557+
Args:
558+
cdf: array of CDF values, with threshold dimension `threshold_dim`.
559+
threshold_dim: name of the threshold dimension in `cdf`.
560+
561+
Returns:
562+
xr.DataArray containing the variance Var(Y) associated with each CDF in `cdf`,
563+
with `threshold_dim` collapsed.
564+
565+
Raises:
566+
ValueError: if `threshold_dim` is not a dimension of `cdf`.
567+
ValueError: if `cdf` has non-NaN values outside the closed interval [0,1].
568+
ValueError: if `cdf[threshold_dim]` coordinates are not increasing.
569+
ValueError: if `cdf` is not 0 when `cdf[threshold_dim] < 0`.
570+
"""
571+
check_cdf(cdf, threshold_dim, None, "cdf", "threshold_dim", "dims")
572+
check_cdf_support(cdf, threshold_dim, 0, np.inf)
573+
574+
expected_value = expectedvalue_from_cdf(cdf, threshold_dim)
575+
576+
# only use values of CDF where threshold >= 0.
577+
cdf = cdf.sel({threshold_dim: slice(0, np.inf)})
578+
integral = _var_from_cdf(cdf, threshold_dim)
579+
result = 2 * integral - expected_value**2
580+
581+
return result
582+
583+
584+
def _var_from_cdf(function_values, threshold_dim):
585+
"""
586+
Calculates the integral
587+
588+
integral(t * (1 - F(t)))
589+
590+
where it is assumed that the function F is piecewise linear between values in
591+
given by the array `function_values`.
592+
593+
If there is a NaN along `threshold_dim`, then NaN is returned.
594+
"""
595+
# notation: F_i(t) = m_i * t + b_i whenever x[i-1] <= t <= x[i]
596+
# and x[i] is a value in `function_values[threshold_dim]`.
597+
598+
x_values = function_values[threshold_dim]
599+
x_shifted = function_values[threshold_dim].shift(**{threshold_dim: 1})
600+
# difference in x values
601+
diff_xs = x_values - x_shifted
602+
# difference in function values y_i = F(x[i])
603+
diff_ys = function_values - function_values.shift(**{threshold_dim: 1})
604+
# gradients m
605+
m_values = diff_ys / diff_xs
606+
# intercepts b_i
607+
b_values = function_values - m_values * x_values
608+
# integral(t * (1 - F(t))) on the interval (x[i-1], x[i]), for each i, using calculus:
609+
integral_i = (1 - b_values) * (x_values**2 - x_shifted**2) / 2 - m_values * (x_values**3 - x_shifted**3) / 3
610+
611+
integral = integral_i.sum(threshold_dim)
612+
# return NaN if NaN in function_values
613+
integral = integral.where(~np.isnan(function_values).any(threshold_dim))
614+
615+
return integral

0 commit comments

Comments
 (0)