|
12 | 12 | from scores.probability.checks import (
|
13 | 13 | cdf_values_within_bounds,
|
14 | 14 | check_nan_decreasing_inputs,
|
| 15 | + coords_increasing, |
15 | 16 | )
|
16 | 17 | from scores.typing import XarrayLike
|
17 | 18 |
|
@@ -397,3 +398,218 @@ def cdf_envelope(
|
397 | 398 | result.loc["lower"] = np.where(~np.isnan(cdf), cdf_lower, np.nan)
|
398 | 399 |
|
399 | 400 | return result
|
| 401 | + |
| 402 | + |
| 403 | +def check_cdf( |
| 404 | + cdf: xr.DataArray, |
| 405 | + threshold_dim: str, |
| 406 | + dims: Optional[Iterable[str]], |
| 407 | + varname_cdf: str = "cdf", |
| 408 | + varname_threshold_dim: str = "threshold_dim", |
| 409 | + varname_dims: str = "dims", |
| 410 | +): |
| 411 | + """ |
| 412 | + Performs various checks on a CDF values of a real-valued random variable |
| 413 | + in an array `cdf` and its dimensions. Does not check that `cdf` is nondecreasing |
| 414 | + along the `threshold_dim` dimension; |
| 415 | + use the function `nan_decreasing_cdfs` instead. |
| 416 | +
|
| 417 | + Args: |
| 418 | + cdf: array of CDF values to check. |
| 419 | + threshold_dim: name of the threshold dimension in `cdf`. |
| 420 | + dims: if not `None`, a subset of dimensions of `cdf`, not including |
| 421 | + `threshold_dim`. |
| 422 | + varname_cdf: the variable name for `cdf` used in the error message. |
| 423 | + varname_threshold_dim: the variable name for `threshold_dim` used in the error message. |
| 424 | + varname_dims: the variable name for `dims` used in the error message. |
| 425 | +
|
| 426 | + Raises: |
| 427 | + ValueError: if `threshold_dim` is not a dimension of `cdf`. |
| 428 | + ValueError: if `threshold_dim` is in `dims`. |
| 429 | + ValueError: if `dims` is not a subset of `cdf.dims`. |
| 430 | + ValueError: if `cdf` has non-NaN values outside the closed interval [0,1]. |
| 431 | + ValueError: if `cdf[threshold_dim]` coordinates are not increasing. |
| 432 | + """ |
| 433 | + if threshold_dim not in cdf.dims: |
| 434 | + raise ValueError(f"`{varname_threshold_dim}` is not a dimension of `{varname_cdf}`") |
| 435 | + |
| 436 | + if dims is not None: |
| 437 | + if threshold_dim in dims: |
| 438 | + raise ValueError(f"`{varname_threshold_dim}` is in `{varname_dims}`") |
| 439 | + if not set(dims).issubset(cdf.dims): |
| 440 | + raise ValueError(f"`{varname_dims}` is not a subset of dimensions of `{varname_cdf}`") |
| 441 | + |
| 442 | + if not cdf_values_within_bounds(cdf): |
| 443 | + raise ValueError(f"values of `{varname_cdf}` must be in the closed interval [0,1]") |
| 444 | + |
| 445 | + if not coords_increasing(cdf, threshold_dim): |
| 446 | + raise ValueError(f"coordinates along `{varname_threshold_dim}` of `{varname_cdf}` must be increasing") |
| 447 | + |
| 448 | + |
| 449 | +def check_cdf_support( |
| 450 | + cdf: xr.DataArray, |
| 451 | + threshold_dim: str = "threshold", |
| 452 | + lower_supp: float = 0, |
| 453 | + upper_supp: float = np.inf, |
| 454 | + varname_cdf: str = "cdf", |
| 455 | + varname_threshold_dim: str = "threshold_dim", |
| 456 | +): |
| 457 | + """ |
| 458 | + Checks that the CDF of a real-valued random variable is supported on |
| 459 | + the closed interval [lower_supp, upper_supp], i.e., that |
| 460 | +
|
| 461 | + - cdf(x) = 0 when x < `lower_support` |
| 462 | + - cdf(x) = 1 when x > `upper_support` |
| 463 | +
|
| 464 | + for x in `cdf[threshold_dim].values`. |
| 465 | +
|
| 466 | + NaNs are ignored. Tests for "equality" uses `np.allclose`. |
| 467 | +
|
| 468 | + Args: |
| 469 | + cdf: array of CDF values, with threshold dimension `threshold_dim`. |
| 470 | + threshold_dim: name of the threshold dimension in `cdf`. |
| 471 | + lower_supp: left endpoint of the interval of support. Can be `-np.inf`. |
| 472 | + upper_supp: right endpoint of the interval of support. Can be `np.inf`. |
| 473 | + varname_cdf: the variable name for `cdf` used in the error message. |
| 474 | + varname_threshold_dim: the variable name for `cdf` used in the error message. |
| 475 | +
|
| 476 | + Returns: |
| 477 | + True if all checks pass. |
| 478 | +
|
| 479 | + Raises: |
| 480 | + ValueError: if `lower_support` > `upper_support`. |
| 481 | + ValueError: if cdf(x) != 0 when x < `lower_support`. |
| 482 | + ValueError: if cdf(x) != 1 when x > `upper_support`. |
| 483 | + """ |
| 484 | + if upper_supp < lower_supp: |
| 485 | + raise ValueError("`upper_supp < lower_supp`") |
| 486 | + |
| 487 | + upper = cdf.where(cdf[threshold_dim] > upper_supp).values |
| 488 | + lower = cdf.where(cdf[threshold_dim] < lower_supp).values |
| 489 | + |
| 490 | + if not np.allclose(upper[~np.isnan(upper)], 1): |
| 491 | + raise ValueError(f"`{varname_cdf}` is not 1 when `{varname_cdf}[{varname_threshold_dim}] > {upper_supp}`") |
| 492 | + |
| 493 | + if not np.allclose(lower[~np.isnan(lower)], 0): |
| 494 | + raise ValueError(f"`{varname_cdf}` is not 0 when `{varname_cdf}[{varname_threshold_dim}] < {lower_supp}`") |
| 495 | + |
| 496 | + |
| 497 | +def expectedvalue_from_cdf( |
| 498 | + cdf: xr.DataArray, |
| 499 | + threshold_dim: str = "threshold", |
| 500 | + nonnegative_support: bool = True, |
| 501 | +) -> xr.DataArray: |
| 502 | + """ |
| 503 | + Returns the expected value E(Y) of a real-valued random variable Y with |
| 504 | + distribution given by `cdf`. |
| 505 | +
|
| 506 | + The current implementation assumes that Y >= 0, i.e., the CDF has nonnnegative support. |
| 507 | + This assumption may be relaxed in a future implementation. |
| 508 | +
|
| 509 | + The calculation is based on the formula |
| 510 | +
|
| 511 | + E(Y) = integral(1 - cdf(x), x >=0), |
| 512 | +
|
| 513 | + where it is assumed that the CDF is piecewise linear between CDF values given by the |
| 514 | + array `cdf` when x >= 0. |
| 515 | +
|
| 516 | + Args: |
| 517 | + cdf: array of CDF values, with threshold dimension `threshold_dim`. |
| 518 | + threshold_dim: name of the threshold dimension in `cdf`. |
| 519 | + nonnegative_support: `True` if the support of the probability measure |
| 520 | + described by `cdf` is contained in the half line [0, inf). |
| 521 | +
|
| 522 | + Returns: |
| 523 | + xr.DataArray containing the expected value of each CDF in `cdf`, with |
| 524 | + `threshold_dim` collapsed. |
| 525 | +
|
| 526 | + Raises: |
| 527 | + ValueError: if `nonnegative_support = False`. |
| 528 | + ValueError: if `threshold_dim` is not a dimension of `cdf`. |
| 529 | + ValueError: if `cdf` has non-NaN values outside the closed interval [0,1]. |
| 530 | + ValueError: if `cdf[threshold_dim]` coordinates are not increasing. |
| 531 | + ValueError: if `cdf` is not 0 when `cdf[threshold_dim] < 0` and |
| 532 | + `nonnegative_support = True`. |
| 533 | + """ |
| 534 | + if nonnegative_support: |
| 535 | + check_cdf_support(cdf, threshold_dim, 0, np.inf) |
| 536 | + else: |
| 537 | + raise ValueError("This function currently only handles the case when the cdf has nonnegative support") |
| 538 | + |
| 539 | + check_cdf(cdf, threshold_dim, None, "cdf", "threshold_dim", "dims") |
| 540 | + |
| 541 | + # only select CDF values where 0 <= threshold_dim. |
| 542 | + integrand = 1 - cdf.sel({threshold_dim: slice(0, np.inf)}) |
| 543 | + |
| 544 | + return integrand.integrate(threshold_dim) |
| 545 | + |
| 546 | + |
| 547 | +def variance_from_cdf(cdf: xr.DataArray, threshold_dim: str = "threshold") -> xr.DataArray: |
| 548 | + """ |
| 549 | + Returns the variance Var(Y) of a random variable Y with distribution given by `cdf`, |
| 550 | + under the assumption that Y >= 0. The calculation is based on the formula |
| 551 | +
|
| 552 | + Var(Y) = 2 * integral(x * (1 - cdf(x)), x >= 0) + E(Y)^2, |
| 553 | +
|
| 554 | + where it is assumed that the CDF is piecewise linear between CDF values given by the |
| 555 | + array `cdf` when x >= 0. |
| 556 | +
|
| 557 | + Args: |
| 558 | + cdf: array of CDF values, with threshold dimension `threshold_dim`. |
| 559 | + threshold_dim: name of the threshold dimension in `cdf`. |
| 560 | +
|
| 561 | + Returns: |
| 562 | + xr.DataArray containing the variance Var(Y) associated with each CDF in `cdf`, |
| 563 | + with `threshold_dim` collapsed. |
| 564 | +
|
| 565 | + Raises: |
| 566 | + ValueError: if `threshold_dim` is not a dimension of `cdf`. |
| 567 | + ValueError: if `cdf` has non-NaN values outside the closed interval [0,1]. |
| 568 | + ValueError: if `cdf[threshold_dim]` coordinates are not increasing. |
| 569 | + ValueError: if `cdf` is not 0 when `cdf[threshold_dim] < 0`. |
| 570 | + """ |
| 571 | + check_cdf(cdf, threshold_dim, None, "cdf", "threshold_dim", "dims") |
| 572 | + check_cdf_support(cdf, threshold_dim, 0, np.inf) |
| 573 | + |
| 574 | + expected_value = expectedvalue_from_cdf(cdf, threshold_dim) |
| 575 | + |
| 576 | + # only use values of CDF where threshold >= 0. |
| 577 | + cdf = cdf.sel({threshold_dim: slice(0, np.inf)}) |
| 578 | + integral = _var_from_cdf(cdf, threshold_dim) |
| 579 | + result = 2 * integral - expected_value**2 |
| 580 | + |
| 581 | + return result |
| 582 | + |
| 583 | + |
| 584 | +def _var_from_cdf(function_values, threshold_dim): |
| 585 | + """ |
| 586 | + Calculates the integral |
| 587 | +
|
| 588 | + integral(t * (1 - F(t))) |
| 589 | +
|
| 590 | + where it is assumed that the function F is piecewise linear between values in |
| 591 | + given by the array `function_values`. |
| 592 | +
|
| 593 | + If there is a NaN along `threshold_dim`, then NaN is returned. |
| 594 | + """ |
| 595 | + # notation: F_i(t) = m_i * t + b_i whenever x[i-1] <= t <= x[i] |
| 596 | + # and x[i] is a value in `function_values[threshold_dim]`. |
| 597 | + |
| 598 | + x_values = function_values[threshold_dim] |
| 599 | + x_shifted = function_values[threshold_dim].shift(**{threshold_dim: 1}) |
| 600 | + # difference in x values |
| 601 | + diff_xs = x_values - x_shifted |
| 602 | + # difference in function values y_i = F(x[i]) |
| 603 | + diff_ys = function_values - function_values.shift(**{threshold_dim: 1}) |
| 604 | + # gradients m |
| 605 | + m_values = diff_ys / diff_xs |
| 606 | + # intercepts b_i |
| 607 | + b_values = function_values - m_values * x_values |
| 608 | + # integral(t * (1 - F(t))) on the interval (x[i-1], x[i]), for each i, using calculus: |
| 609 | + integral_i = (1 - b_values) * (x_values**2 - x_shifted**2) / 2 - m_values * (x_values**3 - x_shifted**3) / 3 |
| 610 | + |
| 611 | + integral = integral_i.sum(threshold_dim) |
| 612 | + # return NaN if NaN in function_values |
| 613 | + integral = integral.where(~np.isnan(function_values).any(threshold_dim)) |
| 614 | + |
| 615 | + return integral |
0 commit comments