Skip to content

Commit 0f68008

Browse files
authored
Generalized triangular distribution to arbitrary percentiles (#21)
* generalized triangular * simplified triangular code - no need to fit mode
1 parent bf37bec commit 0f68008

File tree

2 files changed

+105
-115
lines changed

2 files changed

+105
-115
lines changed

src/probabilit/distributions.py

Lines changed: 61 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
import scipy as sp
21
import numpy as np
32
import warnings
3+
import scipy as sp
44
from probabilit.modeling import Distribution
55

66

@@ -27,129 +27,87 @@ def PERT(minimum, mode, maximum, gamma=4.0):
2727
return Distribution("beta", a=a, b=b, loc=loc, scale=scale)
2828

2929

30-
def Triangular(p10, mode, p90):
31-
"""Find optimal scipy parametrization given (p10, mode, p90) and
30+
def Triangular(low, mode, high, low_perc=0.1, high_perc=0.9):
31+
"""Find optimal scipy parametrization given (low, mode, high) and
3232
return Distribution("triang", loc=..., scale=..., c=...).
3333
3434
This distribution does *not* work with composite distributions.
3535
The arguments must be numbers, they cannot be other distributions.
3636
3737
Examples
3838
--------
39-
>>> Triangular(p10=1, mode=5, p90=9)
40-
Distribution("triang", loc=-2.236068061140598, scale=14.472136057963969, c=0.5000000024295282)
39+
>>> Triangular(low=1, mode=5, high=9)
40+
Distribution("triang", loc=-2.2360679774997894, scale=14.472135954999578, c=0.5000000000000001)
41+
>>> Triangular(low=1, mode=5, high=9, low_perc=0.25, high_perc=0.75)
42+
Distribution("triang", loc=-8.656854249492383, scale=27.313708498984766, c=0.5)
4143
"""
4244
# A few comments on fitting can be found here:
4345
# https://docs.analytica.com/index.php/Triangular10_50_90
4446

45-
if not (p10 < mode < p90):
46-
raise ValueError(f"Must have {p10=} < {mode=} < {p90=}")
47+
if not (low < mode < high):
48+
raise ValueError(f"Must have {low=} < {mode=} < {high=}")
49+
if not ((0 < low_perc <= 1.0) and (0 <= high_perc < 1.0)):
50+
raise ValueError("Percentiles must be between 0 and 1.")
4751

4852
# Optimize parameters
49-
loc, scale, c = _triang_params_from_perc(p10=p10, mode=mode, p90=p90)
50-
return Distribution("triang", loc=loc, scale=scale, c=c)
51-
52-
53-
def _triang_params_from_perc(p10, mode, p90):
54-
"""Given (p10, mode, p90), finds (shift, scale, c).
55-
56-
Examples
57-
--------
58-
>>> from scipy.stats import triang
59-
>>> import math
60-
>>> dist = triang(loc=-5, scale=13, c=0.85)
61-
>>> loc, scale, c = _triang_params_from_perc(*_triang_extract(dist))
62-
>>> math.isclose(loc, -5, rel_tol=0.001)
63-
True
64-
>>> math.isclose(scale, 13, rel_tol=0.001)
65-
True
66-
>>> math.isclose(c, 0.85, rel_tol=0.001)
67-
True
68-
"""
69-
assert p10 < mode < p90
70-
71-
# Shift and scale inputs before solving optimization problem
72-
spread = p90 - p10
73-
center = (p90 + p10) / 2
74-
p10 = (p10 - center) / spread
75-
mode = (mode - center) / spread
76-
p90 = (p90 - center) / spread
77-
78-
# Given (p10, mode, p90) we need to find a scipy parametrization
79-
# in terms of (loc, scale, c). This cannot be solved analytically.
80-
desired = np.array([p10, mode, p90])
81-
82-
# Initial guess
83-
loc_initial = p10
84-
scale_initial = np.log(p90 - p10)
85-
c_initial = sp.special.logit((mode - p10) / (p90 - p10))
86-
x0 = np.array([loc_initial, scale_initial, c_initial])
87-
88-
# Optimize
89-
result = sp.optimize.minimize(
90-
_triang_objective, x0=x0, args=(desired,), method="BFGS"
53+
loc, scale, c = _fit_triangular_distribution(
54+
low=low,
55+
mode=mode,
56+
high=high,
57+
low_perc=low_perc,
58+
high_perc=high_perc,
9159
)
92-
93-
assert result.fun < 1e-2
94-
# Issues can arise. Determining this beforehand
95-
# is hard, so we simply try to optimize and see if we get close.
96-
if result.fun > 1e-4:
97-
warnings.warn(f"Optimization of triangular params did not converge:\n{result}")
98-
99-
# Extract parameters
100-
loc_opt = result.x[0]
101-
scale_opt = np.exp(result.x[1])
102-
c_opt = sp.special.expit(result.x[2])
103-
104-
# Shift and scale problem back
105-
loc_opt = loc_opt * spread + center
106-
scale_opt = scale_opt * spread
107-
108-
return float(loc_opt), float(scale_opt), float(c_opt)
60+
return Distribution("triang", loc=loc, scale=scale, c=c)
10961

11062

111-
def _triang_extract(triangular):
112-
"""Given a triangular distribution, extract (p10, mode, p90).
63+
def _fit_triangular_distribution(low, mode, high, low_perc=0.10, high_perc=0.90):
64+
"""Returns a tuple (loc, scale, c) to be used with scipy.
11365
11466
Examples
11567
--------
116-
>>> from scipy.stats import triang
117-
>>> dist = triang(loc=-5, scale=13, c=0.6)
118-
>>> p10, mode, p90 = _triang_extract(dist)
119-
>>> mode
120-
2.8
121-
>>> p90
122-
5.4
68+
>>> _fit_triangular_distribution(3, 8, 10, low_perc=0.10, high_perc=0.90)
69+
(-0.207..., 12.53..., 0.65...)
70+
>>> _fit_triangular_distribution(3, 8, 10, low_perc=0.4, high_perc=0.6)
71+
(-27.63..., 65.82..., 0.54...)
72+
>>> _fit_triangular_distribution(3, 8, 10, low_perc=0, high_perc=1.0)
73+
(3.00..., 6.99..., 0.71...)
12374
"""
124-
p10, p90 = triangular.ppf([0.1, 0.9])
125-
loc = triangular.kwds.get("loc", 0)
126-
scale = triangular.kwds.get("scale", 1)
127-
c = triangular.kwds.get("c", 0.5)
128-
mode = loc + scale * c
129-
130-
return float(p10), float(mode), float(p90)
131-
132-
133-
def _triang_objective(parameters, desired):
134-
"""Pass parameters (loc, log(scale), logit(c)) into sp.stats.triang
135-
and return the RMSE between actual and desired (p10, mode, p90)."""
136-
137-
loc, scale, c = parameters
138-
scale = np.exp(scale) # Scale must be positive
139-
c = np.clip(sp.special.expit(c), 0, 1) # C must be between 0 and 1
140-
141-
# Create distribution
142-
triangular = sp.stats.triang(loc=loc, scale=scale, c=c)
143-
144-
# Extract information
145-
p10, mode, p90 = _triang_extract(triangular)
146-
actual = np.array([p10, mode, p90])
147-
148-
if not np.isfinite(actual).all():
149-
return 1e3
15075

151-
# RMSE
152-
return np.sqrt(np.sum((desired - actual) ** 2)) / scale
76+
def triangular_cdf(x, a, b, mode):
77+
"""Calculate CDF of triangular distribution at point x"""
78+
if x <= a:
79+
return x * 0
80+
if x >= b:
81+
return x * 0 + 1.0
82+
if x <= mode:
83+
return ((x - a) ** 2) / ((b - a) * (mode - a))
84+
else:
85+
return 1 - ((b - x) ** 2) / ((b - a) * (b - mode))
86+
87+
def equations(params):
88+
"""System of equations to solve for a and b"""
89+
a, b = params
90+
91+
# Calculate CDFs at the given percentile values
92+
cdf_low = triangular_cdf(low, a, b, mode)
93+
cdf_high = triangular_cdf(high, a, b, mode)
94+
95+
# Return the difference from target percentiles
96+
return (cdf_low - low_perc, cdf_high - high_perc)
97+
98+
# Initial guesses for a and b, the lower and upper bounds for support
99+
a0 = low - abs(mode - low)
100+
b0 = high + abs(high - mode)
101+
102+
# Solve the system of equations
103+
a, b = sp.optimize.fsolve(equations, (a0, b0))
104+
rmse = np.sqrt(np.sum(np.array(equations([a, b])) ** 2))
105+
if rmse > 1e-6:
106+
warnings.warn(f"Optimization of Triangular params has {rmse=}")
107+
108+
# Calculate the relative position of the mode, return (loc, scale, c)
109+
c = (mode - a) / (b - a)
110+
return float(a), float(b - a), float(c)
153111

154112

155113
def _pert_to_beta(minimum, mode, maximum, gamma=4.0):

tests/test_distributions.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from probabilit.distributions import (
2-
_triang_params_from_perc,
3-
_triang_extract,
2+
_fit_triangular_distribution,
43
_pert_to_beta,
54
)
65
import pytest
@@ -9,18 +8,51 @@
98

109

1110
class TestTriangular:
12-
@pytest.mark.parametrize("c", np.linspace(0.5, 0.95, num=7))
13-
@pytest.mark.parametrize("scale", [1, 10, 100, 1000])
14-
def test_triang_params_from_perc(self, c, scale):
11+
@pytest.mark.parametrize("c", [0.1, 0.5, 0.7])
12+
@pytest.mark.parametrize("loc", [-1, 0, 1])
13+
@pytest.mark.parametrize("scale", [1, 10, 25])
14+
@pytest.mark.parametrize("low_perc", [0.01, 0.05, 0.1, 0.2])
15+
def test_triangular_roundstrips(self, c, loc, scale, low_perc):
1516
# Test round-trips
17+
mode = loc + c * scale
18+
high_perc = 0.8
19+
20+
# Get parameters to optimize toward
21+
distr = triang(loc=loc, scale=scale, c=c)
22+
target_low, target_high = distr.ppf([low_perc, high_perc])
23+
24+
# Found parameters
25+
loc_f, scale_f, c_f = _fit_triangular_distribution(
26+
mode=mode,
27+
low=target_low,
28+
high=target_high,
29+
low_perc=low_perc,
30+
high_perc=high_perc,
31+
)
32+
33+
np.testing.assert_allclose([loc_f, scale_f, c_f], [loc, scale, c], atol=1e-8)
34+
35+
@pytest.mark.parametrize("delta", [0.001, 0.01, 0.1, 0.2, 0.3, 0.4])
36+
def test_triangular_roundstrips_squeeze(self, delta):
1637
loc = 0
17-
initial = np.array([loc, scale, c])
18-
dist = triang(loc=loc, scale=scale, c=c)
19-
p10, mode, p90 = _triang_extract(dist)
20-
if (p10 < mode - 0.01) and (p90 > mode + 0.01):
21-
loc, scale, c = _triang_params_from_perc(p10, mode, p90)
22-
final = np.array([loc, scale, c])
23-
np.testing.assert_allclose(final, initial, atol=1e-3)
38+
scale = 10
39+
c = 0.8
40+
mode = loc + c * scale
41+
42+
# Get parameters to optimize toward
43+
distr = triang(loc=loc, scale=scale, c=c)
44+
target_low, target_high = distr.ppf([delta, 1 - delta])
45+
46+
# Found parameters
47+
loc_f, scale_f, c_f = _fit_triangular_distribution(
48+
mode=mode,
49+
low=target_low,
50+
high=target_high,
51+
low_perc=delta,
52+
high_perc=1 - delta,
53+
)
54+
55+
np.testing.assert_allclose([loc_f, scale_f, c_f], [loc, scale, c], atol=1e-8)
2456

2557

2658
class TestPERT:

0 commit comments

Comments
 (0)