Skip to content

Commit af1d1c8

Browse files
committed
Probabilistic missing-value imputation — first attempt (adds a commented-out draft of a proDA-style probabilistic dropout model)
1 parent a1b5c8e commit af1d1c8

File tree

2 files changed

+452
-0
lines changed

2 files changed

+452
-0
lines changed

orgipy/preprocessing/imputation.py

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,265 @@ def impute_gaussian(
5252
X[np.invert(zero_mask)] = imputed_values
5353
if not inplace:
5454
return X
55+
56+
57+
# ProDA
58+
59+
# import numpy as np
60+
# from scipy.optimize import minimize, Bounds
61+
# from scipy.stats import norm, invgamma, t
62+
63+
64+
# def pd_lm(
65+
# y,
66+
# dropout_curve_position,
67+
# dropout_curve_scale,
68+
# location_prior_mean,
69+
# location_prior_scale,
70+
# variance_prior_scale,
71+
# variance_prior_df,
72+
# location_prior_df=3,
73+
# verbose=False,
74+
# ):
75+
# """
76+
# Fit a probabilistic dropout model with location and variance priors.
77+
78+
# Args:
79+
# y: numpy array of observations (can contain NaN)
80+
# dropout_curve_position: float, position parameter for dropout curve
81+
# dropout_curve_scale: float, scale parameter for dropout curve
82+
# location_prior_mean: float, prior mean
83+
# location_prior_scale: float, prior scale
84+
# variance_prior_scale: float, prior scale for variance
85+
# variance_prior_df: float, prior degrees of freedom for variance
86+
# location_prior_df: float, degrees of freedom for location prior (default: 3)
87+
# verbose: bool, whether to print messages
88+
# """
89+
# # Setup
90+
# y = np.asarray(y)
91+
# is_missing = np.isnan(y)
92+
# yo = y[~is_missing]
93+
# n_obs = len(yo)
94+
# n_total = len(y)
95+
96+
# # Initial values
97+
# beta_init = location_prior_mean
98+
# sigma2_init = variance_prior_df * variance_prior_scale / (variance_prior_df + 2)
99+
100+
# def dropout_probability(mu):
101+
# """Calculate dropout probability using normal CDF"""
102+
# return norm.cdf((mu - dropout_curve_position) / dropout_curve_scale)
103+
104+
# def objective(params):
105+
# """Negative log likelihood with priors and dropout"""
106+
# beta, sigma2 = params
107+
# if sigma2 <= 0:
108+
# return np.inf
109+
110+
# # Log likelihood for observed values
111+
# ll_obs = np.sum(norm.logpdf(yo, loc=beta, scale=np.sqrt(sigma2)))
112+
113+
# # Dropout likelihood
114+
# print("dropout")
115+
# print(beta)
116+
# print(dropout_probability(beta))
117+
# p_dropout = dropout_probability(beta)
118+
# ll_dropout = np.sum(np.log(p_dropout) * is_missing)
119+
120+
# # Location prior (Student's t)
121+
# ll_loc = np.sum(
122+
# t.logpdf(
123+
# np.repeat(beta, n_total),
124+
# df=location_prior_df,
125+
# loc=location_prior_mean,
126+
# scale=np.sqrt(location_prior_scale),
127+
# )
128+
# )
129+
# # ll_loc = -0.5 * ((beta - location_prior_mean) ** 2) / location_prior_scale**2
130+
131+
# # Variance prior (Inverse chi-squared)
132+
# ll_var = invgamma.logpdf(
133+
# sigma2,
134+
# a=variance_prior_df / 2,
135+
# scale=variance_prior_scale * variance_prior_df / 2,
136+
# ) + np.log(sigma2)
137+
# # ll_var = -(variance_prior_df + 2) * 0.5 * np.log(
138+
# # sigma2
139+
# # ) - variance_prior_df * variance_prior_scale / (2 * sigma2)
140+
141+
# print(ll_obs, ll_dropout, ll_loc, ll_var)
142+
# return -(ll_obs + ll_dropout + ll_loc + ll_var)
143+
144+
# def gradient(params):
145+
# """Analytic gradient of negative log likelihood"""
146+
147+
# # Unpack parameters
148+
# beta, sigma2 = params
149+
#         # R reference (from proDA): sign(sd) * exp(dnorm(x, mu, abs(sd), log=TRUE) - invprobit(x, mu, sd, log=TRUE))
150+
# x = (beta - dropout_curve_position) / dropout_curve_scale
151+
#         imr = norm.logpdf(x) - np.log(1 - norm.cdf(x))  # NOTE(review): this is log(IMR), not the inverse Mills ratio itself; summing it directly below (dbeta_m) looks like it needs np.exp(imr) — verify against proDA
152+
# print("imr:", imr)
153+
# # Calculate beta gradients
154+
# dbeta_p = (
155+
# -(location_prior_df + 1)
156+
# * (beta - location_prior_mean)
157+
# / (
158+
# location_prior_df * location_prior_scale
159+
# + (beta - location_prior_mean) ** 2
160+
# )
161+
# )
162+
163+
# dbeta_o = -np.sum(beta - yo) / sigma2
164+
# dbeta_m = np.sum(imr)
165+
# print(dbeta_p, dbeta_o, dbeta_m)
166+
167+
# # Calculate sigma2 gradients
168+
# dsig2_p = (
169+
# -(1 + variance_prior_df / 2) / sigma2
170+
# + variance_prior_df * variance_prior_scale / (2 * sigma2**2)
171+
# + 1 / sigma2
172+
# )
173+
174+
# dsig2_o = np.sum((beta - yo) ** 2 - sigma2) / (2 * sigma2**2)
175+
# dsig2_m = -np.sum(
176+
# (beta - dropout_curve_position) / (2 * dropout_curve_scale**2) * imr
177+
# )
178+
179+
# print("grad")
180+
# print(dbeta_p, dbeta_o, dbeta_m)
181+
# print(dsig2_p, dsig2_o, dsig2_m)
182+
# return np.array([dbeta_p + dbeta_o + dbeta_m, dsig2_p + dsig2_o + dsig2_m])
183+
184+
# def hessian(params):
185+
# """Analytic Hessian of negative log likelihood"""
186+
# beta, sigma2 = params
187+
# if sigma2 <= 0:
188+
# return np.array([[np.inf, np.inf], [np.inf, np.inf]])
189+
190+
# # Compute dropout-related terms
191+
# p_dropout = dropout_probability(beta)
192+
# z = (beta - dropout_curve_position) / dropout_curve_scale
193+
# pdf_z = norm.pdf(z)
194+
195+
# # Second derivatives for dropout
196+
# d2_beta_dropout = (
197+
# -1
198+
# / dropout_curve_scale**2
199+
# * np.sum(
200+
# (is_missing / p_dropout - (~is_missing) / (1 - p_dropout))
201+
# * (-z * pdf_z)
202+
# + (is_missing / p_dropout**2 + (~is_missing) / (1 - p_dropout) ** 2)
203+
# * pdf_z**2
204+
# )
205+
# )
206+
207+
# # Complete second derivatives
208+
# d2_beta2 = -n_obs / sigma2 + d2_beta_dropout - 1 / location_prior_scale**2
209+
210+
# d2_sigma2 = (
211+
# -np.sum((yo - beta) ** 2) / sigma2**3
212+
# + 0.5 * n_obs / sigma2**2
213+
# + (variance_prior_df + 2) * 0.5 / sigma2**2
214+
# - variance_prior_df * variance_prior_scale / sigma2**3
215+
# )
216+
217+
# d2_beta_sigma2 = -np.sum(yo - beta) / sigma2**2
218+
219+
# H = np.array([[d2_beta2, d2_beta_sigma2], [d2_beta_sigma2, d2_sigma2]])
220+
221+
# return -H
222+
223+
# # Optimize with analytic derivatives
224+
# bounds = Bounds([0, 1e-10], [np.inf, np.inf])
225+
# result = minimize(
226+
# objective,
227+
# x0=[beta_init, sigma2_init],
228+
#         method="BFGS",  # NOTE(review): scipy.optimize.minimize ignores `bounds` with method="BFGS"; use "L-BFGS-B" (as in the commented alternative below) if the bounds are meant to be enforced
229+
# jac=gradient,
230+
# # hess=hessian,
231+
# bounds=bounds,
232+
# )
233+
# # result = minimize(
234+
# # objective,
235+
# # x0=[beta_init, sigma2_init],
236+
# # method="L-BFGS-B",
237+
# # # jac=gradient,
238+
# # # hess=hessian,
239+
# # bounds=bounds,
240+
# # )
241+
242+
# if not result.success and verbose:
243+
# print(f"Warning: Optimization did not converge: {result.message}")
244+
245+
# print(objective(result.x))
246+
# # Extract results
247+
# fit_beta, fit_sigma2 = result.x
248+
249+
# # Get variance-covariance matrix from Hessian
250+
# try:
251+
# Var_coef = np.linalg.inv(hessian(result.x))
252+
# except np.linalg.LinAlgError:
253+
# if verbose:
254+
# print("Warning: Could not invert Hessian")
255+
# Var_coef = np.diag([np.inf, np.inf])
256+
257+
# # Calculate effective sample size and degrees of freedom
258+
# fit_sigma2_var = Var_coef[1, 1]
259+
# n_approx = 2 * fit_sigma2**2 / fit_sigma2_var - variance_prior_df
260+
# df_approx = n_approx - 1 + variance_prior_df
261+
262+
# # Calculate unbiased variance estimate
263+
# if df_approx > 30 * len(y) and df_approx > 100:
264+
# df_approx = np.inf
265+
# s2_approx = variance_prior_scale
266+
# else:
267+
# s2_approx = fit_sigma2 * n_approx / (n_approx - 1)
268+
269+
# # Calculate correction factors for variance-covariance matrix
270+
# try:
271+
# var_beta = Var_coef[0, 0]
272+
# correction = np.sqrt(8 * var_beta)
273+
# Var_coef_corrected = correction * Var_coef * correction
274+
# except:
275+
# if verbose:
276+
# print("Warning: Could not compute correction factors")
277+
# Var_coef_corrected = Var_coef
278+
279+
# return {
280+
# "coefficients": fit_beta,
281+
# "sigma2": fit_sigma2,
282+
# "coef_variance_matrix": Var_coef_corrected,
283+
# "n_approx": n_approx,
284+
# "df": df_approx,
285+
# "s2": s2_approx,
286+
# "n_obs": n_obs,
287+
# "dropout_prob": dropout_probability(fit_beta),
288+
# }
289+
290+
291+
# # Example usage:
292+
# if __name__ == "__main__":
293+
# # Generate example data
294+
# np.random.seed(42)
295+
# y = np.random.normal(loc=10, scale=2, size=20)
296+
# y[np.random.rand(20) < 0.3] = np.nan # add missing values
297+
298+
# # Fit model
299+
# result = pd_lm(
300+
# y=y,
301+
# dropout_curve_position=8,
302+
# dropout_curve_scale=-1,
303+
# location_prior_mean=9,
304+
# location_prior_scale=3,
305+
# variance_prior_scale=1,
306+
# variance_prior_df=2,
307+
# verbose=True,
308+
# )
309+
310+
# print("\nResults:")
311+
# print(f"Estimated mean: {result['coefficients']:.3f}")
312+
# print(f"Estimated variance: {result['sigma2']:.3f}")
313+
# print(f"Standard error: {np.sqrt(result['coef_variance_matrix'][0,0]):.3f}")
314+
# print(f"Effective sample size: {result['n_approx']:.1f}")
315+
# print(f"Degrees of freedom: {result['df']:.1f}")
316+
# print(f"Dropout probability: {result['dropout_prob']:.3f}")

0 commit comments

Comments
 (0)