@@ -52,3 +52,265 @@ def impute_gaussian(
5252 X [np .invert (zero_mask )] = imputed_values
5353 if not inplace :
5454 return X
55+
56+
57+ # ProDA
58+
59+ # import numpy as np
60+ # from scipy.optimize import minimize, Bounds
61+ # from scipy.stats import norm, invgamma, t
62+
63+
64+ # def pd_lm(
65+ # y,
66+ # dropout_curve_position,
67+ # dropout_curve_scale,
68+ # location_prior_mean,
69+ # location_prior_scale,
70+ # variance_prior_scale,
71+ # variance_prior_df,
72+ # location_prior_df=3,
73+ # verbose=False,
74+ # ):
75+ # """
76+ # Fit a probabilistic dropout model with location and variance priors.
77+
78+ # Args:
79+ # y: numpy array of observations (can contain NaN)
80+ # dropout_curve_position: float, position parameter for dropout curve
81+ # dropout_curve_scale: float, scale parameter for dropout curve
82+ # location_prior_mean: float, prior mean
83+ # location_prior_scale: float, prior scale
84+ # variance_prior_scale: float, prior scale for variance
85+ # variance_prior_df: float, prior degrees of freedom for variance
86+ # location_prior_df: float, degrees of freedom for location prior (default: 3)
87+ # verbose: bool, whether to print messages
88+ # """
89+ # # Setup
90+ # y = np.asarray(y)
91+ # is_missing = np.isnan(y)
92+ # yo = y[~is_missing]
93+ # n_obs = len(yo)
94+ # n_total = len(y)
95+
96+ # # Initial values
97+ # beta_init = location_prior_mean
98+ # sigma2_init = variance_prior_df * variance_prior_scale / (variance_prior_df + 2)
99+
100+ # def dropout_probability(mu):
101+ # """Calculate dropout probability using normal CDF"""
102+ # return norm.cdf((mu - dropout_curve_position) / dropout_curve_scale)
103+
104+ # def objective(params):
105+ # """Negative log likelihood with priors and dropout"""
106+ # beta, sigma2 = params
107+ # if sigma2 <= 0:
108+ # return np.inf
109+
110+ # # Log likelihood for observed values
111+ # ll_obs = np.sum(norm.logpdf(yo, loc=beta, scale=np.sqrt(sigma2)))
112+
113+ # # Dropout likelihood
114+ # print("dropout")
115+ # print(beta)
116+ # print(dropout_probability(beta))
117+ # p_dropout = dropout_probability(beta)
118+ # ll_dropout = np.sum(np.log(p_dropout) * is_missing)
119+
120+ # # Location prior (Student's t)
121+ # ll_loc = np.sum(
122+ # t.logpdf(
123+ # np.repeat(beta, n_total),
124+ # df=location_prior_df,
125+ # loc=location_prior_mean,
126+ # scale=np.sqrt(location_prior_scale),
127+ # )
128+ # )
129+ # # ll_loc = -0.5 * ((beta - location_prior_mean) ** 2) / location_prior_scale**2
130+
131+ # # Variance prior (Inverse chi-squared)
132+ # ll_var = invgamma.logpdf(
133+ # sigma2,
134+ # a=variance_prior_df / 2,
135+ # scale=variance_prior_scale * variance_prior_df / 2,
136+ # ) + np.log(sigma2)
137+ # # ll_var = -(variance_prior_df + 2) * 0.5 * np.log(
138+ # # sigma2
139+ # # ) - variance_prior_df * variance_prior_scale / (2 * sigma2)
140+
141+ # print(ll_obs, ll_dropout, ll_loc, ll_var)
142+ # return -(ll_obs + ll_dropout + ll_loc + ll_var)
143+
144+ # def gradient(params):
145+ # """Analytic gradient of negative log likelihood"""
146+
147+ # # Unpack parameters
148+ # beta, sigma2 = params
149+    #         # R reference formula (inverse Mills ratio): sign(sd) * exp(dnorm(x, mu, abs(sd), log=TRUE) - invprobit(x, mu, sd, log=TRUE))
150+ # x = (beta - dropout_curve_position) / dropout_curve_scale
151+ # imr = norm.logpdf(x) - np.log(1 - norm.cdf(x))
152+ # print("imr:", imr)
153+ # # Calculate beta gradients
154+ # dbeta_p = (
155+ # -(location_prior_df + 1)
156+ # * (beta - location_prior_mean)
157+ # / (
158+ # location_prior_df * location_prior_scale
159+ # + (beta - location_prior_mean) ** 2
160+ # )
161+ # )
162+
163+ # dbeta_o = -np.sum(beta - yo) / sigma2
164+ # dbeta_m = np.sum(imr)
165+ # print(dbeta_p, dbeta_o, dbeta_m)
166+
167+ # # Calculate sigma2 gradients
168+ # dsig2_p = (
169+ # -(1 + variance_prior_df / 2) / sigma2
170+ # + variance_prior_df * variance_prior_scale / (2 * sigma2**2)
171+ # + 1 / sigma2
172+ # )
173+
174+ # dsig2_o = np.sum((beta - yo) ** 2 - sigma2) / (2 * sigma2**2)
175+ # dsig2_m = -np.sum(
176+ # (beta - dropout_curve_position) / (2 * dropout_curve_scale**2) * imr
177+ # )
178+
179+ # print("grad")
180+ # print(dbeta_p, dbeta_o, dbeta_m)
181+ # print(dsig2_p, dsig2_o, dsig2_m)
182+ # return np.array([dbeta_p + dbeta_o + dbeta_m, dsig2_p + dsig2_o + dsig2_m])
183+
184+ # def hessian(params):
185+ # """Analytic Hessian of negative log likelihood"""
186+ # beta, sigma2 = params
187+ # if sigma2 <= 0:
188+ # return np.array([[np.inf, np.inf], [np.inf, np.inf]])
189+
190+ # # Compute dropout-related terms
191+ # p_dropout = dropout_probability(beta)
192+ # z = (beta - dropout_curve_position) / dropout_curve_scale
193+ # pdf_z = norm.pdf(z)
194+
195+ # # Second derivatives for dropout
196+ # d2_beta_dropout = (
197+ # -1
198+ # / dropout_curve_scale**2
199+ # * np.sum(
200+ # (is_missing / p_dropout - (~is_missing) / (1 - p_dropout))
201+ # * (-z * pdf_z)
202+ # + (is_missing / p_dropout**2 + (~is_missing) / (1 - p_dropout) ** 2)
203+ # * pdf_z**2
204+ # )
205+ # )
206+
207+ # # Complete second derivatives
208+ # d2_beta2 = -n_obs / sigma2 + d2_beta_dropout - 1 / location_prior_scale**2
209+
210+ # d2_sigma2 = (
211+ # -np.sum((yo - beta) ** 2) / sigma2**3
212+ # + 0.5 * n_obs / sigma2**2
213+ # + (variance_prior_df + 2) * 0.5 / sigma2**2
214+ # - variance_prior_df * variance_prior_scale / sigma2**3
215+ # )
216+
217+ # d2_beta_sigma2 = -np.sum(yo - beta) / sigma2**2
218+
219+ # H = np.array([[d2_beta2, d2_beta_sigma2], [d2_beta_sigma2, d2_sigma2]])
220+
221+ # return -H
222+
223+ # # Optimize with analytic derivatives
224+ # bounds = Bounds([0, 1e-10], [np.inf, np.inf])
225+ # result = minimize(
226+ # objective,
227+ # x0=[beta_init, sigma2_init],
228+ # method="BFGS",
229+ # jac=gradient,
230+ # # hess=hessian,
231+ # bounds=bounds,
232+ # )
233+ # # result = minimize(
234+ # # objective,
235+ # # x0=[beta_init, sigma2_init],
236+ # # method="L-BFGS-B",
237+ # # # jac=gradient,
238+ # # # hess=hessian,
239+ # # bounds=bounds,
240+ # # )
241+
242+ # if not result.success and verbose:
243+ # print(f"Warning: Optimization did not converge: {result.message}")
244+
245+ # print(objective(result.x))
246+ # # Extract results
247+ # fit_beta, fit_sigma2 = result.x
248+
249+ # # Get variance-covariance matrix from Hessian
250+ # try:
251+ # Var_coef = np.linalg.inv(hessian(result.x))
252+ # except np.linalg.LinAlgError:
253+ # if verbose:
254+ # print("Warning: Could not invert Hessian")
255+ # Var_coef = np.diag([np.inf, np.inf])
256+
257+ # # Calculate effective sample size and degrees of freedom
258+ # fit_sigma2_var = Var_coef[1, 1]
259+ # n_approx = 2 * fit_sigma2**2 / fit_sigma2_var - variance_prior_df
260+ # df_approx = n_approx - 1 + variance_prior_df
261+
262+ # # Calculate unbiased variance estimate
263+ # if df_approx > 30 * len(y) and df_approx > 100:
264+ # df_approx = np.inf
265+ # s2_approx = variance_prior_scale
266+ # else:
267+ # s2_approx = fit_sigma2 * n_approx / (n_approx - 1)
268+
269+ # # Calculate correction factors for variance-covariance matrix
270+ # try:
271+ # var_beta = Var_coef[0, 0]
272+ # correction = np.sqrt(8 * var_beta)
273+ # Var_coef_corrected = correction * Var_coef * correction
274+ # except:
275+ # if verbose:
276+ # print("Warning: Could not compute correction factors")
277+ # Var_coef_corrected = Var_coef
278+
279+ # return {
280+ # "coefficients": fit_beta,
281+ # "sigma2": fit_sigma2,
282+ # "coef_variance_matrix": Var_coef_corrected,
283+ # "n_approx": n_approx,
284+ # "df": df_approx,
285+ # "s2": s2_approx,
286+ # "n_obs": n_obs,
287+ # "dropout_prob": dropout_probability(fit_beta),
288+ # }
289+
290+
291+ # # Example usage:
292+ # if __name__ == "__main__":
293+ # # Generate example data
294+ # np.random.seed(42)
295+ # y = np.random.normal(loc=10, scale=2, size=20)
296+ # y[np.random.rand(20) < 0.3] = np.nan # add missing values
297+
298+ # # Fit model
299+ # result = pd_lm(
300+ # y=y,
301+ # dropout_curve_position=8,
302+ # dropout_curve_scale=-1,
303+ # location_prior_mean=9,
304+ # location_prior_scale=3,
305+ # variance_prior_scale=1,
306+ # variance_prior_df=2,
307+ # verbose=True,
308+ # )
309+
310+ # print("\nResults:")
311+ # print(f"Estimated mean: {result['coefficients']:.3f}")
312+ # print(f"Estimated variance: {result['sigma2']:.3f}")
313+ # print(f"Standard error: {np.sqrt(result['coef_variance_matrix'][0,0]):.3f}")
314+ # print(f"Effective sample size: {result['n_approx']:.1f}")
315+ # print(f"Degrees of freedom: {result['df']:.1f}")
316+ # print(f"Dropout probability: {result['dropout_prob']:.3f}")
0 commit comments