diff --git a/src/calicost/hmm_NB_BB_nophasing_float.py b/src/calicost/hmm_NB_BB_nophasing_float.py new file mode 100644 index 0000000..f4f6bcf --- /dev/null +++ b/src/calicost/hmm_NB_BB_nophasing_float.py @@ -0,0 +1,403 @@ +import logging +import numpy as np +from numba import njit +from scipy.stats import norm, multivariate_normal, poisson +import scipy.special +from scipy.optimize import minimize +from scipy.optimize import Bounds +from sklearn.mixture import GaussianMixture +from tqdm import trange +import statsmodels.api as sm +from statsmodels.base.model import GenericLikelihoodModel +import copy +from calicost.utils_distribution_fitting import * +from calicost.utils_hmm import * +import networkx as nx + +from scipy.special import gammaln + +try: + import torch + device = 'cuda' if torch.cuda.is_available() else 'cpu' + from calicost.utils_distribution_fitting_gpu import * +except ImportError: + device = 'cpu' + + +""" +Joint NB-BB HMM that accounts for tumor/normal genome proportions. Tumor genome proportion is weighted by mu in BB distribution. +""" + +def betabinom_logpmf_float(x, n, a, b): + """ + Compute beta-binomial logpmf allowing x and n to be float values. + + Attributes + ---------- + x : np.array + Number of successes. Allowing float number. + n : np.array + Number of trials. Allowing float number. + a : np.array + Shape parameter of the beta distribution. Probability of success * tau, where tau is a transformation of the over-dispersion parameter. + b : np.array + Shape parameter of the beta distribution. Probability of failure * tau, where tau is a transformation of the over-dispersion parameter. + """ + tau = a + b + d_choose_y = gammaln(n + 1) - \ + gammaln(x + 1) - \ + gammaln(n - x + 1) + + logbeta_numer = gammaln(x + a) + \ + gammaln(n - x + b) - \ + gammaln(n + tau) + + logbeta_denom = gammaln(a) + \ + gammaln(b) - \ + gammaln(tau) + + return d_choose_y + logbeta_numer - logbeta_denom + + +############################################################ +# whole inference +############################################################ + +class hmm_nophasing_float(object): + def __init__(self, params="stmp", t=1-1e-4): + """ + Attributes + ---------- + params : str + Codes for parameters that need to be updated. The corresponding parameter can only be updated if it is included in this argument. "s" for start probability; "t" for transition probability; "m" for Negative Binomial RDR signal; "p" for Beta Binomial BAF signal. + + t : float + Determine initial self transition probability to be 1-t. + """ + self.params = params + self.t = t + # + @staticmethod + def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus): + """ + Attributes + ---------- + X : array, shape (n_observations, n_components, n_spots) + Observed expression UMI count and allele frequency UMI count. + + base_nb_mean : array, shape (n_observations, n_spots) + Mean expression under diploid state. + + log_mu : array, shape (n_states, n_spots) + Log of read depth change due to CNV. Mean of NB distributions in HMM per state per spot. + + alphas : array, shape (n_states, n_spots) + Over-dispersion of NB distributions in HMM per state per spot. + + total_bb_RD : array, shape (n_observations, n_spots) + SNP-covering reads for both REF and ALT across genes along genome. + + p_binom : array, shape (n_states, n_spots) + BAF due to CNV. Mean of Beta Binomial distribution in HMM per state per spot. 
+ + taus : array, shape (n_states, n_spots) + Over-dispersion of Beta Binomial distribution in HMM per state per spot. + + Returns + ---------- + log_emission : array, shape (n_states, n_obs, n_spots) + Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. + """ + n_obs = X.shape[0] + n_comp = X.shape[1] + n_spots = X.shape[2] + n_states = log_mu.shape[0] + # initialize log_emission + log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) + log_emission_baf = np.zeros((n_states, n_obs, n_spots)) + for i in np.arange(n_states): + for s in np.arange(n_spots): + # expression from NB distribution + idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + if len(idx_nonzero_rdr) > 0: + nb_mean = base_nb_mean[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) + n, p = convert_params(nb_mean, nb_std) + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) + # AF from BetaBinom distribution + idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + if len(idx_nonzero_baf) > 0: + log_emission_baf[i, idx_nonzero_baf, s] = betabinom_logpmf_float(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], p_binom[i, s] * taus[i, s], (1-p_binom[i, s]) * taus[i, s]) + return log_emission_rdr, log_emission_baf + # + @staticmethod + def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs): + """ + Attributes + ---------- + X : array, shape (n_observations, n_components, n_spots) + Observed expression UMI count and allele frequency UMI count. + + base_nb_mean : array, shape (n_observations, n_spots) + Mean expression under diploid state. + + log_mu : array, shape (n_states, n_spots) + Log of read depth change due to CNV. Mean of NB distributions in HMM per state per spot. + + alphas : array, shape (n_states, n_spots) + Over-dispersion of NB distributions in HMM per state per spot. + + total_bb_RD : array, shape (n_observations, n_spots) + SNP-covering reads for both REF and ALT across genes along genome. + + p_binom : array, shape (n_states, n_spots) + BAF due to CNV. Mean of Beta Binomial distribution in HMM per state per spot. + + taus : array, shape (n_states, n_spots) + Over-dispersion of Beta Binomial distribution in HMM per state per spot. + + Returns + ---------- + log_emission : array, shape (n_states, n_obs, n_spots) + Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. 
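# Illustrative sketch (assumes betabinom_logpmf_float above is in scope): for integer x and n
# it should reduce to the standard beta-binomial logpmf, while still accepting fractional
# counts such as clone-weighted pseudobulk values.
import numpy as np
from scipy.stats import betabinom
x = np.array([3.0, 7.0, 0.0])
n = np.array([10.0, 20.0, 5.0])
p, tau = 0.3, 30.0
a, b = p * tau, (1 - p) * tau
assert np.allclose(betabinom_logpmf_float(x, n, a, b),
                   betabinom.logpmf(x.astype(int), n.astype(int), a, b))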
+ """ + n_obs = X.shape[0] + n_comp = X.shape[1] + n_spots = X.shape[2] + n_states = log_mu.shape[0] + # initialize log_emission + log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) + log_emission_baf = np.zeros((n_states, n_obs, n_spots)) + for i in np.arange(n_states): + for s in np.arange(n_spots): + # expression from NB distribution + idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + if len(idx_nonzero_rdr) > 0: + # nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[s]) + nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[idx_nonzero_rdr,s]) + nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) + n, p = convert_params(nb_mean, nb_std) + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) + # AF from BetaBinom distribution + if ("logmu_shift" in kwargs) and ("sample_length" in kwargs): + this_weighted_tp = [] + for c in range(len(kwargs["sample_length"])): + range_s = np.sum(kwargs["sample_length"][:c]) + range_t = np.sum(kwargs["sample_length"][:(c+1)]) + this_weighted_tp.append( tumor_prop[range_s:range_t,s] * np.exp(log_mu[i, s] - kwargs["logmu_shift"][c,s]) / (tumor_prop[range_s:range_t,s] * np.exp(log_mu[i, s] - kwargs["logmu_shift"][c,s]) + 1 - tumor_prop[range_s:range_t,s]) ) + this_weighted_tp = np.concatenate(this_weighted_tp) + else: + this_weighted_tp = tumor_prop[:,s] + idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + if len(idx_nonzero_baf) > 0: + mix_p_A = p_binom[i, s] * this_weighted_tp[idx_nonzero_baf] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf]) + mix_p_B = (1 - p_binom[i, s]) * this_weighted_tp[idx_nonzero_baf] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf]) + log_emission_baf[i, idx_nonzero_baf, s] += betabinom_logpmf_float(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_A * taus[i, s], mix_p_B * taus[i, s]) + return log_emission_rdr, log_emission_baf + # + @staticmethod + @njit + def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): + ''' + Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. + Input + lengths: sum of lengths = n_observations. + log_transmat: n_states * n_states. Transition probability after log transformation. + log_startprob: n_states. Start probability after log transformation. + log_emission: n_states * n_observations * n_spots. Log probability. + Output + log_alpha: size n_states * n_observations. log alpha[j, t] = log P(o_1, ... o_t, q_t = j | lambda). + ''' + n_obs = log_emission.shape[1] + n_states = log_emission.shape[0] + assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" + assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + # initialize log_alpha + log_alpha = np.zeros((log_emission.shape[0], n_obs)) + buf = np.zeros(log_emission.shape[0]) + cumlen = 0 + for le in lengths: + # start prob + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # But adding too many spots may lead to a higher weight of the emission rather then transition prob. 
+ log_alpha[:, cumlen] = log_startprob + np_sum_ax_squeeze(log_emission[:, cumlen, :], axis=1) + for t in np.arange(1, le): + for j in np.arange(log_emission.shape[0]): + for i in np.arange(log_emission.shape[0]): + buf[i] = log_alpha[i, (cumlen + t - 1)] + log_transmat[i, j] + log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum(log_emission[j, (cumlen + t), :]) + cumlen += le + return log_alpha + # + @staticmethod + @njit + def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): + ''' + Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. + Input + X: size n_observations * n_components * n_spots. + lengths: sum of lengths = n_observations. + log_transmat: n_states * n_states. Transition probability after log transformation. + log_startprob: n_states. Start probability after log transformation. + log_emission: n_states * n_observations * n_spots. Log probability. + Output + log_beta: size 2*n_states * n_observations. log beta[i, t] = log P(o_{t+1}, ..., o_T | q_t = i, lambda). + ''' + n_obs = log_emission.shape[1] + n_states = log_emission.shape[0] + assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" + assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + # initialize log_beta + log_beta = np.zeros((log_emission.shape[0], n_obs)) + buf = np.zeros(log_emission.shape[0]) + cumlen = 0 + for le in lengths: + # start prob + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # But adding too many spots may lead to a higher weight of the emission rather then transition prob. + log_beta[:, (cumlen + le - 1)] = 0 + for t in np.arange(le-2, -1, -1): + for i in np.arange(log_emission.shape[0]): + for j in np.arange(log_emission.shape[0]): + buf[j] = log_beta[j, (cumlen + t + 1)] + log_transmat[i, j] + np.sum(log_emission[j, (cumlen + t + 1), :]) + log_beta[i, (cumlen + t)] = mylogsumexp(buf) + cumlen += le + return log_beta + + # + def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat=None, tumor_prop=None, \ + fix_NB_dispersion=False, shared_NB_dispersion=False, fix_BB_dispersion=False, shared_BB_dispersion=False, \ + is_diag=False, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=100, tol=1e-4, **kwargs): + ''' + Input + X: size n_observations * n_components * n_spots. + lengths: sum of lengths = n_observations. + base_nb_mean: size of n_observations * n_spots. + In NB-BetaBinom model, n_components = 2 + Intermediate + log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. + alpha: size of n_states. Dispersioon parameter of each HMM state. 
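# Rough smoke-test sketch of the routine documented above, on synthetic shapes only
# (values are illustrative; assumes the calicost fitting utilities imported at the top of the file are available).
n_obs, n_spots, n_states = 200, 1, 3
X = np.random.poisson(10, size=(n_obs, 2, n_spots)).astype(float)
base_nb_mean = np.full((n_obs, n_spots), 10.0)
total_bb_RD = X[:, 1, :] + np.random.poisson(10, size=(n_obs, n_spots))
hmm = hmm_nophasing_float()
new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma = \
    hmm.run_baum_welch_nb_bb(X, np.array([n_obs]), n_states, base_nb_mean, total_bb_RD, max_iter=5)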
+ ''' + n_obs = X.shape[0] + n_comp = X.shape[1] + n_spots = X.shape[2] + assert n_comp == 2 + # initialize NB logmean shift and BetaBinom prob + log_mu = np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu + p_binom = np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom + # initialize (inverse of) dispersion param in NB and BetaBinom + alphas = 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus + # initialize start probability and emission probability + log_startprob = np.log( np.ones(n_states) / n_states ) + if n_states > 1: + transmat = np.ones((n_states, n_states)) * (1-self.t) / (n_states-1) + np.fill_diagonal(transmat, self.t) + log_transmat = np.log(transmat) + else: + log_transmat = np.zeros((1,1)) + # initialize log_gamma + log_gamma = kwargs["log_gamma"] if "log_gamma" in kwargs else None + # degenerative case of constructing a list of unique (observed count, expected count) values, and mapping count per bin to the index in the unique list + unique_values_nb = [np.vstack([X[:,0,s], base_nb_mean[:,s]]).T for s in range(n_spots)] + mapping_matrices_nb = [scipy.sparse.csr_matrix((np.ones(n_obs), (np.arange(n_obs), np.arange(n_obs)) ))] + unique_values_bb = [np.vstack([X[:,1,s], total_bb_RD[:,s]]).T for s in range(n_spots)] + mapping_matrices_bb = [scipy.sparse.csr_matrix((np.ones(n_obs), (np.arange(n_obs), np.arange(n_obs)) ))] + # EM algorithm + for r in trange(max_iter): + # E step + if tumor_prop is None: + log_emission_rdr, log_emission_baf = hmm_nophasing_float.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission = log_emission_rdr + log_emission_baf + else: + # compute mu as adjusted RDR + if ((not log_gamma is None) or (r > 0)) and ("m" in self.params): + logmu_shift = [] + for c in range(len(kwargs["sample_length"])): + this_pred_cnv = np.argmax(log_gamma[:,np.sum(kwargs["sample_length"][:c]):np.sum(kwargs["sample_length"][:(c+1)])], axis=0)%n_states + logmu_shift.append( scipy.special.logsumexp(log_mu[this_pred_cnv,:] + np.log(kwargs["lambd"]).reshape(-1,1), axis=0) ) + logmu_shift = np.vstack(logmu_shift) + log_emission_rdr, log_emission_baf = hmm_nophasing_float.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, logmu_shift=logmu_shift, sample_length=kwargs["sample_length"]) + else: + log_emission_rdr, log_emission_baf = hmm_nophasing_float.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop) + log_emission = log_emission_rdr + log_emission_baf + log_alpha = hmm_nophasing_float.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_beta = hmm_nophasing_float.backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_gamma = compute_posterior_obs(log_alpha, log_beta) + log_xi = compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, log_emission) + # M step + if "s" in self.params: + new_log_startprob = update_startprob_nophasing(lengths, log_gamma) + new_log_startprob = new_log_startprob.flatten() + else: + new_log_startprob = log_startprob + if "t" in self.params: + new_log_transmat = update_transition_nophasing(log_xi, is_diag=is_diag) + else: + 
new_log_transmat = log_transmat + if "m" in self.params: + if tumor_prop is None: + if device == 'cpu': + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, + fun=fit_weighted_NegativeBinomial_gpu, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + if device == 'cpu': + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ + fun=fit_weighted_NegativeBinomial_mix_gpu, fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu = log_mu + new_alphas = alphas + if "p" in self.params: + if tumor_prop is None: + if device == 'cpu': + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, start_p_binom=p_binom, \ + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, + fun=fit_weighted_BetaBinomial_gpu, start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + # compute mu as adjusted RDR + if ("m" in self.params): + mu = [] + for c in range(len(kwargs["sample_length"])): + this_pred_cnv = np.argmax(log_gamma[:,np.sum(kwargs["sample_length"][:c]):np.sum(kwargs["sample_length"][:(c+1)])], axis=0)%n_states + mu.append( np.exp(new_log_mu[this_pred_cnv,:]) / np.sum(np.exp(new_log_mu[this_pred_cnv,:]) * kwargs["lambd"].reshape(-1,1), axis=0, keepdims=True) ) + mu = np.vstack(mu) + weighted_tp = (tumor_prop * mu) / (tumor_prop * mu + 1 - tumor_prop) + else: + weighted_tp = tumor_prop + if device == 'cpu': + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, weighted_tp, start_p_binom=p_binom, \ + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, weighted_tp, start_p_binom=p_binom, \ + fun=fit_weighted_BetaBinomial_mix_gpu, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom = p_binom + new_taus = taus + # check convergence + print( np.mean(np.abs( np.exp(new_log_startprob) - np.exp(log_startprob) )), \ + np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )), \ + np.mean(np.abs(new_log_mu - log_mu)),\ + np.mean(np.abs(new_p_binom - p_binom)) ) + print( np.hstack([new_log_mu, new_p_binom]) ) + if np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )) < tol and \ + np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol: + break + log_startprob = new_log_startprob + 
log_transmat = new_log_transmat + log_mu = new_log_mu + alphas = new_alphas + p_binom = new_p_binom + taus = new_taus + return new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma + + diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index d5a9145..1763272 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -14,6 +14,14 @@ from calicost.utils_hmm import * import networkx as nx +try: + import torch + device = 'cuda' if torch.cuda.is_available() else 'cpu' + from calicost.utils_distribution_fitting_gpu import * +except ImportError: + device = 'cpu' + + """ Joint NB-BB HMM that accounts for tumor/normal genome proportions. Tumor genome proportion is weighted by mu in BB distribution. """ @@ -294,18 +302,32 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, new_log_transmat = log_transmat if "m" in self.params: if tumor_prop is None: - new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + if device == 'cpu': + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, + fun=fit_weighted_NegativeBinomial_gpu, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) else: - new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + if device == 'cpu': + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ + fun=fit_weighted_NegativeBinomial_mix_gpu, fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) else: new_log_mu = log_mu new_alphas = alphas if "p" in self.params: if tumor_prop is None: - new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + if device == 'cpu': + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, start_p_binom=p_binom, \ + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, + fun=fit_weighted_BetaBinomial_gpu, start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) else: # compute mu as adjusted RDR if ("m" in self.params): @@ 
-317,8 +339,12 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, weighted_tp = (tumor_prop * mu) / (tumor_prop * mu + 1 - tumor_prop) else: weighted_tp = tumor_prop - new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, weighted_tp, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + if device == 'cpu': + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, weighted_tp, start_p_binom=p_binom, \ + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, weighted_tp, start_p_binom=p_binom, \ + fun=fit_weighted_BetaBinomial_mix_gpu, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) else: new_p_binom = p_binom new_taus = taus diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 630651f..23fa956 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -17,6 +17,14 @@ import networkx as nx +try: + import torch + device = 'cuda' if torch.cuda.is_available() else 'cpu' + from calicost.utils_distribution_fitting_gpu import * +except ImportError: + device = 'cpu' + + ############################################################ # whole inference ############################################################ @@ -285,24 +293,40 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, else: new_log_transmat = log_transmat if "m" in self.params: - # new_log_mu, new_alphas = update_emission_params_nb_sitewise(X[:,0,:], log_gamma, base_nb_mean, alphas, start_log_mu=log_mu, \ - # fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) if tumor_prop is None: - new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, alphas, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + if device == 'cpu': + new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, alphas, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, + fun=fit_weighted_NegativeBinomial_gpu, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) else: - new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, alphas, tumor_prop, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + if device == 'cpu': + new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, alphas, tumor_prop, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, 
alphas, tumor_prop, start_log_mu=log_mu, \ + fun=fit_weighted_NegativeBinomial_mix_gpu, fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) else: new_log_mu = log_mu new_alphas = alphas if "p" in self.params: if tumor_prop is None: - new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + if device == 'cpu': + new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, start_p_binom=p_binom, \ + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, + fun=fit_weighted_BetaBinomial_gpu, start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) else: - new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, tumor_prop, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + if device == 'cpu': + new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, tumor_prop, start_p_binom=p_binom, \ + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, tumor_prop, start_p_binom=p_binom, \ + fun=fit_weighted_BetaBinomial_mix_gpu, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) else: new_p_binom = p_binom new_taus = taus @@ -546,10 +570,12 @@ def pipeline_baum_welch(output_prefix, X, lengths, n_states, base_nb_mean, total tmp = np.log10(1 - t) np.savez(f"{output_prefix}_nstates{n_states}_{params}_{tmp:.0f}_seed{random_state}.npz", \ new_log_mu=new_log_mu, new_alphas=new_alphas, new_p_binom=new_p_binom, new_taus=new_taus, \ - new_log_startprob=new_log_startprob, new_log_transmat=new_log_transmat, log_gamma=log_gamma, pred_cnv=pred_cnv, llf=llf) + new_log_startprob=new_log_startprob, new_log_transmat=new_log_transmat, log_gamma=log_gamma, pred_cnv=pred_cnv, llf=llf, + log_emission=log_emission, log_alpha=log_alpha, log_beta=log_beta) else: res = {"new_log_mu":new_log_mu, "new_alphas":new_alphas, "new_p_binom":new_p_binom, "new_taus":new_taus, \ - "new_log_startprob":new_log_startprob, "new_log_transmat":new_log_transmat, "log_gamma":log_gamma, "pred_cnv":pred_cnv, "llf":llf} + "new_log_startprob":new_log_startprob, "new_log_transmat":new_log_transmat, "log_gamma":log_gamma, "pred_cnv":pred_cnv, "llf":llf, + "log_emission":log_emission, "log_alpha":log_alpha, "log_beta":log_beta} return res diff --git a/src/calicost/sample_posterior_clone.py b/src/calicost/sample_posterior_clone.py new file mode 100644 index 0000000..45e069f --- /dev/null +++ b/src/calicost/sample_posterior_clone.py @@ -0,0 +1,725 @@ +import sys +import copy +import numpy as np +import scipy +import pandas as pd +from matplotlib import pyplot as plt +import seaborn + +import graph_tool.all as gt + +from calicost.utils_hmm import * +from calicost.hmm_NB_BB_phaseswitch import * +from calicost.utils_phase_switch 
import * +from calicost.utils_hmm import multinoulli_sampler_high_dimensional + + +def compute_potential(single_llf, labels, spatial_weight, adjacency_mat): + """ + Attributes + ---------- + single_llf : np.array, size (num_spots, num_labels) + The log-likelihood of each pixel being in each label. + labels : np.array, size (num_spots) + The label of each pixel. + spatial_weight : float + The weight of the spatial coherence term. + adjacency_mat : np.array, size (num_spots, num_spots) + The adjacency matrix of the spatial graph. Note this is a symmetric matrix with 0 diagonal. + """ + N = len(labels) + # emission part + neg_potential = single_llf[(np.arange(N), labels)].sum() + # spatial adjacency part + # make sure diagonal is 0 + if not np.all(adjacency_mat.diagonal() == 0): + adjacency_mat.setdiag(0) + # neg_potential += 0.5 * spatial_weight * np.sum(adjacency_mat * (labels[:, None] == labels[None, :])) + idx1, idx2 = adjacency_mat.nonzero() + neg_potential += spatial_weight * np.sum(labels[idx1] == labels[idx2]) + return neg_potential / N + + +def block_gibbs_sampling_labels(emission_llf, log_clonesize, adjacency_mat, block_ids, spatial_weight, num_draws, random_state, temperature, initial_logprob=None): + """ + Attributes + ---------- + emission_llf : np.array, size (num_spots, n_clones) + The log-likelihood of each pixel being in each clone label. + log_clonesize : np.array, size (num_spots, n_clones) + The prior probability of selecting each clone label for each spot. + adjacency_mat : np.array, size (num_spots, num_spots) + The adjacency matrix of the spatial graph. Note this is a symmetric matrix with 0 diagonal. + block_ids : np.array, size (num_spots) + The block id of each spot. Each block is updated together in a block Gibbs sampling step. + spatial_weight : float + The weight of the spatial coherence term. + num_draws : int + The number of draws to draw. + random_state : int + The random seed. + initial_logprob : np.array, size (num_spots, n_clones) + Initial log probability to sample clone labels. 
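# Tiny illustration of compute_potential above (hypothetical numbers): the potential is
# (log-likelihood of the chosen labels + spatial_weight * number of agreeing adjacency entries) / N;
# with a symmetric adjacency matrix each agreeing undirected edge is counted twice.
import numpy as np
import scipy.sparse
single_llf = np.log(np.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]]))
labels = np.array([0, 1, 0])
adj = scipy.sparse.csr_matrix(np.array([[0, 1, 1], [1, 0, 0], [1, 0, 0]]))
pot = compute_potential(single_llf, labels, spatial_weight=2.0, adjacency_mat=adj)
# emission part: log(0.9) + log(0.8) + log(0.6); spatial part: 2.0 * 2 (edge 0-2 counted both ways)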
+ """ + np.random.seed(random_state) + # number of spots, blocks, and clones + N, n_clones = emission_llf.shape + n_blocks = len(np.unique(block_ids)) + + # list of labels of each sampling time point + # with initial label by random sampling + if not initial_logprob is None: + list_labels = [ multinoulli_sampler_high_dimensional(initial_logprob) ] + else: + list_labels = [ multinoulli_sampler_high_dimensional(log_clonesize) ] + list_potentials = [ compute_potential(emission_llf + log_clonesize, list_labels[-1], spatial_weight, adjacency_mat) ] + + # block Gibbs sampling + # prepare sub-adjacency matrix where rows correspond to one block, columns correspond to the other + A_sub_list = [adjacency_mat[block_ids==b, :][:, block_ids!=b] for b in range(n_blocks)] + for iter in range(num_draws): + this_l = np.zeros(N, dtype=int) + for b in range(n_blocks): + # for spots with block_id of b, compute the posterior distribution + post = copy.copy((emission_llf + log_clonesize)[block_ids==b, :]) + for c in range(n_clones): + post[:,c] += spatial_weight * A_sub_list[b] @ (list_labels[-1][block_ids!=b] == c) + + post = post / temperature + post -= scipy.special.logsumexp(post, axis=1, keepdims=True) + this_l[block_ids==b] = multinoulli_sampler_high_dimensional(post) + + list_labels.append(this_l) + # potential of the new state + list_potentials.append( compute_potential(emission_llf + log_clonesize, list_labels[-1], spatial_weight, adjacency_mat) ) + + return list_labels, list_potentials + + +def update_bond(label, adjacency_mat, prob): + """ + Bond configuration of the S-W algorithm. + A bond is created between two adjacent nodes with probability prob (prob=1 - exp(-beta / T)) if the two nodes have the same label. + There is no bond between two nodes with different labels. + + Attributes + ---------- + label : numpy.ndarray + Label of each node. + + adjacency_mat : scipy.sparse.csr_matrix + Adjacency matrix of the graph. + + prob : float + Probability of creating a bond between two adjacent nodes. + + Returns + ------- + bond : scipy.sparse.csr_matrix + Bond matrix of the graph. + + connected_components : np.ndarray + Label for each node to belong to which connected components from the graph with original nodes and sampled bonds. + """ + # adjacency edges from the adjacency matrix + edges = scipy.sparse.triu(adjacency_mat, k=1).nonzero() + # indicator of the same label + same_label = (label[edges[0]] == label[edges[1]]) + edges = (edges[0][same_label], edges[1][same_label]) + # create bonds with probability prob + sample_bond = (np.random.rand(len(edges[0])) < prob) + edges = (edges[0][sample_bond], edges[1][sample_bond]) + # create bond matrix + bond = scipy.sparse.csr_matrix((np.ones(len(edges[0]), dtype=bool), edges), shape=adjacency_mat.shape) + bond = bond + bond.T + + # connected components by graph_tool + es = np.array( bond.nonzero() ).T + g = gt.Graph(adjacency_mat.shape[0]) + g.add_edge_list(es) + comp, hist = gt.label_components(g) + connected_components = comp.a + + return bond, connected_components + + +# def update_label(emission_llf, log_clonesize, connected_components): +# """ +# Label of each connected component is sampled randomly from the posterior distribution, which is proportional to exp(emission_llf + log_clonesize). + +# Attributes +# ---------- +# emission_llf : np.array, size (num_spots, n_clones) +# The log-likelihood of each pixel being in each clone label. 
+# log_clonesize : np.array, size (num_spots, n_clones) +# The prior probability of selecting each clone label for each spot. +# connected_components : np.ndarray +# Index of connected components for each node. Ranges from 0 to n_components - 1. + +# Returns +# ------- +# labels : np.ndarray +# Sampled label of each node. +# """ +# # number of connected components +# n_components = np.max(connected_components) + 1 +# assert n_components == len(np.unique(connected_components)), "Connected components are not consecutive." + +# # number of clone labels +# n_labels = emission_llf.shape[1] + +# # compute posterior distribution of each connected component by taking the average of related nodes +# post_per_node = emission_llf + log_clonesize +# post_per_component = np.zeros((n_components, n_labels)) +# for i in range(n_components): +# post_per_component[i] = np.mean(post_per_node[connected_components == i], axis=0) +# # normalize +# post_per_component -= scipy.special.logsumexp(post_per_component, axis=1, keepdims=True) +# # sample label per connected component +# l_per_component = multinoulli_sampler_high_dimensional(post_per_component) +# # assign label to each node +# labels = l_per_component[connected_components] + +# return labels + + +def update_label(emission_llf, log_clonesize, connected_components, temperature): + """ + Label of each connected component is sampled randomly from the posterior distribution, which is proportional to exp(emission_llf + log_clonesize). + + Attributes + ---------- + emission_llf : np.array, size (num_spots, n_clones) + The log-likelihood of each pixel being in each clone label. + log_clonesize : np.array, size (num_spots, n_clones) + The prior probability of selecting each clone label for each spot. + connected_components : np.ndarray + Index of connected components for each node. Ranges from 0 to n_components - 1. + temperature : float + Temperature parameter to adjust the log emission probability. + + Returns + ------- + labels : np.ndarray + Sampled label of each node. + """ + # number of connected components + n_components = np.max(connected_components) + 1 + assert n_components == len(np.unique(connected_components)), "Connected components are not consecutive." 
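# Sketch of the sparse one-hot trick used below to average per-node posteriors within each
# connected component without a Python loop (hypothetical values):
import numpy as np
import scipy.sparse
cc = np.array([0, 0, 1])                                   # component id per node
post_per_node = np.array([[1.0, 3.0], [3.0, 5.0], [10.0, 0.0]])
onehot = scipy.sparse.csr_matrix((np.ones(3), (np.arange(3), cc)), shape=(3, 2))
post_per_component = onehot.T.dot(post_per_node) / np.asarray(onehot.sum(axis=0)).T
# -> [[2., 4.], [10., 0.]], i.e. the mean of rows 0-1 and of row 2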
+ + # number of clone labels + n_labels = emission_llf.shape[1] + + # compute posterior distribution of each connected component by taking the average of related nodes + post_per_node = emission_llf + log_clonesize + # convert connected_components into one-hot encoding scipy sparse matrix + onehot = scipy.sparse.csr_matrix((np.ones(len(connected_components), dtype=int), (np.arange(len(connected_components)), connected_components)), + shape=(len(connected_components), n_components)) + # take sum of post_per_node for each connected component + post_per_component = onehot.T.dot(post_per_node) + # divided by the number of nodes in each connected component + post_per_component = post_per_component / onehot.sum(axis=0).T.A + # scale by temperature + post_per_component = post_per_component / temperature + # normalize + post_per_component -= scipy.special.logsumexp(post_per_component, axis=1, keepdims=True) + # sample label per connected component + l_per_component = multinoulli_sampler_high_dimensional(post_per_component) + # assign label to each node + labels = l_per_component[connected_components] + + return labels + + +def SW_sampling_labels(emission_llf, log_clonesize, adjacency_mat, block_ids, spatial_weight, num_draws, random_state, temperature, initial_logprob=None): + """ + Sampling num_draws of label configurations using the S-W algorithm (a specific MCMC method). + + Attributes + ---------- + emission_llf : np.array, size (num_spots, n_clones) + The log-likelihood of each pixel being in each clone label. + log_clonesize : np.array, size (num_spots, n_clones) + The prior probability of selecting each clone label for each spot. + adjacency_mat : scipy.sparse.csr_matrix + Adjacency matrix of the graph. + spatial_weight : float + Spatial weight parameter. + num_draws : int + Number of label configurations to draw in this Markov chain. + random_state : int + Random seed. + temperature : float + Temperature parameter. Temperature and spatial_weight are used to determine the bond probability. + initial_logprob : np.array, size (num_spots, n_clones), optional + + Returns + ------- + list_labels : list of np.ndarray + List of sampled label configurations across num_draws iterations. 
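# The Swendsen-Wang bond probability used below is prob = 1 - exp(-spatial_weight / temperature):
# a higher temperature gives fewer bonds, hence smaller connected components and more local updates.
import numpy as np
for T in (4.0, 2.0, 1.0):
    print(T, 1 - np.exp(-1.0 / T))   # with spatial_weight = 1.0 (illustrative)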
+ """ + # set random seed + np.random.seed(random_state) + + # initialization + n_spots, n_clones = emission_llf.shape + if not initial_logprob is None: + list_labels = [ multinoulli_sampler_high_dimensional(initial_logprob) ] + else: + list_labels = [ multinoulli_sampler_high_dimensional(log_clonesize) ] + list_potentials = [ compute_potential(emission_llf + log_clonesize, list_labels[-1], spatial_weight, adjacency_mat) ] + + # sample across num_draws iterations + for iter in range(num_draws): + # update bond + prob = 1 - np.exp(-spatial_weight / temperature) + bond, connected_components = update_bond(list_labels[-1], adjacency_mat, prob) + # update label + list_labels.append( update_label(emission_llf, log_clonesize, connected_components, temperature) ) + # update potential + list_potentials.append( compute_potential(emission_llf + log_clonesize, list_labels[-1], spatial_weight, adjacency_mat) ) + + return list_labels, list_potentials + + +def posterior_distribution_clone_labels(emission_llf, log_clonesize, adjacency_mat, coords, spatial_weight, num_chains, burnin, platform='visium', fun=block_gibbs_sampling_labels, temperature=1.0): + n_spots, n_clones = emission_llf.shape + list_labels = [] + list_potentials = [] + + # if block Gibbs sampling + if platform == 'visium': + block_ids = coords[:,1]%3 + else: + block_ids = (coords[:,0] + coords[:,1]) % 2 + + # number of chains (starting points) + for r in range(num_chains): + this_list_labels, this_list_potential = fun(emission_llf, log_clonesize, adjacency_mat, block_ids, spatial_weight, num_draws=150, random_state=r, temperature=temperature) + list_labels.append(this_list_labels) + list_potentials.append(this_list_potential) + + # stack lists + list_labels = np.stack(list_labels) + list_potentials = np.stack(list_potentials) + + # make a plot of the potential of the first chain + for r in range(num_chains): + plt.plot(list_potentials[r], alpha=0.5) + plt.xlabel('iteration') + plt.ylabel('potential') + plt.show() + + # remove burnin + list_labels = list_labels[:,burnin:, :] + list_potentials = list_potentials[:, burnin:] + + # probability of getting each clone at each spot + clone_prob = [] + for r in range(num_chains): + this_clone_prob = np.array([ np.bincount(l, minlength=n_clones) for l in list_labels[r].T ]) + this_clone_prob = this_clone_prob / list_labels[r].shape[0] + clone_prob.append( this_clone_prob ) + + # aggregate across chains to get the final posterior distribution per spot + agg_clone_prob = np.mean(clone_prob, axis=0) + return agg_clone_prob, list_labels.reshape((-1, n_spots)), list_potentials.flatten() + + +def infer_all(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_label, n_states, + coords, adjacency_mat, tumorprop_threshold, spatial_weight, platform, max_iter_outer, num_chains, burnin, sampling_tol, temperature, + hmmclass, hmm_params, hmm_t, hmm_random_state, hmm_max_iter, hmm_tol, hmm_num_draws, fun=block_gibbs_sampling_labels, + smooth_mat=None, sample_ids=None, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, + fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True): + + n_clones = len(np.unique(initial_clone_label)) + n_obs, n_spots = single_total_bb_RD.shape + + # aggregated counts according to smooth_mat + if not smooth_mat is None: + agg_single_X = np.stack([ single_X[i,:,:] @ smooth_mat for i in range(single_X.shape[0]) ]) + agg_single_base_nb_mean = single_base_nb_mean @ smooth_mat + 
agg_single_total_bb_RD = single_total_bb_RD @ smooth_mat + agg_single_tumor_prop = (single_tumor_prop @ smooth_mat) / np.sum(smooth_mat.A, axis=0) + else: + # aggregated matrices are the same as their original ones + agg_single_X = single_X + agg_single_base_nb_mean = single_base_nb_mean + agg_single_total_bb_RD = single_total_bb_RD + agg_single_tumor_prop = single_tumor_prop + + # make a fake log_sitewise_transmat because some function calls have that parameter for formating purpose, but it's not actually used + log_sitewise_transmat = np.zeros(n_obs) + + # the initial posterior distribution of clone labels is a binary matrix derived from initial_clone_label + posterior_clones = np.zeros((n_spots, n_clones)) + posterior_clones[ (np.arange(n_spots), initial_clone_label) ] = 1 + log_clonesize = np.ones(n_clones) * np.log(1.0 / n_clones) + log_clonesize = np.repeat( log_clonesize[None,:], single_X.shape[2], axis=0 ) + + # initial emission parameters + last_log_mu = init_log_mu + last_alphas = init_alphas + last_p_binom = init_p_binom + last_taus = init_taus + + list_posterior_clones = [posterior_clones] + list_cna_states = [] + list_emission_llf = [] + list_log_mu = [] + list_p_binom = [] + list_elbo = [] + + for r in range(max_iter_outer): + ##### Fit HMM using posterior_clones ##### + # aggregate into pseudobulk for each clone weighted by posterior_clones + X = (single_X[:,:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :]) + base_nb_mean = single_base_nb_mean[:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] + total_bb_RD = single_total_bb_RD[:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] + tumor_prop = single_tumor_prop[single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] / posterior_clones[single_tumor_prop>tumorprop_threshold, :].sum(axis=0) + + # initialize parameters + if (last_log_mu is None and "m" in hmm_params) or (last_p_binom is None and "p" in hmm_params): + tmp_log_mu, tmp_p_binom = initialization_by_gmm(n_states, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ + base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), hmm_params, random_state=hmm_random_state, in_log_space=False, only_minor=False) + + last_log_mu = tmp_log_mu if init_log_mu is None and "m" in hmm_params else None + last_p_binom = tmp_p_binom if init_p_binom is None and "p" in hmm_params else None + + # fit HMM + res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), n_states, \ + base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), np.repeat(tumor_prop, X.shape[0]).reshape(-1,1), \ + hmmclass=hmmclass, params=hmm_params, t=hmm_t, random_state=hmm_random_state, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ + is_diag=True, init_log_mu=last_log_mu, init_p_binom=last_p_binom, init_alphas=last_alphas, init_taus=last_taus, max_iter=hmm_max_iter, tol=hmm_tol) + # estimate log emission probability for each spot belong to each clone based on the fitted HMM res + list_h = FFBS_faster(hmm_num_draws, res['log_alpha'], res['log_gamma'], res['new_log_transmat'], np.tile(lengths, X.shape[2])) + 
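# The clone-weighted pseudobulk step above is a matrix product of per-spot counts with the
# posterior clone-membership matrix; a toy example with hypothetical numbers:
import numpy as np
single_counts = np.array([[5.0, 7.0],      # bin 1, two spots
                          [2.0, 4.0]])     # bin 2
posterior_clones = np.array([[0.9, 0.1],
                             [0.2, 0.8]])
pseudobulk = single_counts @ posterior_clones   # shape (n_bins, n_clones): [[5.9, 6.1], [2.6, 3.4]]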
# emission probability of each spot under the posterior distribution of hidden states of each clone + log_prob_rdr, log_prob_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([agg_single_X[:,0,:].flatten("F"), agg_single_X[:,1,:].flatten("F")]).T.reshape(-1,2,1), + agg_single_base_nb_mean.flatten("F").reshape(-1,1), res['new_log_mu'], res['new_alphas'], + agg_single_total_bb_RD.flatten("F").reshape(-1,1), res['new_p_binom'], res['new_taus'], + np.repeat(agg_single_tumor_prop, agg_single_X.shape[0]).reshape(-1,1)) + log_per_state_emission = log_prob_rdr + log_prob_baf + + emission_llf = np.zeros((agg_single_X.shape[2], n_clones)) + for s in range(agg_single_X.shape[2]): + this_log_emission = log_per_state_emission[:,(s*n_obs):(s*n_obs+n_obs), 0] + for c in range(n_clones): + sampled_h = list_h[:, (c*n_obs):(c*n_obs+n_obs)] + # emission probability + emission_llf[s,c] = this_log_emission[(sampled_h, np.arange(n_obs))].sum() + # take average + emission_llf[s,c] /= hmm_num_draws + + # save results + list_emission_llf.append(emission_llf) + list_log_mu.append(res['new_log_mu']) + list_p_binom.append(res['new_p_binom']) + list_cna_states.append( np.stack([res['pred_cnv'][(i*n_obs):(i*n_obs+n_obs)] for i in range(n_clones)]) ) + + # update last_log_mu, last_p_binom, last_alphas, last_taus + last_log_mu = res['new_log_mu'] + last_p_binom = res['new_p_binom'] + last_alphas = res['new_alphas'] + last_taus = res['new_taus'] + + ##### Infer clone labels using the estimated log emission probability ##### + posterior_clones, list_labels, list_potentials = posterior_distribution_clone_labels(emission_llf, log_clonesize, adjacency_mat, coords, spatial_weight, num_chains, burnin, platform=platform, temperature=temperature, fun=fun) + list_posterior_clones.append(posterior_clones) + + plot_posterior_clones_single(list_posterior_clones, coords, len(list_posterior_clones)-1, sample_ids) + plt.show() + # update log clone size + log_clonesize = np.mean(posterior_clones, axis=0) + log_clonesize = np.repeat( np.log(log_clonesize)[None,:], single_X.shape[2], axis=0 ) + + # update elbo + prior_log_prob = np.zeros(n_clones) + for c in range(n_clones): + sampled_h = list_h[:, (c*n_obs):(c*n_obs+n_obs)] + prior_log_prob[c] = compute_state_prior_prob_highdim(sampled_h, res['new_log_startprob'], res['new_log_transmat'], lengths).mean() + + eventual_llf = [compute_potential(emission_llf + log_clonesize, l, spatial_weight, adjacency_mat) + prior_log_prob.sum() for l in list_labels] + list_elbo.append( np.mean(eventual_llf) ) + + # compare the difference between the current and previous posterior_clones + if np.square(list_posterior_clones[-1] - list_posterior_clones[-2]).mean() < sampling_tol: + break + + ##### temperature annealing ##### + temperature = max(1.0, 0.95 * temperature) + # if r > 2: + # temperature = max(1.0, temperature-2) + + return list_posterior_clones, list_cna_states, list_emission_llf, list_log_mu, list_p_binom, list_elbo + + + +def infer_all_v2(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_label, n_states, + coords, adjacency_mat, tumorprop_threshold, spatial_weight, platform, max_iter_outer, num_chains, burnin, sampling_tol, temperature, + hmmclass, hmm_params, hmm_t, hmm_random_state, hmm_max_iter, hmm_tol, hmm_num_draws, fun=block_gibbs_sampling_labels, + smooth_mat=None, sample_ids=None, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, + fix_NB_dispersion=False, shared_NB_dispersion=True, 
fix_BB_dispersion=False, shared_BB_dispersion=True): + """ + initial_clone_label : one hot encoding of initial clones, or probability of each spot being in each clone. + """ + n_clones = initial_clone_label.shape[1] + n_obs, n_spots = single_total_bb_RD.shape + + # aggregated counts according to smooth_mat + if not smooth_mat is None: + agg_single_X = np.stack([ single_X[i,:,:] @ smooth_mat for i in range(single_X.shape[0]) ]) + agg_single_base_nb_mean = single_base_nb_mean @ smooth_mat + agg_single_total_bb_RD = single_total_bb_RD @ smooth_mat + agg_single_tumor_prop = (single_tumor_prop @ smooth_mat) / np.sum(smooth_mat.A, axis=0) + else: + # aggregated matrices are the same as their original ones + agg_single_X = single_X + agg_single_base_nb_mean = single_base_nb_mean + agg_single_total_bb_RD = single_total_bb_RD + agg_single_tumor_prop = single_tumor_prop + + # make a fake log_sitewise_transmat because some function calls have that parameter for formating purpose, but it's not actually used + log_sitewise_transmat = np.zeros(n_obs) + + # the initial posterior distribution of clone labels is a binary matrix derived from initial_clone_label + posterior_clones = initial_clone_label + log_clonesize = np.ones(n_clones) * np.log(1.0 / n_clones) + log_clonesize = np.repeat( log_clonesize[None,:], single_X.shape[2], axis=0 ) + + # initial emission parameters + last_log_mu = init_log_mu + last_alphas = init_alphas + last_p_binom = init_p_binom + last_taus = init_taus + + list_posterior_clones = [initial_clone_label] + list_cna_states = [] + list_emission_llf = [] + list_log_mu = [] + list_alphas = [] + list_p_binom = [] + list_taus = [] + list_log_startprob = [] + list_elbo = [] + + for r in range(max_iter_outer): + ##### Fit HMM using posterior_clones ##### + # aggregate into pseudobulk for each clone weighted by posterior_clones + X = (single_X[:,:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :]) + base_nb_mean = single_base_nb_mean[:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] + total_bb_RD = single_total_bb_RD[:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] + tumor_prop = single_tumor_prop[single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] / posterior_clones[single_tumor_prop>tumorprop_threshold, :].sum(axis=0) + + # initialize parameters + if (last_log_mu is None and "m" in hmm_params) or (last_p_binom is None and "p" in hmm_params): + tmp_log_mu, tmp_p_binom = initialization_by_gmm(n_states, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ + base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), hmm_params, random_state=hmm_random_state, in_log_space=False, only_minor=False) + + last_log_mu = tmp_log_mu if init_log_mu is None and "m" in hmm_params else None + last_p_binom = tmp_p_binom if init_p_binom is None and "p" in hmm_params else None + + # fit HMM + res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), n_states, \ + base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), np.repeat(tumor_prop, X.shape[0]).reshape(-1,1), \ + hmmclass=hmmclass, params=hmm_params, t=hmm_t, random_state=hmm_random_state, \ + fix_NB_dispersion=fix_NB_dispersion, 
shared_NB_dispersion=shared_NB_dispersion, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ + is_diag=True, init_log_mu=last_log_mu, init_p_binom=last_p_binom, init_alphas=last_alphas, init_taus=last_taus, max_iter=hmm_max_iter, tol=hmm_tol) + # estimate log emission probability for each spot belong to each clone based on the fitted HMM res + list_h = FFBS_faster(hmm_num_draws, res['log_alpha'], res['log_gamma'], res['new_log_transmat'], np.tile(lengths, X.shape[2])) + # emission probability of each spot under the posterior distribution of hidden states of each clone + log_prob_rdr, log_prob_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([agg_single_X[:,0,:].flatten("F"), agg_single_X[:,1,:].flatten("F")]).T.reshape(-1,2,1), + agg_single_base_nb_mean.flatten("F").reshape(-1,1), res['new_log_mu'], res['new_alphas'], + agg_single_total_bb_RD.flatten("F").reshape(-1,1), res['new_p_binom'], res['new_taus'], + np.repeat(agg_single_tumor_prop, agg_single_X.shape[0]).reshape(-1,1)) + log_per_state_emission = log_prob_rdr + log_prob_baf + + emission_llf = np.zeros((agg_single_X.shape[2], n_clones)) + for s in range(agg_single_X.shape[2]): + this_log_emission = log_per_state_emission[:,(s*n_obs):(s*n_obs+n_obs), 0] + for c in range(n_clones): + sampled_h = list_h[:, (c*n_obs):(c*n_obs+n_obs)] + # emission probability + emission_llf[s,c] = this_log_emission[(sampled_h, np.arange(n_obs))].sum() + # take average + emission_llf[s,c] /= hmm_num_draws + + # save results + list_emission_llf.append( emission_llf ) + list_log_mu.append(res['new_log_mu']) + list_alphas.append(res['new_alphas']) + list_p_binom.append(res['new_p_binom']) + list_taus.append(res['new_taus']) + list_log_startprob.append(res['new_log_startprob']) + list_cna_states.append( np.stack([res['pred_cnv'][(i*n_obs):(i*n_obs+n_obs)] for i in range(n_clones)]) ) + + # update last_log_mu, last_p_binom, last_alphas, last_taus + last_log_mu = res['new_log_mu'] + last_p_binom = res['new_p_binom'] + last_alphas = res['new_alphas'] + last_taus = res['new_taus'] + + ##### Infer clone labels using the estimated log emission probability ##### + posterior_clones, list_labels, list_potentials = posterior_distribution_clone_labels(emission_llf, log_clonesize, adjacency_mat, coords, spatial_weight, num_chains, burnin, platform=platform, temperature=temperature, fun=fun) + list_posterior_clones.append(posterior_clones) + + plot_posterior_clones_single(list_posterior_clones, coords, len(list_posterior_clones)-1, sample_ids) + plt.show() + # update log clone size + log_clonesize = np.mean(posterior_clones, axis=0) + log_clonesize = np.repeat( np.log(log_clonesize)[None,:], single_X.shape[2], axis=0 ) + + # update elbo + prior_log_prob = np.zeros(n_clones) + for c in range(n_clones): + sampled_h = list_h[:, (c*n_obs):(c*n_obs+n_obs)] + prior_log_prob[c] = compute_state_prior_prob_highdim(sampled_h, res['new_log_startprob'], res['new_log_transmat'], lengths).mean() + + # eventual_llf = [compute_potential(emission_llf + log_clonesize, l, spatial_weight, adjacency_mat) + prior_log_prob.sum() for l in list_labels] + eventual_llf = [compute_potential(emission_llf + log_clonesize, l, spatial_weight, adjacency_mat) for l in list_labels] + list_elbo.append( np.mean(eventual_llf) ) + + # compare the difference between the current and previous posterior_clones + if np.square(list_posterior_clones[-1] - list_posterior_clones[-2]).mean() < sampling_tol: + break + + ##### temperature annealing ##### + temperature 
= max(1.0, 0.95 * temperature) + # if r > 2: + # temperature = max(1.0, temperature-2) + + return list_posterior_clones, list_cna_states, list_emission_llf, list_log_mu, list_alphas, list_p_binom, list_taus, list_log_startprob, list_elbo, list_h, list_labels + + + +def plot_posterior_clones_single(list_posterior_clones, coords, idx, sample_ids=None): + """ + Scatterplot of the probability of each spot being in each clone for iteration idx. + Attributes + ---------- + list_posterior_clones : list of np.arrays, each one has size (num_spots, num_clones) + The posterior probability of each spot being in each clone at each iteration, output of infer_all. + coords : np.array, size (num_spots, 2) + The spatial coordinates of each spot. + idx : int + The iteration index to plot. + sample_ids : np.array, size (num_spots) + The sample id of each spot. + """ + shifted_coords = copy.copy(coords) + if sample_ids is not None: + offset = 0 + for s in np.sort(np.unique(sample_ids)): + min_x = np.min(shifted_coords[sample_ids==s, 0]) + shifted_coords[sample_ids==s, 0] = shifted_coords[sample_ids==s, 0] - min_x + offset + offset = np.max(shifted_coords[sample_ids==s, 0]) + 1 + + n_clones = list_posterior_clones[idx].shape[1] + fig, axes = plt.subplots(1, n_clones, figsize=(n_clones * 5*len(np.unique(sample_ids)), 5), facecolor='white', dpi=150) + for c in range(list_posterior_clones[idx].shape[1]): + axes[c].scatter(x=shifted_coords[:,0], y=-shifted_coords[:,1], c=list_posterior_clones[idx][:,c], cmap='magma_r', s=15, linewidth=0) + axes[c].set_title('Clone %d' % c) + norm = plt.Normalize(list_posterior_clones[idx][:,c].min(), list_posterior_clones[idx][:,c].max()) + axes[c].figure.colorbar( plt.cm.ScalarMappable(cmap='magma_r', norm=norm), ax=axes[c] ) + axes[c].axis('off') + fig.tight_layout() + return fig + + +def plot_posterior_clones_interactive(list_posterior_clones, coords, giffile, sample_ids=None, base_duration=500): + import io + import imageio + + shifted_coords = copy.copy(coords) + if sample_ids is not None: + offset = 0 + for s in np.sort(np.unique(sample_ids)): + min_x = np.min(shifted_coords[sample_ids==s, 0]) + shifted_coords[sample_ids==s, 0] = shifted_coords[sample_ids==s, 0] - min_x + offset + offset = np.max(shifted_coords[sample_ids==s, 0]) + 1 + + n_clones = list_posterior_clones[0].shape[1] + # List to store images and durations + images = [] + # durations = base_duration * np.arange(len(list_posterior_clones)) # Different durations for each frame + + # Generate scatter plots and store them in memory + for idx in range(len(list_posterior_clones)): + fig, axes = plt.subplots(1, n_clones, figsize=(n_clones * 5*len(np.unique(sample_ids)), 5), dpi=150) + for c in range(list_posterior_clones[idx].shape[1]): + axes[c].scatter(x=shifted_coords[:,0], y=-shifted_coords[:,1], c=list_posterior_clones[idx][:,c], cmap='magma_r', s=15, linewidth=0) + axes[c].set_title('Clone %d' % c) + norm = plt.Normalize(list_posterior_clones[idx][:,c].min(), list_posterior_clones[idx][:,c].max()) + axes[c].figure.colorbar( plt.cm.ScalarMappable(cmap='magma_r', norm=norm), ax=axes[c] ) + axes[c].axis('off') + fig.suptitle(f"iteration {idx}") + fig.tight_layout() + buf = io.BytesIO() + plt.savefig(buf, format='png') + buf.seek(0) + images.append(imageio.imread(buf)) + buf.close() + plt.close() + + # Create a GIF from the in-memory images with a fixed per-frame duration + imageio.mimsave(giffile, images, duration=base_duration) + + +# def plot_posterior_clones_interactive(list_posterior_clones, coords, 
sample_ids=None): +# import mpl_interactions.ipyplot as iplt + +# def c_func(x, y, ite): +# return list_posterior_clones[ite][:,c] + +# shifted_coords = copy.copy(coords) +# if not sample_ids is None: +# offset = 0 +# for s in np.sort(np.unique(sample_ids)): +# min_x = np.min(shifted_coords[sample_ids==s, 0]) +# shifted_coords[sample_ids==s, 0] = shifted_coords[sample_ids==s, 0] - min_x + offset +# offset = np.max(shifted_coords[sample_ids==s, 0]) + 1 + +# # make a gif of the posterior probability of each clone in multiple figure panels +# n_clones = list_posterior_clones[0].shape[1] +# fig, axes = plt.subplots(1, n_clones, figsize=(n_clones * 5*len(np.unique(sample_ids)), 5), facecolor='white', dpi=150) +# for c in range(list_posterior_clones[0].shape[1]): +# _ = iplt.scatter(x=shifted_coords[:,0], y=-shifted_coords[:,0], ite=np.arange(len(list_posterior_clones)), c=c_func, cmap='magma_r', s=15, linewidth=0, ax=axes[c]) +# fig.tight_layout() +# return fig + + +def plot_posterior_baf_cnstate_single(single_X, single_total_bb_RD, single_tumor_prop, lengths, tumorprop_threshold, list_posterior_clones, list_cna_states, list_p_binom, idx, unique_chrs): + """ + Scatterplot BAF along the genome colored by the hidden states at iteration idx. + """ + posterior_clones = list_posterior_clones[idx] + X = (single_X[:,:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :]) + total_bb_RD = single_total_bb_RD[:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] + + n_clones = list_posterior_clones[idx].shape[1] + n_obs = single_X.shape[0] + + fig, axes = plt.subplots(n_clones, 1, figsize=(20,1.8*n_clones), sharex=True, sharey=True, dpi=150, facecolor="white") + for s in np.arange(n_clones): + cid = s + this_pred = list_cna_states[idx][s,:] + segments, labs = get_intervals(this_pred) + seaborn.scatterplot(x=np.arange(X[:,1,cid].shape[0]), y=X[:,1,cid]/total_bb_RD[:,cid], \ + hue=pd.Categorical(this_pred, categories=np.arange(20), ordered=True), palette="tab10", s=15, linewidth=0, alpha=0.8, legend=False, ax=axes[s]) + axes[s].set_ylabel(f"clone {cid}\nphased AF") + axes[s].set_ylim([-0.1, 1.1]) + axes[s].set_yticks([0, 0.5, 1]) + axes[s].set_xticks([]) + for i, seg in enumerate(segments): + axes[s].plot(seg, [list_p_binom[idx][labs[i],0], list_p_binom[idx][labs[i],0]], c="black", linewidth=3) + axes[s].plot(seg, [1-list_p_binom[idx][labs[i],0], 1-list_p_binom[idx][labs[i],0]], c="black", linewidth=3) + + for i in range(len(lengths)): + median_len = (np.sum(lengths[:(i)]) + np.sum(lengths[:(i+1)])) / 2 + axes[-1].text(median_len, -0.21, unique_chrs[i]) + for k in range(n_clones): + axes[k].axvline(x=np.sum(lengths[:(i)]), c="grey", linewidth=1) + + fig.tight_layout() + fig.subplots_adjust(hspace=0) + return fig \ No newline at end of file diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 7e3cbbb..aa8e2c1 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -78,6 +78,16 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): **kwds) +def fit_weighted_NegativeBinomial(y, features, weights, exposure, start_log_mu=None, start_alpha=None): + model = Weighted_NegativeBinomial(y, features, weights=weights, exposure=exposure) + if start_log_mu is None: + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + else: + res = model.fit(disp=0, maxiter=1500, 
start_params=np.append(start_log_mu, start_alpha), xtol=1e-4, ftol=1e-4) + n_llf = model.nloglikeobs(res.params) + return res.params, n_llf + + class Weighted_NegativeBinomial_mix(GenericLikelihoodModel): def __init__(self, endog, exog, weights, exposure, tumor_prop, seed=0, **kwds): super(Weighted_NegativeBinomial_mix, self).__init__(endog, exog, **kwds) @@ -106,6 +116,16 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): **kwds) +def fit_weighted_NegativeBinomial_mix(y, features, weights, exposure, tumor_prop, start_log_mu=None, start_alpha=None): + model = Weighted_NegativeBinomial_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tumor_prop) + if start_log_mu is None: + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + else: + res = model.fit(disp=0, maxiter=1500, start_params=np.append(start_log_mu, start_alpha), xtol=1e-4, ftol=1e-4) + n_llf = model.nloglikeobs(res.params) + return res.params, n_llf + + class Weighted_BetaBinom(GenericLikelihoodModel): """ Beta-binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. @@ -149,6 +169,16 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): **kwds) +def fit_weighted_BetaBinomial(y, features, weights, exposure, start_p_binom=None, start_tau=None): + model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) + if start_p_binom is None: + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + else: + res = model.fit(disp=0, maxiter=1500, start_params=np.append(start_p_binom, start_tau), xtol=1e-4, ftol=1e-4) + n_llf = model.nloglikeobs(res.params) + return res.params, n_llf + + class Weighted_BetaBinom_mix(GenericLikelihoodModel): def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds): super(Weighted_BetaBinom_mix, self).__init__(endog, exog, **kwds) @@ -175,6 +205,16 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): **kwds) +def fit_weighted_BetaBinomial_mix(y, features, weights, exposure, tumor_prop, start_p_binom=None, start_tau=None): + model = Weighted_BetaBinom_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tumor_prop) + if start_p_binom is None: + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + else: + res = model.fit(disp=0, maxiter=1500, start_params=np.append(start_p_binom, start_tau), xtol=1e-4, ftol=1e-4) + n_llf = model.nloglikeobs(res.params) + return res.params, n_llf + + class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): def __init__(self, endog, exog, tau, weights, exposure, **kwds): super(Weighted_BetaBinom_fixdispersion, self).__init__(endog, exog, **kwds) diff --git a/src/calicost/utils_distribution_fitting_gpu.py b/src/calicost/utils_distribution_fitting_gpu.py new file mode 100644 index 0000000..fbb5544 --- /dev/null +++ b/src/calicost/utils_distribution_fitting_gpu.py @@ -0,0 +1,218 @@ +import numpy as np +import scipy + +try: + import torch + import torch.nn as nn + import torch.nn.functional as F + device = 'cuda' if torch.cuda.is_available() else 'cpu' +except ImportError: + device = 'cpu' + + +def nb_llf_allow_float(nb_mean, alphas, y): + """ + Compute the log likelihood of negative binomial distribution allow float values in y. + + Attributes: + nb_mean: torch.tensor. the mean of negative binomial distribution. + alphas: torch.tensor. the inverse-dispersion parameter of negative binomial distribution. variance = mean + alphas * mean^2. + y: torch.tensor. the observed values. 
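    Examples
    --------
    A minimal toy sanity check (the numbers below are made up; assumes ``torch``
    and ``scipy`` are importable). For integer ``y`` the formula reduces to the
    standard negative binomial log-pmf under the conversion ``n = 1/alphas`` and
    ``p = 1/(1 + nb_mean * alphas)``:

    >>> import torch
    >>> from scipy.stats import nbinom
    >>> nb_mean = torch.tensor([2.0]); alphas = torch.tensor([0.5]); y = torch.tensor([3.0])
    >>> ours = nb_llf_allow_float(nb_mean, alphas, y)
    >>> ref = nbinom.logpmf(3, n=1.0 / 0.5, p=1.0 / (1.0 + 2.0 * 0.5))
    >>> bool(torch.isclose(ours, torch.tensor(ref, dtype=torch.float32)))
    True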
+ """ + llf = (y + 1.0/alphas).lgamma() - (1.0/alphas).lgamma() - (y+1).lgamma() - 1.0/alphas*torch.log(1 + nb_mean*alphas) - y * torch.log(1 + nb_mean*alphas) + y * torch.log(alphas) + y * torch.log(nb_mean) + return llf + + +def fit_weighted_NegativeBinomial_gpu(y, features, weights, exposure, start_log_mu=None, start_alpha=None, n_epochs=1000): + # convert to torch + y = torch.from_numpy(y).to(torch.float32).to(device) + features = torch.from_numpy(features).to(torch.float32).to(device) + weights = torch.from_numpy(weights).to(torch.float32).to(device) + exposure = torch.from_numpy(exposure).to(torch.float32).to(device) + + # train + if start_log_mu is None: + log_mu = nn.Parameter(torch.zeros(features.shape[1], device=device), requires_grad=True) + else: + log_mu = nn.Parameter(torch.from_numpy(start_log_mu.flatten()).to(torch.float32).to(device), requires_grad=True) + if start_alpha is None: + log_disp = nn.Parameter(torch.log(0.1 * torch.ones(1, device=device)), requires_grad=True) + else: + log_disp = nn.Parameter(torch.log(start_alpha * torch.ones(1, device=device)), requires_grad=True) + + loss_list = [] + optimizer = torch.optim.AdamW( [log_mu, log_disp], lr=0.1) + scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[350,700], gamma=0.1) + for epoch in range(int(n_epochs / 20)): + small_loss = [] + for small_epoch in range(20): + optimizer.zero_grad() + # negative binomial llf + nb_mean = torch.exp(features @ log_mu) * exposure + nb_var = nb_mean + nb_mean**2 * torch.exp(log_disp) + # convert parameters + p = 1.0 / (1.0 + nb_mean * torch.exp(log_disp)) + n = 1.0 / torch.exp(log_disp) + llf = torch.distributions.negative_binomial.NegativeBinomial(n, 1-p).log_prob(y) + loss = -torch.matmul(llf, weights) + small_loss.append( loss.item() ) + loss.backward() + optimizer.step() + scheduler.step() + loss_list.append( np.mean(small_loss) ) + # decide to terminate + if len(loss_list) > 2 and np.abs(loss_list[-1] - np.min(loss_list[:-1])) < 1e-6 * len(y): + break + + res = log_mu.detach().cpu().numpy().reshape(-1,1) + res = np.append(res, torch.exp(log_disp).detach().cpu().numpy()[0]) + return res, loss_list[-1] + + +def fit_weighted_NegativeBinomial_mix_gpu(y, features, weights, exposure, tumor_prop, start_log_mu=None, start_alpha=None, n_epochs=1000): + # convert to torch + y = torch.from_numpy(y).to(torch.float32).to(device) + features = torch.from_numpy(features).to(torch.float32).to(device) + weights = torch.from_numpy(weights).to(torch.float32).to(device) + exposure = torch.from_numpy(exposure).to(torch.float32).to(device) + tumor_prop = torch.from_numpy(tumor_prop).to(torch.float32).to(device) + + # train + if start_log_mu is None: + log_mu = nn.Parameter(torch.zeros(features.shape[1], device=device), requires_grad=True) + else: + log_mu = nn.Parameter(torch.from_numpy(start_log_mu.flatten()).to(torch.float32).to(device), requires_grad=True) + if start_alpha is None: + log_disp = nn.Parameter(torch.log(0.1 * torch.ones(1, device=device)), requires_grad=True) + else: + log_disp = nn.Parameter(torch.log(start_alpha * torch.ones(1, device=device)), requires_grad=True) + + loss_list = [] + optimizer = torch.optim.AdamW( [log_mu, log_disp], lr=0.1) + scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[350,700], gamma=0.1) + for epoch in range(int(n_epochs / 20)): + small_loss = [] + for small_epoch in range(20): + optimizer.zero_grad() + # negative binomial llf + nb_mean = exposure * (torch.exp(features @ log_mu) * tumor_prop + (1 - tumor_prop)) + # 
nb_var = nb_mean + nb_mean**2 * torch.exp(log_disp) + # convert parameters + p = 1.0 / (1.0 + nb_mean * torch.exp(log_disp)) + n = 1.0 / torch.exp(log_disp) + llf = torch.distributions.negative_binomial.NegativeBinomial(n, 1-p).log_prob(y) + loss = -torch.matmul(llf, weights) + small_loss.append( loss.item() ) + loss.backward() + optimizer.step() + scheduler.step() + loss_list.append( np.mean(small_loss) ) + # decide to terminate + if len(loss_list) > 2 and np.abs(loss_list[-1] - np.min(loss_list[:-1])) < 1e-6 * len(y): + break + + res = log_mu.detach().cpu().numpy().reshape(-1,1) + res = np.append(res, torch.exp(log_disp).detach().cpu().numpy()[0]) + return res, loss_list[-1] + + + +def fit_weighted_BetaBinomial_gpu(y, features, weights, exposure, start_p_binom=None, start_tau=None, min_binom_prob=0.01, max_binom_prob=0.99, + n_epochs=1000, MIN_TAUS = np.log(5), MAX_TAUS = np.log(1000)): + y = torch.from_numpy(y).to(torch.float32).to(device) + features = torch.from_numpy(features).to(torch.float32).to(device) + weights = torch.from_numpy(weights).to(torch.float32).to(device) + exposure = torch.from_numpy(exposure).to(torch.float32).to(device) + + # initialize training parameters + if start_p_binom is None: + logistic_p = nn.Parameter(torch.logit(0.3 * torch.ones(features.shape[1], device=device)), requires_grad=True) + else: + logistic_p = nn.Parameter(torch.logit(torch.from_numpy(start_p_binom.flatten()).to(torch.float32).to(device)), requires_grad=True) + if start_tau is None: + log_taus = nn.Parameter(torch.log(20 * torch.ones(1, device=device)), requires_grad=True) + else: + log_taus = nn.Parameter(torch.log(start_tau * torch.ones(1, device=device)), requires_grad=True) + + MIN_LOGISTIC_P = scipy.special.logit(min_binom_prob) + MAX_LOGISTIC_P = scipy.special.logit(max_binom_prob) + loss_list = [] + optimizer = torch.optim.AdamW( [logistic_p, log_taus], lr=5e-2) + scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[350,700], gamma=0.1) + for epoch in range(int(n_epochs / 20)): + small_loss = [] + for small_epoch in range(20): + optimizer.zero_grad() + # beta binomial llf + p = F.sigmoid(logistic_p) + d_choose_y = (exposure + 1).lgamma() - (y + 1).lgamma() - ((exposure - y) + 1).lgamma() + logbeta_numer = (y + (features @ p) * torch.exp(log_taus)).lgamma() + (exposure - y + (1 - features @ p) * torch.exp(log_taus)).lgamma() - (exposure + torch.exp(log_taus)).lgamma() + logbeta_denom = ((features @ p) * torch.exp(log_taus)).lgamma() + ((1 - features @ p) * torch.exp(log_taus)).lgamma() - torch.exp(log_taus).lgamma() + llf = d_choose_y + logbeta_numer - logbeta_denom + loss = -torch.matmul(llf, weights) + small_loss.append( loss.item() ) + loss.backward() + log_taus.data = torch.clamp(log_taus.data, min=MIN_TAUS, max=MAX_TAUS) + logistic_p.data = torch.clamp(logistic_p.data, min=MIN_LOGISTIC_P, max=MAX_LOGISTIC_P) + optimizer.step() + scheduler.step() + loss_list.append( np.mean(small_loss) ) + if len(loss_list) > 2 and np.abs(loss_list[-1] - np.min(loss_list[:-1])) < 1e-6 * len(y): + break + + res = F.sigmoid(logistic_p).detach().cpu().numpy().reshape(-1,1) + res = np.append(res, torch.exp(log_taus).detach().cpu().numpy()[0]) + return res, loss_list[-1] + + +def fit_weighted_BetaBinomial_mix_gpu(y, features, weights, exposure, tumor_prop, start_p_binom=None, start_tau=None, min_binom_prob=0.01, max_binom_prob=0.99, + n_epochs=1000, MIN_TAUS = np.log(5), MAX_TAUS = np.log(1000)): + y = torch.from_numpy(y).to(torch.float32).to(device) + features = 
torch.from_numpy(features).to(torch.float32).to(device) + weights = torch.from_numpy(weights).to(torch.float32).to(device) + exposure = torch.from_numpy(exposure).to(torch.float32).to(device) + tumor_prop = torch.from_numpy(tumor_prop).to(torch.float32).to(device) + + # initialize training parameters + if start_p_binom is None: + logistic_p = nn.Parameter(torch.logit(0.3 * torch.ones(features.shape[1], device=device)), requires_grad=True) + else: + logistic_p = nn.Parameter(torch.logit(torch.from_numpy(start_p_binom.flatten()).to(torch.float32).to(device)), requires_grad=True) + if start_tau is None: + log_taus = nn.Parameter(torch.log(20 * torch.ones(1, device=device)), requires_grad=True) + else: + log_taus = nn.Parameter(torch.log(start_tau * torch.ones(1, device=device)), requires_grad=True) + + MIN_LOGISTIC_P = scipy.special.logit(min_binom_prob) + MAX_LOGISTIC_P = scipy.special.logit(max_binom_prob) + loss_list = [] + optimizer = torch.optim.AdamW( [logistic_p, log_taus], lr=5e-2) + scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[350,700], gamma=0.1) + for epoch in range(int(n_epochs / 20)): + small_loss = [] + for small_epoch in range(20): + optimizer.zero_grad() + # beta binomial llf + # pure-tumor BAF + p_tmp = F.sigmoid(logistic_p) + # shift BAF to 0.5 by weighted-tumor proportion for each genomic segment (weighted by total copy) + p = ((features @ p_tmp) * tumor_prop + 0.5 * (1 - tumor_prop)) # a vector of length num_genomic_segments + d_choose_y = (exposure + 1).lgamma() - (y + 1).lgamma() - ((exposure - y) + 1).lgamma() + logbeta_numer = (y + p * torch.exp(log_taus)).lgamma() + (exposure - y + (1 - p) * torch.exp(log_taus)).lgamma() - (exposure + torch.exp(log_taus)).lgamma() + logbeta_denom = (p * torch.exp(log_taus)).lgamma() + ((1 - p) * torch.exp(log_taus)).lgamma() - torch.exp(log_taus).lgamma() + llf = d_choose_y + logbeta_numer - logbeta_denom + # weight by weights, and compute gradient + loss = -torch.matmul(llf, weights) + small_loss.append( loss.item() ) + loss.backward() + log_taus.data = torch.clamp(log_taus.data, min=MIN_TAUS, max=MAX_TAUS) + logistic_p.data = torch.clamp(logistic_p.data, min=MIN_LOGISTIC_P, max=MAX_LOGISTIC_P) + optimizer.step() + scheduler.step() + loss_list.append( np.mean(small_loss) ) + if len(loss_list) > 2 and np.abs(loss_list[-1] - np.min(loss_list[:-1])) < 1e-6 * len(y): + break + + res = F.sigmoid(logistic_p).detach().cpu().numpy().reshape(-1,1) + res = np.append(res, torch.exp(log_taus).detach().cpu().numpy()[0]) + return res, loss_list[-1] \ No newline at end of file diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 9c22aaf..2a243d3 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -7,6 +7,141 @@ from calicost.utils_distribution_fitting import * +############################################################# +# Sampling hidden state sequences from posterior distribution +############################################################# + +def multinoulli_sampler(log_probs): + # assume log_probs is already normalized to have logsumexp = 0 + cdf = np.cumsum(np.exp(log_probs)) + cdf[-1] = 1.1 # to avoid numerical issue + u = np.random.uniform() + return np.argmax(cdf > u) + + +def FFBS(num_sequences, log_alpha, log_gamma, log_transmat, lengths): + """ + Attributes + ---------- + num_sequences: number of hidden sequences to draw. + log_alpha: n_states * n_observations per chromosome. Log of forward DP table. alpha[i,t] = P(o_1, ..., o_t, q_t=i | lambda). 
+ log_gamma: n_states * n_observations per chromosome. Log of posterior probability. gamma[i,t] = P(q_t=i | o_1, ..., o_T, lambda + log_transmat: n_states * n_states. Transition probability after log transformation. + log_startprob: n_states. Start probability after log transformation. + lengths: sum of lengths = n_observations. Each length defines a independent HMM sequence + + Returns + ------- + list_h: list of hidden state sequences. The length of list is num_sequences. + """ + def pre_compute_S(log_alpha, log_transmat, lengths): + n_states = log_alpha.shape[0] + n_obs = log_alpha.shape[1] + log_S = np.zeros((n_obs, n_states, n_states)) + cumlen = 0 + for le in lengths: + for t in np.arange(le): + for i in np.arange(n_states): + for j in np.arange(n_states): + if t == le - 1: + log_S[cumlen+t,i,j] = log_gamma[i,cumlen+t] + else: + log_S[cumlen+t,i,j] = log_alpha[i,cumlen+t] + log_transmat[i,j] + cumlen += le + # normalize + log_S = log_S - scipy.special.logsumexp(log_S, axis=1, keepdims=True) + return log_S + # + log_S = pre_compute_S(log_alpha, log_transmat, lengths) + # sample hidden state sequences + list_h = [] + for i in trange(num_sequences): + this_h = [] + for t in np.arange(np.sum(lengths)-1, -1, -1): + if t == np.sum(lengths)-1: + this_h.append( multinoulli_sampler(log_S[t, :, 0]) ) + else: + this_h.append( multinoulli_sampler(log_S[t, :, this_h[-1] ]) ) + list_h.append( np.array(this_h[::-1]) ) + return list_h + + +def multinoulli_sampler_high_dimensional(log_probs): + """ + log_probs: n_obs * n_states. Log probability. Assume log_probs[i,:] is normalized to have logsumexp = 0 + """ + cdf = np.cumsum(np.exp(log_probs), axis=1) + cdf[:, -1] = 1.1 # to avoid numerical issue + u = np.random.uniform(size=log_probs.shape[0]) + return np.sum(cdf <= u.reshape(-1,1), axis=1) + + +def FFBS_faster(num_sequences, log_alpha, log_gamma, log_transmat, lengths): + """ + Attributes + ---------- + num_sequences: number of hidden sequences to draw. + log_alpha: n_states * n_observations per chromosome. Log of forward DP table. alpha[i,t] = P(o_1, ..., o_t, q_t=i | lambda). + log_gamma: n_states * n_observations per chromosome. Log of posterior probability. gamma[i,t] = P(q_t=i | o_1, ..., o_T, lambda + log_transmat: n_states * n_states. Transition probability after log transformation. + log_startprob: n_states. Start probability after log transformation. + lengths: sum of lengths = n_observations. Each length defines a independent HMM sequence + + Returns + ------- + list_h: list of hidden state sequences. The length of list is num_sequences. 
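    Examples
    --------
    Illustrative toy usage for a single chromosome with 2 states; the forward and
    posterior tables below are arbitrary made-up values, so only the array shapes
    matter for this sketch. In the pipeline the inputs come from a fitted HMM, e.g.
    FFBS_faster(hmm_num_draws, res['log_alpha'], res['log_gamma'],
    res['new_log_transmat'], lengths).

    >>> import numpy as np
    >>> n_states, n_obs = 2, 5
    >>> rng = np.random.default_rng(0)
    >>> log_alpha = np.log(rng.dirichlet(np.ones(n_states), size=n_obs).T)
    >>> log_gamma = np.log(rng.dirichlet(np.ones(n_states), size=n_obs).T)
    >>> log_transmat = np.log(np.array([[0.9, 0.1], [0.1, 0.9]]))
    >>> h = FFBS_faster(100, log_alpha, log_gamma, log_transmat, np.array([n_obs]))
    >>> h.shape  # one sampled hidden-state sequence per row
    (100, 5)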
+ """ + def pre_compute_S(log_alpha, log_transmat, lengths): + n_states = log_alpha.shape[0] + n_obs = log_alpha.shape[1] + log_S = np.zeros((n_obs, n_states, n_states)) + cumlen = 0 + for le in lengths: + for t in np.arange(le): + for i in np.arange(n_states): + for j in np.arange(n_states): + if t == le - 1: + log_S[cumlen+t,i,j] = log_gamma[i,cumlen+t] + else: + log_S[cumlen+t,i,j] = log_alpha[i,cumlen+t] + log_transmat[i,j] + cumlen += le + # normalize + log_S = log_S - scipy.special.logsumexp(log_S, axis=1, keepdims=True) + return log_S + # + log_S = pre_compute_S(log_alpha, log_transmat, lengths) + # sample hidden state sequences + list_h = [] + for t in np.arange(np.sum(lengths)-1, -1, -1): + if t == np.sum(lengths)-1: + list_h.append( multinoulli_sampler_high_dimensional(log_S[t, :, np.zeros(num_sequences,dtype=int)]) ) + else: + list_h.append( multinoulli_sampler_high_dimensional(log_S[t, :, list_h[-1] ]) ) + list_h = np.array(list_h).T + # reverse along the second dimension + list_h = list_h[:,::-1] + return list_h + + +def compute_state_prior_prob_highdim(h, log_startprob, log_transmat, lengths): + """ + Attributes + ---------- + h: hidden state sequence of size n_sequences * n_obs. + log_startprob: n_states. Start probability after log transformation. + log_transmat: n_states * n_states. Transition probability after log transformation. + lengths: sum of lengths = n_observations. Each length defines a independent HMM sequence. + """ + num_sequences = h.shape[0] + logprob = np.zeros(num_sequences) + cumlen = 0 + for le in lengths: + logprob += log_startprob[h[:, cumlen]] + logprob += log_transmat[ (h[:, cumlen:(cumlen+le-1)], h[:, (cumlen+1):(cumlen+le)]) ].sum(axis=1) + cumlen += le + return logprob + + @njit def np_max_ax_squeeze(arr, axis=0): assert arr.ndim == 2 @@ -144,7 +279,7 @@ def construct_unique_matrix(obs_count, total_count): if total_count.dtype == int: pairs = np.unique( np.vstack([obs_count[:,s], total_count[:,s]]).T, axis=0 ) else: - pairs = np.unique( np.vstack([obs_count[:,s], total_count[:,s]]).T.round(decimals=4), axis=0 ) + pairs = np.unique( np.vstack([obs_count[:,s].round(decimals=4), total_count[:,s]]).T.round(decimals=4), axis=0 ) unique_values.append( pairs ) pair_index = {(pairs[i,0], pairs[i,1]):i for i in range(pairs.shape[0])} # construct mapping matrix @@ -154,7 +289,7 @@ def construct_unique_matrix(obs_count, total_count): if total_count.dtype == int: tmpidx = pair_index[(obs_count[i,s], total_count[i,s])] else: - tmpidx = pair_index[(obs_count[i,s], total_count[i,s].round(decimals=4))] + tmpidx = pair_index[(obs_count[i,s].round(decimals=4), total_count[i,s].round(decimals=4))] mat_col[i] = tmpidx mapping_matrices.append( scipy.sparse.csr_matrix((np.ones(len(mat_row)), (mat_row, mat_col) )) ) return unique_values, mapping_matrices @@ -370,7 +505,7 @@ def update_transition_sitewise(log_xi, is_diag=False): def update_emission_params_nb_sitewise_uniqvalues(unique_values, mapping_matrices, log_gamma, base_nb_mean, alphas, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2, min_estep_weight=0.1): + fun=fit_weighted_NegativeBinomial, start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2, min_estep_weight=0.1): """ Attributes ---------- @@ -410,18 +545,21 @@ def update_emission_params_nb_sitewise_uniqvalues(unique_values, mapping_matrice tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] 
for i in range(n_states): - model = Weighted_NegativeBinomial(unique_values[s][idx_nonzero,0], \ + res, n_llf = fun(unique_values[s][idx_nonzero,0], \ np.ones(len(idx_nonzero)).reshape(-1,1), \ weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1], \ - penalty=0) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] - new_alphas[i, s] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + exposure=unique_values[s][idx_nonzero,1], + start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + new_log_mu[i, s] = res[0] + new_alphas[i, s] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(unique_values[s][idx_nonzero,0], \ + # np.ones(len(idx_nonzero)).reshape(-1,1), \ + # weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero], \ + # exposure=unique_values[s][idx_nonzero,1], \ + # start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + # new_log_mu[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_alphas[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -449,30 +587,33 @@ def update_emission_params_nb_sitewise_uniqvalues(unique_values, mapping_matrice y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - model = Weighted_NegativeBinomial(y, features, weights=weights, exposure=exposure) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, + start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_alpha=alphas[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_log_mu[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_alphas[:,:] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, + # start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # start_alpha=alphas[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) 
) + # new_log_mu[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_alphas[:,:] = res2[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas def update_emission_params_nb_sitewise_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, base_nb_mean, alphas, tumor_prop, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): + fun=fit_weighted_NegativeBinomial_mix, start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): """ Attributes ---------- @@ -513,18 +654,20 @@ def update_emission_params_nb_sitewise_uniqvalues_mix(unique_values, mapping_mat idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] for i in range(n_states): this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] - model = Weighted_NegativeBinomial_mix(unique_values[s][idx_nonzero,0], \ + res, n_llf = fun(unique_values[s][idx_nonzero,0], \ np.ones(len(idx_nonzero)).reshape(-1,1), \ weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero], exposure=unique_values[s][idx_nonzero,1], \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s], penalty=0) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] - new_alphas[i, s] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + tumor_prop=this_tp, start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + new_log_mu[i, s] = res[0] + new_alphas[i, s] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(unique_values[s][idx_nonzero,0], \ + # np.ones(len(idx_nonzero)).reshape(-1,1), \ + # weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero], exposure=unique_values[s][idx_nonzero,1], \ + # tumor_prop=this_tp, + # start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + # new_log_mu[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_alphas[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -558,30 +701,33 @@ def update_emission_params_nb_sitewise_uniqvalues_mix(unique_values, mapping_mat weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_NegativeBinomial_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp, penalty=0) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp, + start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_alpha=alphas[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for 
s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_log_mu[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_alphas[:,:] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp, + # start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # start_alpha=alphas[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + # new_log_mu[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_alphas[:,:] = res2[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas def update_emission_params_bb_sitewise_uniqvalues(unique_values, mapping_matrices, log_gamma, total_bb_RD, taus, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ + fun=fit_weighted_BetaBinomial, start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): """ Attributes @@ -626,17 +772,21 @@ def update_emission_params_bb_sitewise_uniqvalues(unique_values, mapping_matrice for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) if np.sum(tmp[i,idx_nonzero]) + np.sum(tmp[i+n_states,idx_nonzero]) >= 0.1: - model = Weighted_BetaBinom(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ + res, n_llf = fun(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ np.ones(2*len(idx_nonzero)).reshape(-1,1), \ weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ - exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]) ) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] - new_taus[i, s] = res.params[-1] - if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]), + start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s] ) + new_p_binom[i, s] = res[0] + new_taus[i, s] = res[-1] + # if not (start_p_binom is None): + # res2, n_llf2 = fun(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ + # np.ones(2*len(idx_nonzero)).reshape(-1,1), \ + # weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ + # 
exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]), \ + # start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s]) + # new_p_binom[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_taus[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -664,30 +814,33 @@ def update_emission_params_bb_sitewise_uniqvalues(unique_values, mapping_matrice y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, + start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_tau=taus[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_p_binom[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] - if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_p_binom[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_taus[:,:] = res[-1] + # if not (start_p_binom is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, + # start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # start_tau=taus[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + # new_p_binom[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_taus[:,:] = res2[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob return new_p_binom, new_taus def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, total_bb_RD, taus, tumor_prop, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ + fun=fit_weighted_BetaBinomial_mix, start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): """ Attributes @@ -738,19 +891,22 @@ def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_mat if np.sum(tmp[i,idx_nonzero]) + np.sum(tmp[i+n_states,idx_nonzero]) >= 0.1: this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] assert np.all(this_tp < 1+1e-4) - model = Weighted_BetaBinom_mix(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ + res, n_llf = 
fun(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ np.ones(2*len(idx_nonzero)).reshape(-1,1), \ weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]),\ - tumor_prop=this_tp) - # tumor_prop=tumor_prop ) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] - new_taus[i, s] = res.params[-1] - if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + tumor_prop=this_tp, start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s]) + new_p_binom[i, s] = res[0] + new_taus[i, s] = res[-1] + # if not (start_p_binom is None): + # res2, n_llf2 = fun(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ + # np.ones(2*len(idx_nonzero)).reshape(-1,1), \ + # weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ + # exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]),\ + # tumor_prop=this_tp, + # start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s]) + # new_p_binom[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_taus[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -784,23 +940,26 @@ def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_mat weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_BetaBinom_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp, + start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_tau=taus[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_p_binom[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] - if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_p_binom[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_taus[:,:] = res[-1] + # if not (start_p_binom is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp, + # start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # 
start_tau=taus[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + # new_p_binom[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_taus[:,:] = res2[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob return new_p_binom, new_taus @@ -867,7 +1026,7 @@ def update_transition_nophasing(log_xi, is_diag=False): def update_emission_params_nb_nophasing_uniqvalues(unique_values, mapping_matrices, log_gamma, alphas, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): + fun=fit_weighted_NegativeBinomial, start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): """ Attributes ---------- @@ -907,18 +1066,21 @@ def update_emission_params_nb_nophasing_uniqvalues(unique_values, mapping_matric tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] for i in range(n_states): - model = Weighted_NegativeBinomial(unique_values[s][idx_nonzero,0], \ + res, n_llf = fun(unique_values[s][idx_nonzero,0], \ np.ones(len(idx_nonzero)).reshape(-1,1), \ weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1], \ - penalty=0) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] - new_alphas[i, s] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + exposure=unique_values[s][idx_nonzero,1], + start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + new_log_mu[i, s] = res[0] + new_alphas[i, s] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(unique_values[s][idx_nonzero,0], \ + # np.ones(len(idx_nonzero)).reshape(-1,1), \ + # weights=tmp[i,idx_nonzero], \ + # exposure=unique_values[s][idx_nonzero,1], \ + # start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + # new_log_mu[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_alphas[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -946,30 +1108,33 @@ def update_emission_params_nb_nophasing_uniqvalues(unique_values, mapping_matric y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - model = Weighted_NegativeBinomial(y, features, weights=weights, exposure=exposure) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, + start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_alpha=alphas[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, 
start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_log_mu[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_alphas[:,:] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, + # start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # start_alpha=alphas[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + # new_log_mu[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_alphas[:,:] = res2[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas def update_emission_params_nb_nophasing_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, alphas, tumor_prop, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): + fun=fit_weighted_NegativeBinomial_mix, start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): """ Attributes ---------- @@ -1010,18 +1175,20 @@ def update_emission_params_nb_nophasing_uniqvalues_mix(unique_values, mapping_ma idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] for i in range(n_states): this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] - model = Weighted_NegativeBinomial_mix(unique_values[s][idx_nonzero,0], \ + res, n_llf = fun(unique_values[s][idx_nonzero,0], \ np.ones(len(idx_nonzero)).reshape(-1,1), \ weights=tmp[i,idx_nonzero], exposure=unique_values[s][idx_nonzero,1], \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s], penalty=0) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] - new_alphas[i, s] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + tumor_prop=this_tp, start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + new_log_mu[i, s] = res[0] + new_alphas[i, s] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(unique_values[s][idx_nonzero,0], \ + # np.ones(len(idx_nonzero)).reshape(-1,1), \ + # weights=tmp[i,idx_nonzero], exposure=unique_values[s][idx_nonzero,1], \ + # tumor_prop=this_tp, + # start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + # new_log_mu[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_alphas[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -1055,30 +1222,33 
@@ def update_emission_params_nb_nophasing_uniqvalues_mix(unique_values, mapping_ma weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_NegativeBinomial_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp, penalty=0) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp, + start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_alpha=alphas[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_log_mu[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_alphas[:,:] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp, + # start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # start_alpha=alphas[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + # new_log_mu[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_alphas[:,:] = res2[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matrices, log_gamma, taus, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ + fun=fit_weighted_BetaBinomial, start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): """ Attributes @@ -1123,17 +1293,21 @@ def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matric for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) if np.sum(tmp[i,idx_nonzero]) >= 0.1: - model = Weighted_BetaBinom(unique_values[s][idx_nonzero,0], \ + res, n_llf = fun(unique_values[s][idx_nonzero,0], \ np.ones(len(idx_nonzero)).reshape(-1,1), \ weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1] ) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] - new_taus[i, s] = res.params[-1] - if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if 
model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + exposure=unique_values[s][idx_nonzero,1], + start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s] ) + new_p_binom[i, s] = res[0] + new_taus[i, s] = res[-1] + # if not (start_p_binom is None): + # res2, n_llf2 = fun(unique_values[s][idx_nonzero,0], \ + # np.ones(len(idx_nonzero)).reshape(-1,1), \ + # weights=tmp[i,idx_nonzero], \ + # exposure=unique_values[s][idx_nonzero,1], + # start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s]) + # new_p_binom[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_taus[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -1161,23 +1335,26 @@ def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matric y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, + start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_tau=taus[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_p_binom[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] - if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_p_binom[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_taus[:,:] = res[-1] + # if not (start_p_binom is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, + # start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # start_tau=taus[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + # new_p_binom[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_taus[:,:] = res2[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob @@ -1185,7 +1362,7 @@ def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matric def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, taus, tumor_prop, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ + fun=fit_weighted_BetaBinomial_mix, start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ percent_threshold=0.99, min_binom_prob=0.01, 
+            # if not (start_p_binom is None):
+            #     res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure,
+            #         start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]),
+            #         start_tau=taus[0,s])
+            #     if n_llf2 < n_llf:
+            #         for s,idx_state_posweight in enumerate(state_posweights):
+            #             l1 = int( np.sum([len(x) for x in state_posweights[:s]]) )
+            #             l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) )
+            #             new_p_binom[idx_state_posweight, s] = res2[l1:l2]
+            #         if res2[-1] > 0:
+            #             new_taus[:,:] = res2[-1]
 
     new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob
     new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob
@@ -1185,7 +1362,7 @@ def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matric
 
 
 def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, taus, tumor_prop, \
-    start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \
+    fun=fit_weighted_BetaBinomial_mix, start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \
     percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99):
     """
     Attributes
@@ -1236,19 +1413,22 @@ def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_ma
                     if np.sum(tmp[i,idx_nonzero]) >= 0.1:
                         this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero]
                         assert np.all(this_tp < 1 + 1e-4)
-                        model = Weighted_BetaBinom_mix(unique_values[s][idx_nonzero,0], \
+                        res, n_llf = fun(unique_values[s][idx_nonzero,0], \
                             np.ones(len(idx_nonzero)).reshape(-1,1), \
                             weights=tmp[i,idx_nonzero], \
                             exposure=unique_values[s][idx_nonzero,1], \
-                            tumor_prop=this_tp)
-                            # tumor_prop=tumor_prop[s] )
-                        res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4)
-                        new_p_binom[i, s] = res.params[0]
-                        new_taus[i, s] = res.params[-1]
-                        if not (start_p_binom is None):
-                            res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4)
-                            new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0]
-                            new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1]
+                            tumor_prop=this_tp, start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s])
+                        new_p_binom[i, s] = res[0]
+                        new_taus[i, s] = res[-1]
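+                        # NOTE (editorial): this_tp above is the average tumor proportion of the spots
+                        # that map to each unique (allele count, total count) pair, so the mixture fit
+                        # sees a per-observation tumor purity; start_p_binom / start_tau are presumably
+                        # used as initial values for the optimizer.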
+                        # if not (start_p_binom is None):
+                        #     res2, n_llf2 = fun(unique_values[s][idx_nonzero,0], \
+                        #         np.ones(len(idx_nonzero)).reshape(-1,1), \
+                        #         weights=tmp[i,idx_nonzero], \
+                        #         exposure=unique_values[s][idx_nonzero,1], \
+                        #         tumor_prop=this_tp,
+                        #         start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s])
+                        #     new_p_binom[i, s] = res[0] if n_llf < n_llf2 else res2[0]
+                        #     new_taus[i, s] = res[-1] if n_llf < n_llf2 else res2[-1]
         else:
             exposure = []
             y = []
@@ -1282,23 +1462,26 @@ def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_ma
             weights = np.concatenate(weights)
             features = scipy.linalg.block_diag(*features)
             tp = np.concatenate(tp)
-            model = Weighted_BetaBinom_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp)
-            res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4)
+            res, n_llf = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp,
+                start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]),
+                start_tau=taus[0,s])
             for s,idx_state_posweight in enumerate(state_posweights):
                 l1 = int( np.sum([len(x) for x in state_posweights[:s]]) )
                 l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) )
-                new_p_binom[idx_state_posweight, s] = res.params[l1:l2]
-            if res.params[-1] > 0:
-                new_taus[:,:] = res.params[-1]
-            if not (start_p_binom is None):
-                res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4)
-                if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params):
-                    for s,idx_state_posweight in enumerate(state_posweights):
-                        l1 = int( np.sum([len(x) for x in state_posweights[:s]]) )
-                        l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) )
-                        new_p_binom[idx_state_posweight, s] = res2.params[l1:l2]
-                    if res2.params[-1] > 0:
-                        new_taus[:,:] = res2.params[-1]
+                new_p_binom[idx_state_posweight, s] = res[l1:l2]
+            if res[-1] > 0:
+                new_taus[:,:] = res[-1]
+            # if not (start_p_binom is None):
+            #     res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp,
+            #         start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]),
+            #         start_tau=taus[0,s])
+            #     if n_llf2 < n_llf:
+            #         for s,idx_state_posweight in enumerate(state_posweights):
+            #             l1 = int( np.sum([len(x) for x in state_posweights[:s]]) )
+            #             l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) )
+            #             new_p_binom[idx_state_posweight, s] = res2[l1:l2]
+            #         if res2[-1] > 0:
+            #             new_taus[:,:] = res2[-1]
     new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob
     new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob
     return new_p_binom, new_taus
diff --git a/src/calicost/utils_hmrf.py b/src/calicost/utils_hmrf.py
index bee9f42..4920fd3 100644
--- a/src/calicost/utils_hmrf.py
+++ b/src/calicost/utils_hmrf.py
@@ -399,11 +399,14 @@ def reorder_results_merged(res, n_obs):
 
 def load_hmrf_last_iteration(filename):
     allres = dict( np.load(filename, allow_pickle=True) )
     r = allres["num_iterations"] - 1
-    res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \
-        "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \
-        "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \
-        "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \
-        "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]}
+    res = {}
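+    # Copy every "round{r}_*" array of the final iteration into res, presumably so that any
+    # newly saved per-round quantity is picked up without editing this loader by hand.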
+    for k,v in allres.items():
+        if k.startswith(f"round{r}_"):
+            res[k.replace(f"round{r}_", "")] = v
+    # special case for assignment
+    del res["assignment"]
+    res['new_assignment'] = allres[f"round{r}_assignment"]
+    res['prev_assignment'] = allres[f"round{r-1}_assignment"]
     if "barcodes" in allres.keys():
         res["barcodes"] = allres["barcodes"]
     return res
@@ -411,11 +414,14 @@ def load_hmrf_last_iteration(filename):
 
 def load_hmrf_given_iteration(filename, r):
     allres = dict( np.load(filename, allow_pickle=True) )
-    res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \
-        "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \
-        "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \
-        "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \
-        "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]}
+    res = {}
+    for k,v in allres.items():
+        if k.startswith(f"round{r}_"):
+            res[k.replace(f"round{r}_", "")] = v
+    # special case for assignment
+    del res["assignment"]
+    res['new_assignment'] = allres[f"round{r}_assignment"]
+    res['prev_assignment'] = allres[f"round{r-1}_assignment"]
     if "barcodes" in allres.keys():
         res["barcodes"] = allres["barcodes"]
     return res
diff --git a/src/calicost/vi_main.py b/src/calicost/vi_main.py
new file mode 100644
index 0000000..9529d52
--- /dev/null
+++ b/src/calicost/vi_main.py
@@ -0,0 +1,109 @@
+import sys
+import numpy as np
+import scipy
+import pandas as pd
+from pathlib import Path
+from sklearn.metrics import adjusted_rand_score
+from sklearn.cluster import KMeans
+import scanpy as sc
+import anndata
+import logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
+logger = logging.getLogger()
+import copy
+from pathlib import Path
+import functools
+import subprocess
+import argparse  # needed for the __main__ CLI block below
+from calicost.arg_parse import *
+from calicost.hmm_NB_BB_phaseswitch import *
+from calicost.utils_distribution_fitting import *
+from calicost.utils_hmrf import *
+from calicost.hmrf import *
+from calicost.phasing import *
+from calicost.utils_IO import *
+from calicost.find_integer_copynumber import *
+from calicost.parse_input import *
+from calicost.utils_plotting import *
+from calicost.sample_posterior_clone import *
+from calicost.hmm_NB_BB_nophasing_float import *
+
+
+def main(configuration_file):
+    try:
+        config = read_configuration_file(configuration_file)
+    except:
+        config = read_joint_configuration_file(configuration_file)
+    print("Configurations:")
+    for k in sorted(list(config.keys())):
+        print(f"\t{k} : {config[k]}")
+
+    lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \
+        barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config)
+
+    single_tumor_prop[np.isnan(single_tumor_prop)] = 0.05
+
+    copy_single_X_rdr = copy.copy(single_X[:,0,:])
+    copy_single_base_nb_mean = copy.copy(single_base_nb_mean)
+    single_X[:,0,:] = 0
+    single_base_nb_mean[:,:] = 0
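+    # NOTE (editorial): the RDR channel is stashed in copy_single_X_rdr / copy_single_base_nb_mean
+    # and then zeroed out, presumably so that this VI/Gibbs run is driven by the BAF signal only.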
+    # # remove spots with tumor purity less than config['tumorprop_threshold']
+    # single_X = single_X[:,:,single_tumor_prop > config['tumorprop_threshold']]
+    # single_base_nb_mean = single_base_nb_mean[:,single_tumor_prop > config['tumorprop_threshold']]
+    # single_total_bb_RD = single_total_bb_RD[:,single_tumor_prop > config['tumorprop_threshold']]
+    # barcodes = barcodes[single_tumor_prop > config['tumorprop_threshold']]
+    # coords = coords[single_tumor_prop > config['tumorprop_threshold'],:]
+    # sample_ids = sample_ids[single_tumor_prop > config['tumorprop_threshold']]
+    # adjacency_mat = adjacency_mat[single_tumor_prop > config['tumorprop_threshold'],:][:,single_tumor_prop > config['tumorprop_threshold']]
+    # smooth_mat = smooth_mat[single_tumor_prop > config['tumorprop_threshold'],:][:,single_tumor_prop > config['tumorprop_threshold']]
+    # single_tumor_prop = single_tumor_prop[single_tumor_prop > config['tumorprop_threshold']]
+
+    # Run HMRF
+    for r_hmrf_initialization in range(config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"]):
+        outdir = f"{config['output_dir']}/clone{config['n_clones']}_rectangle{r_hmrf_initialization}_w{config['spatial_weight']:.1f}"
+        if config["tumorprop_file"] is None:
+            initial_clone_index = rectangle_initialize_initial_clone(coords, config["n_clones"], random_state=r_hmrf_initialization)
+        else:
+            initial_clone_index = rectangle_initialize_initial_clone_mix(coords, config["n_clones"], single_tumor_prop, threshold=config["tumorprop_threshold"], random_state=r_hmrf_initialization)
+
+        # create directory
+        p = subprocess.Popen(f"mkdir -p {outdir}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
+        out,err = p.communicate()
+        # save clone initialization into npz file
+        prefix = "vi_gibbs_hightp_allspots"
+        initial_assignment = np.zeros(single_X.shape[2], dtype=int)
+        for c,idx in enumerate(initial_clone_index):
+            initial_assignment[idx] = c
+
+        posterior_clones = np.zeros((single_X.shape[2], config['n_clones']))
+        posterior_clones[ (np.arange(single_X.shape[2]), initial_assignment) ] = 1
+
+        # run HMRF + HMM
+        X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=config["tumorprop_threshold"])
+        init_log_mu, init_p_binom = initialization_by_gmm(config['n_states'], np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \
+            base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), params='sp', random_state=config['gmm_random_state'], in_log_space=False, only_minor=False)
+
+        list_posterior_clones, list_cna_states, list_emission_llf, list_log_mu, list_alphas, list_p_binom, list_taus, list_log_startprob, list_elbo, list_h, list_labels = infer_all_v2(
+            single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, posterior_clones, config['n_states'],
+            coords, adjacency_mat, config['tumorprop_threshold'], config['spatial_weight'], 'visium', max_iter_outer=20, num_chains=20, burnin=50, sampling_tol=1e-10, temperature=2.0,
+            hmmclass=hmm_nophasing_float, hmm_params='sp', hmm_t=config['t'], hmm_random_state=config['gmm_random_state'], hmm_max_iter=config['max_iter'], hmm_tol=config['tol'], hmm_num_draws=200, fun=block_gibbs_sampling_labels,
+            smooth_mat=smooth_mat, init_p_binom=init_p_binom)
+
+        # save results
+        np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz",
+            list_posterior_clones=list_posterior_clones,
+            list_cna_states=list_cna_states,
+            list_emission_llf=list_emission_llf,
+            list_log_mu=list_log_mu,
+            list_p_binom=list_p_binom,
+            list_elbo=list_elbo,
+            list_h=list_h,
+            list_labels=list_labels)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str)
+    args = parser.parse_args()
+
+    main(args.configfile)
\ No newline at end of file