diff --git a/src/calicost/hmm_NB_BB_nophasing_float.py b/src/calicost/hmm_NB_BB_nophasing_float.py new file mode 100644 index 0000000..f4f6bcf --- /dev/null +++ b/src/calicost/hmm_NB_BB_nophasing_float.py @@ -0,0 +1,403 @@ +import logging +import numpy as np +from numba import njit +from scipy.stats import norm, multivariate_normal, poisson +import scipy.special +from scipy.optimize import minimize +from scipy.optimize import Bounds +from sklearn.mixture import GaussianMixture +from tqdm import trange +import statsmodels.api as sm +from statsmodels.base.model import GenericLikelihoodModel +import copy +from calicost.utils_distribution_fitting import * +from calicost.utils_hmm import * +import networkx as nx + +from scipy.special import gammaln + +try: + import torch + device = 'cuda' if torch.cuda.is_available() else 'cpu' + from calicost.utils_distribution_fitting_gpu import * +except ImportError: + device = 'cpu' + + +""" +Joint NB-BB HMM that accounts for tumor/normal genome proportions. Tumor genome proportion is weighted by mu in BB distribution. +""" + +def betabinom_logpmf_float(x, n, a, b): + """ + Compute beta-binomial logpmf allowing x and n to be float values. + + Attributes + ---------- + x : np.array + Number of successes. Allowing float number. + n : np.array + Number of trials. Allowing float number. + a : np.array + Shape parameter of the beta distribution. Probability of success * tau, where tau is a transformation of the over-dispersion parameter. + b : np.array + Shape parameter of the beta distribution. Probability of failure * tau, where tau is a transformation of the over-dispersion parameter. + """ + tau = a + b + d_choose_y = gammaln(n + 1) - \ + gammaln(x + 1) - \ + gammaln(n - x + 1) + + logbeta_numer = gammaln(x + a) + \ + gammaln(n - x + b) - \ + gammaln(n + tau) + + logbeta_denom = gammaln(a) + \ + gammaln(b) - \ + gammaln(tau) + + return d_choose_y + logbeta_numer - logbeta_denom + + +############################################################ +# whole inference +############################################################ + +class hmm_nophasing_float(object): + def __init__(self, params="stmp", t=1-1e-4): + """ + Attributes + ---------- + params : str + Codes for parameters that need to be updated. The corresponding parameter can only be updated if it is included in this argument. "s" for start probability; "t" for transition probability; "m" for Negative Binomial RDR signal; "p" for Beta Binomial BAF signal. + + t : float + Determine initial self transition probability to be 1-t. + """ + self.params = params + self.t = t + # + @staticmethod + def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus): + """ + Attributes + ---------- + X : array, shape (n_observations, n_components, n_spots) + Observed expression UMI count and allele frequency UMI count. + + base_nb_mean : array, shape (n_observations, n_spots) + Mean expression under diploid state. + + log_mu : array, shape (n_states, n_spots) + Log of read depth change due to CNV. Mean of NB distributions in HMM per state per spot. + + alphas : array, shape (n_states, n_spots) + Over-dispersion of NB distributions in HMM per state per spot. + + total_bb_RD : array, shape (n_observations, n_spots) + SNP-covering reads for both REF and ALT across genes along genome. + + p_binom : array, shape (n_states, n_spots) + BAF due to CNV. Mean of Beta Binomial distribution in HMM per state per spot. 
+ + taus : array, shape (n_states, n_spots) + Over-dispersion of Beta Binomial distribution in HMM per state per spot. + + Returns + ---------- + log_emission : array, shape (n_states, n_obs, n_spots) + Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. + """ + n_obs = X.shape[0] + n_comp = X.shape[1] + n_spots = X.shape[2] + n_states = log_mu.shape[0] + # initialize log_emission + log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) + log_emission_baf = np.zeros((n_states, n_obs, n_spots)) + for i in np.arange(n_states): + for s in np.arange(n_spots): + # expression from NB distribution + idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + if len(idx_nonzero_rdr) > 0: + nb_mean = base_nb_mean[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) + n, p = convert_params(nb_mean, nb_std) + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) + # AF from BetaBinom distribution + idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + if len(idx_nonzero_baf) > 0: + log_emission_baf[i, idx_nonzero_baf, s] = betabinom_logpmf_float(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], p_binom[i, s] * taus[i, s], (1-p_binom[i, s]) * taus[i, s]) + return log_emission_rdr, log_emission_baf + # + @staticmethod + def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs): + """ + Attributes + ---------- + X : array, shape (n_observations, n_components, n_spots) + Observed expression UMI count and allele frequency UMI count. + + base_nb_mean : array, shape (n_observations, n_spots) + Mean expression under diploid state. + + log_mu : array, shape (n_states, n_spots) + Log of read depth change due to CNV. Mean of NB distributions in HMM per state per spot. + + alphas : array, shape (n_states, n_spots) + Over-dispersion of NB distributions in HMM per state per spot. + + total_bb_RD : array, shape (n_observations, n_spots) + SNP-covering reads for both REF and ALT across genes along genome. + + p_binom : array, shape (n_states, n_spots) + BAF due to CNV. Mean of Beta Binomial distribution in HMM per state per spot. + + taus : array, shape (n_states, n_spots) + Over-dispersion of Beta Binomial distribution in HMM per state per spot. + + Returns + ---------- + log_emission : array, shape (n_states, n_obs, n_spots) + Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. 
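# Illustrative sketch (assumes betabinom_logpmf_float above is in scope): for integer x and n
# it should reduce to the standard beta-binomial logpmf, while still accepting fractional
# counts such as clone-weighted pseudobulk values.
import numpy as np
from scipy.stats import betabinom
x = np.array([3.0, 7.0, 0.0])
n = np.array([10.0, 20.0, 5.0])
p, tau = 0.3, 30.0
a, b = p * tau, (1 - p) * tau
assert np.allclose(betabinom_logpmf_float(x, n, a, b),
                   betabinom.logpmf(x.astype(int), n.astype(int), a, b))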
+ """ + n_obs = X.shape[0] + n_comp = X.shape[1] + n_spots = X.shape[2] + n_states = log_mu.shape[0] + # initialize log_emission + log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) + log_emission_baf = np.zeros((n_states, n_obs, n_spots)) + for i in np.arange(n_states): + for s in np.arange(n_spots): + # expression from NB distribution + idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + if len(idx_nonzero_rdr) > 0: + # nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[s]) + nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[idx_nonzero_rdr,s]) + nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) + n, p = convert_params(nb_mean, nb_std) + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) + # AF from BetaBinom distribution + if ("logmu_shift" in kwargs) and ("sample_length" in kwargs): + this_weighted_tp = [] + for c in range(len(kwargs["sample_length"])): + range_s = np.sum(kwargs["sample_length"][:c]) + range_t = np.sum(kwargs["sample_length"][:(c+1)]) + this_weighted_tp.append( tumor_prop[range_s:range_t,s] * np.exp(log_mu[i, s] - kwargs["logmu_shift"][c,s]) / (tumor_prop[range_s:range_t,s] * np.exp(log_mu[i, s] - kwargs["logmu_shift"][c,s]) + 1 - tumor_prop[range_s:range_t,s]) ) + this_weighted_tp = np.concatenate(this_weighted_tp) + else: + this_weighted_tp = tumor_prop[:,s] + idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + if len(idx_nonzero_baf) > 0: + mix_p_A = p_binom[i, s] * this_weighted_tp[idx_nonzero_baf] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf]) + mix_p_B = (1 - p_binom[i, s]) * this_weighted_tp[idx_nonzero_baf] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf]) + log_emission_baf[i, idx_nonzero_baf, s] += betabinom_logpmf_float(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_A * taus[i, s], mix_p_B * taus[i, s]) + return log_emission_rdr, log_emission_baf + # + @staticmethod + @njit + def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): + ''' + Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. + Input + lengths: sum of lengths = n_observations. + log_transmat: n_states * n_states. Transition probability after log transformation. + log_startprob: n_states. Start probability after log transformation. + log_emission: n_states * n_observations * n_spots. Log probability. + Output + log_alpha: size n_states * n_observations. log alpha[j, t] = log P(o_1, ... o_t, q_t = j | lambda). + ''' + n_obs = log_emission.shape[1] + n_states = log_emission.shape[0] + assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" + assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + # initialize log_alpha + log_alpha = np.zeros((log_emission.shape[0], n_obs)) + buf = np.zeros(log_emission.shape[0]) + cumlen = 0 + for le in lengths: + # start prob + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # But adding too many spots may lead to a higher weight of the emission rather then transition prob. 
+ log_alpha[:, cumlen] = log_startprob + np_sum_ax_squeeze(log_emission[:, cumlen, :], axis=1) + for t in np.arange(1, le): + for j in np.arange(log_emission.shape[0]): + for i in np.arange(log_emission.shape[0]): + buf[i] = log_alpha[i, (cumlen + t - 1)] + log_transmat[i, j] + log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum(log_emission[j, (cumlen + t), :]) + cumlen += le + return log_alpha + # + @staticmethod + @njit + def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): + ''' + Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. + Input + X: size n_observations * n_components * n_spots. + lengths: sum of lengths = n_observations. + log_transmat: n_states * n_states. Transition probability after log transformation. + log_startprob: n_states. Start probability after log transformation. + log_emission: n_states * n_observations * n_spots. Log probability. + Output + log_beta: size 2*n_states * n_observations. log beta[i, t] = log P(o_{t+1}, ..., o_T | q_t = i, lambda). + ''' + n_obs = log_emission.shape[1] + n_states = log_emission.shape[0] + assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" + assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + # initialize log_beta + log_beta = np.zeros((log_emission.shape[0], n_obs)) + buf = np.zeros(log_emission.shape[0]) + cumlen = 0 + for le in lengths: + # start prob + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # But adding too many spots may lead to a higher weight of the emission rather then transition prob. + log_beta[:, (cumlen + le - 1)] = 0 + for t in np.arange(le-2, -1, -1): + for i in np.arange(log_emission.shape[0]): + for j in np.arange(log_emission.shape[0]): + buf[j] = log_beta[j, (cumlen + t + 1)] + log_transmat[i, j] + np.sum(log_emission[j, (cumlen + t + 1), :]) + log_beta[i, (cumlen + t)] = mylogsumexp(buf) + cumlen += le + return log_beta + + # + def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat=None, tumor_prop=None, \ + fix_NB_dispersion=False, shared_NB_dispersion=False, fix_BB_dispersion=False, shared_BB_dispersion=False, \ + is_diag=False, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=100, tol=1e-4, **kwargs): + ''' + Input + X: size n_observations * n_components * n_spots. + lengths: sum of lengths = n_observations. + base_nb_mean: size of n_observations * n_spots. + In NB-BetaBinom model, n_components = 2 + Intermediate + log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. + alpha: size of n_states. Dispersioon parameter of each HMM state. 
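# Rough smoke-test sketch of the routine documented above, on synthetic shapes only
# (values are illustrative; assumes the calicost fitting utilities imported at the top of the file are available).
n_obs, n_spots, n_states = 200, 1, 3
X = np.random.poisson(10, size=(n_obs, 2, n_spots)).astype(float)
base_nb_mean = np.full((n_obs, n_spots), 10.0)
total_bb_RD = X[:, 1, :] + np.random.poisson(10, size=(n_obs, n_spots))
hmm = hmm_nophasing_float()
new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma = \
    hmm.run_baum_welch_nb_bb(X, np.array([n_obs]), n_states, base_nb_mean, total_bb_RD, max_iter=5)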
+ ''' + n_obs = X.shape[0] + n_comp = X.shape[1] + n_spots = X.shape[2] + assert n_comp == 2 + # initialize NB logmean shift and BetaBinom prob + log_mu = np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu + p_binom = np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom + # initialize (inverse of) dispersion param in NB and BetaBinom + alphas = 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus + # initialize start probability and emission probability + log_startprob = np.log( np.ones(n_states) / n_states ) + if n_states > 1: + transmat = np.ones((n_states, n_states)) * (1-self.t) / (n_states-1) + np.fill_diagonal(transmat, self.t) + log_transmat = np.log(transmat) + else: + log_transmat = np.zeros((1,1)) + # initialize log_gamma + log_gamma = kwargs["log_gamma"] if "log_gamma" in kwargs else None + # degenerative case of constructing a list of unique (observed count, expected count) values, and mapping count per bin to the index in the unique list + unique_values_nb = [np.vstack([X[:,0,s], base_nb_mean[:,s]]).T for s in range(n_spots)] + mapping_matrices_nb = [scipy.sparse.csr_matrix((np.ones(n_obs), (np.arange(n_obs), np.arange(n_obs)) ))] + unique_values_bb = [np.vstack([X[:,1,s], total_bb_RD[:,s]]).T for s in range(n_spots)] + mapping_matrices_bb = [scipy.sparse.csr_matrix((np.ones(n_obs), (np.arange(n_obs), np.arange(n_obs)) ))] + # EM algorithm + for r in trange(max_iter): + # E step + if tumor_prop is None: + log_emission_rdr, log_emission_baf = hmm_nophasing_float.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission = log_emission_rdr + log_emission_baf + else: + # compute mu as adjusted RDR + if ((not log_gamma is None) or (r > 0)) and ("m" in self.params): + logmu_shift = [] + for c in range(len(kwargs["sample_length"])): + this_pred_cnv = np.argmax(log_gamma[:,np.sum(kwargs["sample_length"][:c]):np.sum(kwargs["sample_length"][:(c+1)])], axis=0)%n_states + logmu_shift.append( scipy.special.logsumexp(log_mu[this_pred_cnv,:] + np.log(kwargs["lambd"]).reshape(-1,1), axis=0) ) + logmu_shift = np.vstack(logmu_shift) + log_emission_rdr, log_emission_baf = hmm_nophasing_float.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, logmu_shift=logmu_shift, sample_length=kwargs["sample_length"]) + else: + log_emission_rdr, log_emission_baf = hmm_nophasing_float.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop) + log_emission = log_emission_rdr + log_emission_baf + log_alpha = hmm_nophasing_float.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_beta = hmm_nophasing_float.backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_gamma = compute_posterior_obs(log_alpha, log_beta) + log_xi = compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, log_emission) + # M step + if "s" in self.params: + new_log_startprob = update_startprob_nophasing(lengths, log_gamma) + new_log_startprob = new_log_startprob.flatten() + else: + new_log_startprob = log_startprob + if "t" in self.params: + new_log_transmat = update_transition_nophasing(log_xi, is_diag=is_diag) + else: + 
new_log_transmat = log_transmat + if "m" in self.params: + if tumor_prop is None: + if device == 'cpu': + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, + fun=fit_weighted_NegativeBinomial_gpu, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + if device == 'cpu': + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ + fun=fit_weighted_NegativeBinomial_mix_gpu, fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu = log_mu + new_alphas = alphas + if "p" in self.params: + if tumor_prop is None: + if device == 'cpu': + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, start_p_binom=p_binom, \ + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, + fun=fit_weighted_BetaBinomial_gpu, start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + # compute mu as adjusted RDR + if ("m" in self.params): + mu = [] + for c in range(len(kwargs["sample_length"])): + this_pred_cnv = np.argmax(log_gamma[:,np.sum(kwargs["sample_length"][:c]):np.sum(kwargs["sample_length"][:(c+1)])], axis=0)%n_states + mu.append( np.exp(new_log_mu[this_pred_cnv,:]) / np.sum(np.exp(new_log_mu[this_pred_cnv,:]) * kwargs["lambd"].reshape(-1,1), axis=0, keepdims=True) ) + mu = np.vstack(mu) + weighted_tp = (tumor_prop * mu) / (tumor_prop * mu + 1 - tumor_prop) + else: + weighted_tp = tumor_prop + if device == 'cpu': + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, weighted_tp, start_p_binom=p_binom, \ + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, weighted_tp, start_p_binom=p_binom, \ + fun=fit_weighted_BetaBinomial_mix_gpu, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom = p_binom + new_taus = taus + # check convergence + print( np.mean(np.abs( np.exp(new_log_startprob) - np.exp(log_startprob) )), \ + np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )), \ + np.mean(np.abs(new_log_mu - log_mu)),\ + np.mean(np.abs(new_p_binom - p_binom)) ) + print( np.hstack([new_log_mu, new_p_binom]) ) + if np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )) < tol and \ + np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol: + break + log_startprob = new_log_startprob + 
log_transmat = new_log_transmat + log_mu = new_log_mu + alphas = new_alphas + p_binom = new_p_binom + taus = new_taus + return new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma + + diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index d5a9145..1763272 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -14,6 +14,14 @@ from calicost.utils_hmm import * import networkx as nx +try: + import torch + device = 'cuda' if torch.cuda.is_available() else 'cpu' + from calicost.utils_distribution_fitting_gpu import * +except ImportError: + device = 'cpu' + + """ Joint NB-BB HMM that accounts for tumor/normal genome proportions. Tumor genome proportion is weighted by mu in BB distribution. """ @@ -294,18 +302,32 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, new_log_transmat = log_transmat if "m" in self.params: if tumor_prop is None: - new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + if device == 'cpu': + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, + fun=fit_weighted_NegativeBinomial_gpu, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) else: - new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + if device == 'cpu': + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ + fun=fit_weighted_NegativeBinomial_mix_gpu, fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) else: new_log_mu = log_mu new_alphas = alphas if "p" in self.params: if tumor_prop is None: - new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + if device == 'cpu': + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, start_p_binom=p_binom, \ + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, + fun=fit_weighted_BetaBinomial_gpu, start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) else: # compute mu as adjusted RDR if ("m" in self.params): @@ 
-317,8 +339,12 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, weighted_tp = (tumor_prop * mu) / (tumor_prop * mu + 1 - tumor_prop) else: weighted_tp = tumor_prop - new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, weighted_tp, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + if device == 'cpu': + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, weighted_tp, start_p_binom=p_binom, \ + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, weighted_tp, start_p_binom=p_binom, \ + fun=fit_weighted_BetaBinomial_mix_gpu, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) else: new_p_binom = p_binom new_taus = taus diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 630651f..23fa956 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -17,6 +17,14 @@ import networkx as nx +try: + import torch + device = 'cuda' if torch.cuda.is_available() else 'cpu' + from calicost.utils_distribution_fitting_gpu import * +except ImportError: + device = 'cpu' + + ############################################################ # whole inference ############################################################ @@ -285,24 +293,40 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, else: new_log_transmat = log_transmat if "m" in self.params: - # new_log_mu, new_alphas = update_emission_params_nb_sitewise(X[:,0,:], log_gamma, base_nb_mean, alphas, start_log_mu=log_mu, \ - # fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) if tumor_prop is None: - new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, alphas, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + if device == 'cpu': + new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, alphas, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, + fun=fit_weighted_NegativeBinomial_gpu, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) else: - new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, alphas, tumor_prop, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + if device == 'cpu': + new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, alphas, tumor_prop, start_log_mu=log_mu, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + else: + new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, 
alphas, tumor_prop, start_log_mu=log_mu, \ + fun=fit_weighted_NegativeBinomial_mix_gpu, fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) else: new_log_mu = log_mu new_alphas = alphas if "p" in self.params: if tumor_prop is None: - new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + if device == 'cpu': + new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, start_p_binom=p_binom, \ + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, + fun=fit_weighted_BetaBinomial_gpu, start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) else: - new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, tumor_prop, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + if device == 'cpu': + new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, tumor_prop, start_p_binom=p_binom, \ + fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + else: + new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, tumor_prop, start_p_binom=p_binom, \ + fun=fit_weighted_BetaBinomial_mix_gpu, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) else: new_p_binom = p_binom new_taus = taus @@ -546,10 +570,12 @@ def pipeline_baum_welch(output_prefix, X, lengths, n_states, base_nb_mean, total tmp = np.log10(1 - t) np.savez(f"{output_prefix}_nstates{n_states}_{params}_{tmp:.0f}_seed{random_state}.npz", \ new_log_mu=new_log_mu, new_alphas=new_alphas, new_p_binom=new_p_binom, new_taus=new_taus, \ - new_log_startprob=new_log_startprob, new_log_transmat=new_log_transmat, log_gamma=log_gamma, pred_cnv=pred_cnv, llf=llf) + new_log_startprob=new_log_startprob, new_log_transmat=new_log_transmat, log_gamma=log_gamma, pred_cnv=pred_cnv, llf=llf, + log_emission=log_emission, log_alpha=log_alpha, log_beta=log_beta) else: res = {"new_log_mu":new_log_mu, "new_alphas":new_alphas, "new_p_binom":new_p_binom, "new_taus":new_taus, \ - "new_log_startprob":new_log_startprob, "new_log_transmat":new_log_transmat, "log_gamma":log_gamma, "pred_cnv":pred_cnv, "llf":llf} + "new_log_startprob":new_log_startprob, "new_log_transmat":new_log_transmat, "log_gamma":log_gamma, "pred_cnv":pred_cnv, "llf":llf, + "log_emission":log_emission, "log_alpha":log_alpha, "log_beta":log_beta} return res diff --git a/src/calicost/sample_posterior_clone.py b/src/calicost/sample_posterior_clone.py new file mode 100644 index 0000000..45e069f --- /dev/null +++ b/src/calicost/sample_posterior_clone.py @@ -0,0 +1,725 @@ +import sys +import copy +import numpy as np +import scipy +import pandas as pd +from matplotlib import pyplot as plt +import seaborn + +import graph_tool.all as gt + +from calicost.utils_hmm import * +from calicost.hmm_NB_BB_phaseswitch import * +from calicost.utils_phase_switch 
import * +from calicost.utils_hmm import multinoulli_sampler_high_dimensional + + +def compute_potential(single_llf, labels, spatial_weight, adjacency_mat): + """ + Attributes + ---------- + single_llf : np.array, size (num_spots, num_labels) + The log-likelihood of each pixel being in each label. + labels : np.array, size (num_spots) + The label of each pixel. + spatial_weight : float + The weight of the spatial coherence term. + adjacency_mat : np.array, size (num_spots, num_spots) + The adjacency matrix of the spatial graph. Note this is a symmetric matrix with 0 diagonal. + """ + N = len(labels) + # emission part + neg_potential = single_llf[(np.arange(N), labels)].sum() + # spatial adjacency part + # make sure diagonal is 0 + if not np.all(adjacency_mat.diagonal() == 0): + adjacency_mat.setdiag(0) + # neg_potential += 0.5 * spatial_weight * np.sum(adjacency_mat * (labels[:, None] == labels[None, :])) + idx1, idx2 = adjacency_mat.nonzero() + neg_potential += spatial_weight * np.sum(labels[idx1] == labels[idx2]) + return neg_potential / N + + +def block_gibbs_sampling_labels(emission_llf, log_clonesize, adjacency_mat, block_ids, spatial_weight, num_draws, random_state, temperature, initial_logprob=None): + """ + Attributes + ---------- + emission_llf : np.array, size (num_spots, n_clones) + The log-likelihood of each pixel being in each clone label. + log_clonesize : np.array, size (num_spots, n_clones) + The prior probability of selecting each clone label for each spot. + adjacency_mat : np.array, size (num_spots, num_spots) + The adjacency matrix of the spatial graph. Note this is a symmetric matrix with 0 diagonal. + block_ids : np.array, size (num_spots) + The block id of each spot. Each block is updated together in a block Gibbs sampling step. + spatial_weight : float + The weight of the spatial coherence term. + num_draws : int + The number of draws to draw. + random_state : int + The random seed. + initial_logprob : np.array, size (num_spots, n_clones) + Initial log probability to sample clone labels. 
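# Tiny illustration of compute_potential above (hypothetical numbers): the potential is
# (log-likelihood of the chosen labels + spatial_weight * number of agreeing adjacency entries) / N;
# with a symmetric adjacency matrix each agreeing undirected edge is counted twice.
import numpy as np
import scipy.sparse
single_llf = np.log(np.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]]))
labels = np.array([0, 1, 0])
adj = scipy.sparse.csr_matrix(np.array([[0, 1, 1], [1, 0, 0], [1, 0, 0]]))
pot = compute_potential(single_llf, labels, spatial_weight=2.0, adjacency_mat=adj)
# emission part: log(0.9) + log(0.8) + log(0.6); spatial part: 2.0 * 2 (edge 0-2 counted both ways)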
+ """ + np.random.seed(random_state) + # number of spots, blocks, and clones + N, n_clones = emission_llf.shape + n_blocks = len(np.unique(block_ids)) + + # list of labels of each sampling time point + # with initial label by random sampling + if not initial_logprob is None: + list_labels = [ multinoulli_sampler_high_dimensional(initial_logprob) ] + else: + list_labels = [ multinoulli_sampler_high_dimensional(log_clonesize) ] + list_potentials = [ compute_potential(emission_llf + log_clonesize, list_labels[-1], spatial_weight, adjacency_mat) ] + + # block Gibbs sampling + # prepare sub-adjacency matrix where rows correspond to one block, columns correspond to the other + A_sub_list = [adjacency_mat[block_ids==b, :][:, block_ids!=b] for b in range(n_blocks)] + for iter in range(num_draws): + this_l = np.zeros(N, dtype=int) + for b in range(n_blocks): + # for spots with block_id of b, compute the posterior distribution + post = copy.copy((emission_llf + log_clonesize)[block_ids==b, :]) + for c in range(n_clones): + post[:,c] += spatial_weight * A_sub_list[b] @ (list_labels[-1][block_ids!=b] == c) + + post = post / temperature + post -= scipy.special.logsumexp(post, axis=1, keepdims=True) + this_l[block_ids==b] = multinoulli_sampler_high_dimensional(post) + + list_labels.append(this_l) + # potential of the new state + list_potentials.append( compute_potential(emission_llf + log_clonesize, list_labels[-1], spatial_weight, adjacency_mat) ) + + return list_labels, list_potentials + + +def update_bond(label, adjacency_mat, prob): + """ + Bond configuration of the S-W algorithm. + A bond is created between two adjacent nodes with probability prob (prob=1 - exp(-beta / T)) if the two nodes have the same label. + There is no bond between two nodes with different labels. + + Attributes + ---------- + label : numpy.ndarray + Label of each node. + + adjacency_mat : scipy.sparse.csr_matrix + Adjacency matrix of the graph. + + prob : float + Probability of creating a bond between two adjacent nodes. + + Returns + ------- + bond : scipy.sparse.csr_matrix + Bond matrix of the graph. + + connected_components : np.ndarray + Label for each node to belong to which connected components from the graph with original nodes and sampled bonds. + """ + # adjacency edges from the adjacency matrix + edges = scipy.sparse.triu(adjacency_mat, k=1).nonzero() + # indicator of the same label + same_label = (label[edges[0]] == label[edges[1]]) + edges = (edges[0][same_label], edges[1][same_label]) + # create bonds with probability prob + sample_bond = (np.random.rand(len(edges[0])) < prob) + edges = (edges[0][sample_bond], edges[1][sample_bond]) + # create bond matrix + bond = scipy.sparse.csr_matrix((np.ones(len(edges[0]), dtype=bool), edges), shape=adjacency_mat.shape) + bond = bond + bond.T + + # connected components by graph_tool + es = np.array( bond.nonzero() ).T + g = gt.Graph(adjacency_mat.shape[0]) + g.add_edge_list(es) + comp, hist = gt.label_components(g) + connected_components = comp.a + + return bond, connected_components + + +# def update_label(emission_llf, log_clonesize, connected_components): +# """ +# Label of each connected component is sampled randomly from the posterior distribution, which is proportional to exp(emission_llf + log_clonesize). + +# Attributes +# ---------- +# emission_llf : np.array, size (num_spots, n_clones) +# The log-likelihood of each pixel being in each clone label. 
+# log_clonesize : np.array, size (num_spots, n_clones) +# The prior probability of selecting each clone label for each spot. +# connected_components : np.ndarray +# Index of connected components for each node. Ranges from 0 to n_components - 1. + +# Returns +# ------- +# labels : np.ndarray +# Sampled label of each node. +# """ +# # number of connected components +# n_components = np.max(connected_components) + 1 +# assert n_components == len(np.unique(connected_components)), "Connected components are not consecutive." + +# # number of clone labels +# n_labels = emission_llf.shape[1] + +# # compute posterior distribution of each connected component by taking the average of related nodes +# post_per_node = emission_llf + log_clonesize +# post_per_component = np.zeros((n_components, n_labels)) +# for i in range(n_components): +# post_per_component[i] = np.mean(post_per_node[connected_components == i], axis=0) +# # normalize +# post_per_component -= scipy.special.logsumexp(post_per_component, axis=1, keepdims=True) +# # sample label per connected component +# l_per_component = multinoulli_sampler_high_dimensional(post_per_component) +# # assign label to each node +# labels = l_per_component[connected_components] + +# return labels + + +def update_label(emission_llf, log_clonesize, connected_components, temperature): + """ + Label of each connected component is sampled randomly from the posterior distribution, which is proportional to exp(emission_llf + log_clonesize). + + Attributes + ---------- + emission_llf : np.array, size (num_spots, n_clones) + The log-likelihood of each pixel being in each clone label. + log_clonesize : np.array, size (num_spots, n_clones) + The prior probability of selecting each clone label for each spot. + connected_components : np.ndarray + Index of connected components for each node. Ranges from 0 to n_components - 1. + temperature : float + Temperature parameter to adjust the log emission probability. + + Returns + ------- + labels : np.ndarray + Sampled label of each node. + """ + # number of connected components + n_components = np.max(connected_components) + 1 + assert n_components == len(np.unique(connected_components)), "Connected components are not consecutive." 
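# Sketch of the sparse one-hot trick used below to average per-node posteriors within each
# connected component without a Python loop (hypothetical values):
import numpy as np
import scipy.sparse
cc = np.array([0, 0, 1])                                   # component id per node
post_per_node = np.array([[1.0, 3.0], [3.0, 5.0], [10.0, 0.0]])
onehot = scipy.sparse.csr_matrix((np.ones(3), (np.arange(3), cc)), shape=(3, 2))
post_per_component = onehot.T.dot(post_per_node) / np.asarray(onehot.sum(axis=0)).T
# -> [[2., 4.], [10., 0.]], i.e. the mean of rows 0-1 and of row 2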
+ + # number of clone labels + n_labels = emission_llf.shape[1] + + # compute posterior distribution of each connected component by taking the average of related nodes + post_per_node = emission_llf + log_clonesize + # convert connected_components into one-hot encoding scipy sparse matrix + onehot = scipy.sparse.csr_matrix((np.ones(len(connected_components), dtype=int), (np.arange(len(connected_components)), connected_components)), + shape=(len(connected_components), n_components)) + # take sum of post_per_node for each connected component + post_per_component = onehot.T.dot(post_per_node) + # divided by the number of nodes in each connected component + post_per_component = post_per_component / onehot.sum(axis=0).T.A + # scale by temperature + post_per_component = post_per_component / temperature + # normalize + post_per_component -= scipy.special.logsumexp(post_per_component, axis=1, keepdims=True) + # sample label per connected component + l_per_component = multinoulli_sampler_high_dimensional(post_per_component) + # assign label to each node + labels = l_per_component[connected_components] + + return labels + + +def SW_sampling_labels(emission_llf, log_clonesize, adjacency_mat, block_ids, spatial_weight, num_draws, random_state, temperature, initial_logprob=None): + """ + Sampling num_draws of label configurations using the S-W algorithm (a specific MCMC method). + + Attributes + ---------- + emission_llf : np.array, size (num_spots, n_clones) + The log-likelihood of each pixel being in each clone label. + log_clonesize : np.array, size (num_spots, n_clones) + The prior probability of selecting each clone label for each spot. + adjacency_mat : scipy.sparse.csr_matrix + Adjacency matrix of the graph. + spatial_weight : float + Spatial weight parameter. + num_draws : int + Number of label configurations to draw in this Markov chain. + random_state : int + Random seed. + temperature : float + Temperature parameter. Temperature and spatial_weight are used to determine the bond probability. + initial_logprob : np.array, size (num_spots, n_clones), optional + + Returns + ------- + list_labels : list of np.ndarray + List of sampled label configurations across num_draws iterations. 
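# The Swendsen-Wang bond probability used below is prob = 1 - exp(-spatial_weight / temperature):
# a higher temperature gives fewer bonds, hence smaller connected components and more local updates.
import numpy as np
for T in (4.0, 2.0, 1.0):
    print(T, 1 - np.exp(-1.0 / T))   # with spatial_weight = 1.0 (illustrative)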
+ """ + # set random seed + np.random.seed(random_state) + + # initialization + n_spots, n_clones = emission_llf.shape + if not initial_logprob is None: + list_labels = [ multinoulli_sampler_high_dimensional(initial_logprob) ] + else: + list_labels = [ multinoulli_sampler_high_dimensional(log_clonesize) ] + list_potentials = [ compute_potential(emission_llf + log_clonesize, list_labels[-1], spatial_weight, adjacency_mat) ] + + # sample across num_draws iterations + for iter in range(num_draws): + # update bond + prob = 1 - np.exp(-spatial_weight / temperature) + bond, connected_components = update_bond(list_labels[-1], adjacency_mat, prob) + # update label + list_labels.append( update_label(emission_llf, log_clonesize, connected_components, temperature) ) + # update potential + list_potentials.append( compute_potential(emission_llf + log_clonesize, list_labels[-1], spatial_weight, adjacency_mat) ) + + return list_labels, list_potentials + + +def posterior_distribution_clone_labels(emission_llf, log_clonesize, adjacency_mat, coords, spatial_weight, num_chains, burnin, platform='visium', fun=block_gibbs_sampling_labels, temperature=1.0): + n_spots, n_clones = emission_llf.shape + list_labels = [] + list_potentials = [] + + # if block Gibbs sampling + if platform == 'visium': + block_ids = coords[:,1]%3 + else: + block_ids = (coords[:,0] + coords[:,1]) % 2 + + # number of chains (starting points) + for r in range(num_chains): + this_list_labels, this_list_potential = fun(emission_llf, log_clonesize, adjacency_mat, block_ids, spatial_weight, num_draws=150, random_state=r, temperature=temperature) + list_labels.append(this_list_labels) + list_potentials.append(this_list_potential) + + # stack lists + list_labels = np.stack(list_labels) + list_potentials = np.stack(list_potentials) + + # make a plot of the potential of the first chain + for r in range(num_chains): + plt.plot(list_potentials[r], alpha=0.5) + plt.xlabel('iteration') + plt.ylabel('potential') + plt.show() + + # remove burnin + list_labels = list_labels[:,burnin:, :] + list_potentials = list_potentials[:, burnin:] + + # probability of getting each clone at each spot + clone_prob = [] + for r in range(num_chains): + this_clone_prob = np.array([ np.bincount(l, minlength=n_clones) for l in list_labels[r].T ]) + this_clone_prob = this_clone_prob / list_labels[r].shape[0] + clone_prob.append( this_clone_prob ) + + # aggregate across chains to get the final posterior distribution per spot + agg_clone_prob = np.mean(clone_prob, axis=0) + return agg_clone_prob, list_labels.reshape((-1, n_spots)), list_potentials.flatten() + + +def infer_all(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_label, n_states, + coords, adjacency_mat, tumorprop_threshold, spatial_weight, platform, max_iter_outer, num_chains, burnin, sampling_tol, temperature, + hmmclass, hmm_params, hmm_t, hmm_random_state, hmm_max_iter, hmm_tol, hmm_num_draws, fun=block_gibbs_sampling_labels, + smooth_mat=None, sample_ids=None, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, + fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True): + + n_clones = len(np.unique(initial_clone_label)) + n_obs, n_spots = single_total_bb_RD.shape + + # aggregated counts according to smooth_mat + if not smooth_mat is None: + agg_single_X = np.stack([ single_X[i,:,:] @ smooth_mat for i in range(single_X.shape[0]) ]) + agg_single_base_nb_mean = single_base_nb_mean @ smooth_mat + 
agg_single_total_bb_RD = single_total_bb_RD @ smooth_mat + agg_single_tumor_prop = (single_tumor_prop @ smooth_mat) / np.sum(smooth_mat.A, axis=0) + else: + # aggregated matrices are the same as their original ones + agg_single_X = single_X + agg_single_base_nb_mean = single_base_nb_mean + agg_single_total_bb_RD = single_total_bb_RD + agg_single_tumor_prop = single_tumor_prop + + # make a fake log_sitewise_transmat because some function calls have that parameter for formating purpose, but it's not actually used + log_sitewise_transmat = np.zeros(n_obs) + + # the initial posterior distribution of clone labels is a binary matrix derived from initial_clone_label + posterior_clones = np.zeros((n_spots, n_clones)) + posterior_clones[ (np.arange(n_spots), initial_clone_label) ] = 1 + log_clonesize = np.ones(n_clones) * np.log(1.0 / n_clones) + log_clonesize = np.repeat( log_clonesize[None,:], single_X.shape[2], axis=0 ) + + # initial emission parameters + last_log_mu = init_log_mu + last_alphas = init_alphas + last_p_binom = init_p_binom + last_taus = init_taus + + list_posterior_clones = [posterior_clones] + list_cna_states = [] + list_emission_llf = [] + list_log_mu = [] + list_p_binom = [] + list_elbo = [] + + for r in range(max_iter_outer): + ##### Fit HMM using posterior_clones ##### + # aggregate into pseudobulk for each clone weighted by posterior_clones + X = (single_X[:,:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :]) + base_nb_mean = single_base_nb_mean[:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] + total_bb_RD = single_total_bb_RD[:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] + tumor_prop = single_tumor_prop[single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] / posterior_clones[single_tumor_prop>tumorprop_threshold, :].sum(axis=0) + + # initialize parameters + if (last_log_mu is None and "m" in hmm_params) or (last_p_binom is None and "p" in hmm_params): + tmp_log_mu, tmp_p_binom = initialization_by_gmm(n_states, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ + base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), hmm_params, random_state=hmm_random_state, in_log_space=False, only_minor=False) + + last_log_mu = tmp_log_mu if init_log_mu is None and "m" in hmm_params else None + last_p_binom = tmp_p_binom if init_p_binom is None and "p" in hmm_params else None + + # fit HMM + res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), n_states, \ + base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), np.repeat(tumor_prop, X.shape[0]).reshape(-1,1), \ + hmmclass=hmmclass, params=hmm_params, t=hmm_t, random_state=hmm_random_state, \ + fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ + is_diag=True, init_log_mu=last_log_mu, init_p_binom=last_p_binom, init_alphas=last_alphas, init_taus=last_taus, max_iter=hmm_max_iter, tol=hmm_tol) + # estimate log emission probability for each spot belong to each clone based on the fitted HMM res + list_h = FFBS_faster(hmm_num_draws, res['log_alpha'], res['log_gamma'], res['new_log_transmat'], np.tile(lengths, X.shape[2])) + 
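# The clone-weighted pseudobulk step above is a matrix product of per-spot counts with the
# posterior clone-membership matrix; a toy example with hypothetical numbers:
import numpy as np
single_counts = np.array([[5.0, 7.0],      # bin 1, two spots
                          [2.0, 4.0]])     # bin 2
posterior_clones = np.array([[0.9, 0.1],
                             [0.2, 0.8]])
pseudobulk = single_counts @ posterior_clones   # shape (n_bins, n_clones): [[5.9, 6.1], [2.6, 3.4]]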
# emission probability of each spot under the posterior distribution of hidden states of each clone + log_prob_rdr, log_prob_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([agg_single_X[:,0,:].flatten("F"), agg_single_X[:,1,:].flatten("F")]).T.reshape(-1,2,1), + agg_single_base_nb_mean.flatten("F").reshape(-1,1), res['new_log_mu'], res['new_alphas'], + agg_single_total_bb_RD.flatten("F").reshape(-1,1), res['new_p_binom'], res['new_taus'], + np.repeat(agg_single_tumor_prop, agg_single_X.shape[0]).reshape(-1,1)) + log_per_state_emission = log_prob_rdr + log_prob_baf + + emission_llf = np.zeros((agg_single_X.shape[2], n_clones)) + for s in range(agg_single_X.shape[2]): + this_log_emission = log_per_state_emission[:,(s*n_obs):(s*n_obs+n_obs), 0] + for c in range(n_clones): + sampled_h = list_h[:, (c*n_obs):(c*n_obs+n_obs)] + # emission probability + emission_llf[s,c] = this_log_emission[(sampled_h, np.arange(n_obs))].sum() + # take average + emission_llf[s,c] /= hmm_num_draws + + # save results + list_emission_llf.append(emission_llf) + list_log_mu.append(res['new_log_mu']) + list_p_binom.append(res['new_p_binom']) + list_cna_states.append( np.stack([res['pred_cnv'][(i*n_obs):(i*n_obs+n_obs)] for i in range(n_clones)]) ) + + # update last_log_mu, last_p_binom, last_alphas, last_taus + last_log_mu = res['new_log_mu'] + last_p_binom = res['new_p_binom'] + last_alphas = res['new_alphas'] + last_taus = res['new_taus'] + + ##### Infer clone labels using the estimated log emission probability ##### + posterior_clones, list_labels, list_potentials = posterior_distribution_clone_labels(emission_llf, log_clonesize, adjacency_mat, coords, spatial_weight, num_chains, burnin, platform=platform, temperature=temperature, fun=fun) + list_posterior_clones.append(posterior_clones) + + plot_posterior_clones_single(list_posterior_clones, coords, len(list_posterior_clones)-1, sample_ids) + plt.show() + # update log clone size + log_clonesize = np.mean(posterior_clones, axis=0) + log_clonesize = np.repeat( np.log(log_clonesize)[None,:], single_X.shape[2], axis=0 ) + + # update elbo + prior_log_prob = np.zeros(n_clones) + for c in range(n_clones): + sampled_h = list_h[:, (c*n_obs):(c*n_obs+n_obs)] + prior_log_prob[c] = compute_state_prior_prob_highdim(sampled_h, res['new_log_startprob'], res['new_log_transmat'], lengths).mean() + + eventual_llf = [compute_potential(emission_llf + log_clonesize, l, spatial_weight, adjacency_mat) + prior_log_prob.sum() for l in list_labels] + list_elbo.append( np.mean(eventual_llf) ) + + # compare the difference between the current and previous posterior_clones + if np.square(list_posterior_clones[-1] - list_posterior_clones[-2]).mean() < sampling_tol: + break + + ##### temperature annealing ##### + temperature = max(1.0, 0.95 * temperature) + # if r > 2: + # temperature = max(1.0, temperature-2) + + return list_posterior_clones, list_cna_states, list_emission_llf, list_log_mu, list_p_binom, list_elbo + + + +def infer_all_v2(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_label, n_states, + coords, adjacency_mat, tumorprop_threshold, spatial_weight, platform, max_iter_outer, num_chains, burnin, sampling_tol, temperature, + hmmclass, hmm_params, hmm_t, hmm_random_state, hmm_max_iter, hmm_tol, hmm_num_draws, fun=block_gibbs_sampling_labels, + smooth_mat=None, sample_ids=None, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, + fix_NB_dispersion=False, shared_NB_dispersion=True, 
fix_BB_dispersion=False, shared_BB_dispersion=True): + """ + initial_clone_label : one hot encoding of initial clones, or probability of each spot being in each clone. + """ + n_clones = initial_clone_label.shape[1] + n_obs, n_spots = single_total_bb_RD.shape + + # aggregated counts according to smooth_mat + if not smooth_mat is None: + agg_single_X = np.stack([ single_X[i,:,:] @ smooth_mat for i in range(single_X.shape[0]) ]) + agg_single_base_nb_mean = single_base_nb_mean @ smooth_mat + agg_single_total_bb_RD = single_total_bb_RD @ smooth_mat + agg_single_tumor_prop = (single_tumor_prop @ smooth_mat) / np.sum(smooth_mat.A, axis=0) + else: + # aggregated matrices are the same as their original ones + agg_single_X = single_X + agg_single_base_nb_mean = single_base_nb_mean + agg_single_total_bb_RD = single_total_bb_RD + agg_single_tumor_prop = single_tumor_prop + + # make a fake log_sitewise_transmat because some function calls have that parameter for formating purpose, but it's not actually used + log_sitewise_transmat = np.zeros(n_obs) + + # the initial posterior distribution of clone labels is a binary matrix derived from initial_clone_label + posterior_clones = initial_clone_label + log_clonesize = np.ones(n_clones) * np.log(1.0 / n_clones) + log_clonesize = np.repeat( log_clonesize[None,:], single_X.shape[2], axis=0 ) + + # initial emission parameters + last_log_mu = init_log_mu + last_alphas = init_alphas + last_p_binom = init_p_binom + last_taus = init_taus + + list_posterior_clones = [initial_clone_label] + list_cna_states = [] + list_emission_llf = [] + list_log_mu = [] + list_alphas = [] + list_p_binom = [] + list_taus = [] + list_log_startprob = [] + list_elbo = [] + + for r in range(max_iter_outer): + ##### Fit HMM using posterior_clones ##### + # aggregate into pseudobulk for each clone weighted by posterior_clones + X = (single_X[:,:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :]) + base_nb_mean = single_base_nb_mean[:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] + total_bb_RD = single_total_bb_RD[:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] + tumor_prop = single_tumor_prop[single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] / posterior_clones[single_tumor_prop>tumorprop_threshold, :].sum(axis=0) + + # initialize parameters + if (last_log_mu is None and "m" in hmm_params) or (last_p_binom is None and "p" in hmm_params): + tmp_log_mu, tmp_p_binom = initialization_by_gmm(n_states, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ + base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), hmm_params, random_state=hmm_random_state, in_log_space=False, only_minor=False) + + last_log_mu = tmp_log_mu if init_log_mu is None and "m" in hmm_params else None + last_p_binom = tmp_p_binom if init_p_binom is None and "p" in hmm_params else None + + # fit HMM + res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), n_states, \ + base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), np.repeat(tumor_prop, X.shape[0]).reshape(-1,1), \ + hmmclass=hmmclass, params=hmm_params, t=hmm_t, random_state=hmm_random_state, \ + fix_NB_dispersion=fix_NB_dispersion, 
shared_NB_dispersion=shared_NB_dispersion, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ + is_diag=True, init_log_mu=last_log_mu, init_p_binom=last_p_binom, init_alphas=last_alphas, init_taus=last_taus, max_iter=hmm_max_iter, tol=hmm_tol) + # estimate log emission probability for each spot belong to each clone based on the fitted HMM res + list_h = FFBS_faster(hmm_num_draws, res['log_alpha'], res['log_gamma'], res['new_log_transmat'], np.tile(lengths, X.shape[2])) + # emission probability of each spot under the posterior distribution of hidden states of each clone + log_prob_rdr, log_prob_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([agg_single_X[:,0,:].flatten("F"), agg_single_X[:,1,:].flatten("F")]).T.reshape(-1,2,1), + agg_single_base_nb_mean.flatten("F").reshape(-1,1), res['new_log_mu'], res['new_alphas'], + agg_single_total_bb_RD.flatten("F").reshape(-1,1), res['new_p_binom'], res['new_taus'], + np.repeat(agg_single_tumor_prop, agg_single_X.shape[0]).reshape(-1,1)) + log_per_state_emission = log_prob_rdr + log_prob_baf + + emission_llf = np.zeros((agg_single_X.shape[2], n_clones)) + for s in range(agg_single_X.shape[2]): + this_log_emission = log_per_state_emission[:,(s*n_obs):(s*n_obs+n_obs), 0] + for c in range(n_clones): + sampled_h = list_h[:, (c*n_obs):(c*n_obs+n_obs)] + # emission probability + emission_llf[s,c] = this_log_emission[(sampled_h, np.arange(n_obs))].sum() + # take average + emission_llf[s,c] /= hmm_num_draws + + # save results + list_emission_llf.append( emission_llf ) + list_log_mu.append(res['new_log_mu']) + list_alphas.append(res['new_alphas']) + list_p_binom.append(res['new_p_binom']) + list_taus.append(res['new_taus']) + list_log_startprob.append(res['new_log_startprob']) + list_cna_states.append( np.stack([res['pred_cnv'][(i*n_obs):(i*n_obs+n_obs)] for i in range(n_clones)]) ) + + # update last_log_mu, last_p_binom, last_alphas, last_taus + last_log_mu = res['new_log_mu'] + last_p_binom = res['new_p_binom'] + last_alphas = res['new_alphas'] + last_taus = res['new_taus'] + + ##### Infer clone labels using the estimated log emission probability ##### + posterior_clones, list_labels, list_potentials = posterior_distribution_clone_labels(emission_llf, log_clonesize, adjacency_mat, coords, spatial_weight, num_chains, burnin, platform=platform, temperature=temperature, fun=fun) + list_posterior_clones.append(posterior_clones) + + plot_posterior_clones_single(list_posterior_clones, coords, len(list_posterior_clones)-1, sample_ids) + plt.show() + # update log clone size + log_clonesize = np.mean(posterior_clones, axis=0) + log_clonesize = np.repeat( np.log(log_clonesize)[None,:], single_X.shape[2], axis=0 ) + + # update elbo + prior_log_prob = np.zeros(n_clones) + for c in range(n_clones): + sampled_h = list_h[:, (c*n_obs):(c*n_obs+n_obs)] + prior_log_prob[c] = compute_state_prior_prob_highdim(sampled_h, res['new_log_startprob'], res['new_log_transmat'], lengths).mean() + + # eventual_llf = [compute_potential(emission_llf + log_clonesize, l, spatial_weight, adjacency_mat) + prior_log_prob.sum() for l in list_labels] + eventual_llf = [compute_potential(emission_llf + log_clonesize, l, spatial_weight, adjacency_mat) for l in list_labels] + list_elbo.append( np.mean(eventual_llf) ) + + # compare the difference between the current and previous posterior_clones + if np.square(list_posterior_clones[-1] - list_posterior_clones[-2]).mean() < sampling_tol: + break + + ##### temperature annealing ##### + temperature 
= max(1.0, 0.95 * temperature) + # if r > 2: + # temperature = max(1.0, temperature-2) + + return list_posterior_clones, list_cna_states, list_emission_llf, list_log_mu, list_alphas, list_p_binom, list_taus, list_log_startprob, list_elbo, list_h, list_labels + + + +def plot_posterior_clones_single(list_posterior_clones, coords, idx, sample_ids=None): + """ + Scatterplot of the probability of each spot being in each clone for iteration idx. + Attributes + ---------- + list_posterior_clones : list of np.arrays, each one has size (num_spots, num_clones) + The posterior probability of each spot being in each clone at each iteration, output of infer_all. + coords : np.array, size (num_spots, 2) + The spatial coordinates of each spot. + idx : int + The iteration index to plot. + sample_ids : np.array, size (num_spots) + The sample id of each spot. + """ + shifted_coords = copy.copy(coords) + if sample_ids is not None: + offset = 0 + for s in np.sort(np.unique(sample_ids)): + min_x = np.min(shifted_coords[sample_ids==s, 0]) + shifted_coords[sample_ids==s, 0] = shifted_coords[sample_ids==s, 0] - min_x + offset + offset = np.max(shifted_coords[sample_ids==s, 0]) + 1 + + n_clones = list_posterior_clones[idx].shape[1] + fig, axes = plt.subplots(1, n_clones, figsize=(n_clones * 5*len(np.unique(sample_ids)), 5), facecolor='white', dpi=150) + for c in range(list_posterior_clones[idx].shape[1]): + axes[c].scatter(x=shifted_coords[:,0], y=-shifted_coords[:,1], c=list_posterior_clones[idx][:,c], cmap='magma_r', s=15, linewidth=0) + axes[c].set_title('Clone %d' % c) + norm = plt.Normalize(list_posterior_clones[idx][:,c].min(), list_posterior_clones[idx][:,c].max()) + axes[c].figure.colorbar( plt.cm.ScalarMappable(cmap='magma_r', norm=norm), ax=axes[c] ) + axes[c].axis('off') + fig.tight_layout() + return fig + + +def plot_posterior_clones_interactive(list_posterior_clones, coords, giffile, sample_ids=None, base_duration=500): + import io + import imageio + + shifted_coords = copy.copy(coords) + if sample_ids is not None: + offset = 0 + for s in np.sort(np.unique(sample_ids)): + min_x = np.min(shifted_coords[sample_ids==s, 0]) + shifted_coords[sample_ids==s, 0] = shifted_coords[sample_ids==s, 0] - min_x + offset + offset = np.max(shifted_coords[sample_ids==s, 0]) + 1 + + n_clones = list_posterior_clones[0].shape[1] + # List to store images and durations + images = [] + # durations = base_duration * np.arange(len(list_posterior_clones)) # Different durations for each frame + + # Generate scatter plots and store them in memory + for idx in range(len(list_posterior_clones)): + fig, axes = plt.subplots(1, n_clones, figsize=(n_clones * 5*len(np.unique(sample_ids)), 5), dpi=150) + for c in range(list_posterior_clones[idx].shape[1]): + axes[c].scatter(x=shifted_coords[:,0], y=-shifted_coords[:,1], c=list_posterior_clones[idx][:,c], cmap='magma_r', s=15, linewidth=0) + axes[c].set_title('Clone %d' % c) + norm = plt.Normalize(list_posterior_clones[idx][:,c].min(), list_posterior_clones[idx][:,c].max()) + axes[c].figure.colorbar( plt.cm.ScalarMappable(cmap='magma_r', norm=norm), ax=axes[c] ) + axes[c].axis('off') + fig.suptitle(f"iteration {idx}") + fig.tight_layout() + buf = io.BytesIO() + plt.savefig(buf, format='png') + buf.seek(0) + images.append(imageio.imread(buf)) + buf.close() + plt.close() + + # Create a GIF from the in-memory images with a fixed per-frame duration + imageio.mimsave(giffile, images, duration=base_duration) + + +# def plot_posterior_clones_interactive(list_posterior_clones, coords, 
sample_ids=None): +# import mpl_interactions.ipyplot as iplt + +# def c_func(x, y, ite): +# return list_posterior_clones[ite][:,c] + +# shifted_coords = copy.copy(coords) +# if not sample_ids is None: +# offset = 0 +# for s in np.sort(np.unique(sample_ids)): +# min_x = np.min(shifted_coords[sample_ids==s, 0]) +# shifted_coords[sample_ids==s, 0] = shifted_coords[sample_ids==s, 0] - min_x + offset +# offset = np.max(shifted_coords[sample_ids==s, 0]) + 1 + +# # make a gif of the posterior probability of each clone in multiple figure panels +# n_clones = list_posterior_clones[0].shape[1] +# fig, axes = plt.subplots(1, n_clones, figsize=(n_clones * 5*len(np.unique(sample_ids)), 5), facecolor='white', dpi=150) +# for c in range(list_posterior_clones[0].shape[1]): +# _ = iplt.scatter(x=shifted_coords[:,0], y=-shifted_coords[:,0], ite=np.arange(len(list_posterior_clones)), c=c_func, cmap='magma_r', s=15, linewidth=0, ax=axes[c]) +# fig.tight_layout() +# return fig + + +def plot_posterior_baf_cnstate_single(single_X, single_total_bb_RD, single_tumor_prop, lengths, tumorprop_threshold, list_posterior_clones, list_cna_states, list_p_binom, idx, unique_chrs): + """ + Scatterplot BAF along the genome colored by the hidden states at iteration idx. + """ + posterior_clones = list_posterior_clones[idx] + X = (single_X[:,:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :]) + total_bb_RD = single_total_bb_RD[:, single_tumor_prop>tumorprop_threshold] @ posterior_clones[single_tumor_prop>tumorprop_threshold, :] + + n_clones = list_posterior_clones[idx].shape[1] + n_obs = single_X.shape[0] + + fig, axes = plt.subplots(n_clones, 1, figsize=(20,1.8*n_clones), sharex=True, sharey=True, dpi=150, facecolor="white") + for s in np.arange(n_clones): + cid = s + this_pred = list_cna_states[idx][s,:] + segments, labs = get_intervals(this_pred) + seaborn.scatterplot(x=np.arange(X[:,1,cid].shape[0]), y=X[:,1,cid]/total_bb_RD[:,cid], \ + hue=pd.Categorical(this_pred, categories=np.arange(20), ordered=True), palette="tab10", s=15, linewidth=0, alpha=0.8, legend=False, ax=axes[s]) + axes[s].set_ylabel(f"clone {cid}\nphased AF") + axes[s].set_ylim([-0.1, 1.1]) + axes[s].set_yticks([0, 0.5, 1]) + axes[s].set_xticks([]) + for i, seg in enumerate(segments): + axes[s].plot(seg, [list_p_binom[idx][labs[i],0], list_p_binom[idx][labs[i],0]], c="black", linewidth=3) + axes[s].plot(seg, [1-list_p_binom[idx][labs[i],0], 1-list_p_binom[idx][labs[i],0]], c="black", linewidth=3) + + for i in range(len(lengths)): + median_len = (np.sum(lengths[:(i)]) + np.sum(lengths[:(i+1)])) / 2 + axes[-1].text(median_len, -0.21, unique_chrs[i]) + for k in range(n_clones): + axes[k].axvline(x=np.sum(lengths[:(i)]), c="grey", linewidth=1) + + fig.tight_layout() + fig.subplots_adjust(hspace=0) + return fig \ No newline at end of file diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 7e3cbbb..aa8e2c1 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -78,6 +78,16 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): **kwds) +def fit_weighted_NegativeBinomial(y, features, weights, exposure, start_log_mu=None, start_alpha=None): + model = Weighted_NegativeBinomial(y, features, weights=weights, exposure=exposure) + if start_log_mu is None: + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + else: + res = model.fit(disp=0, maxiter=1500, 
start_params=np.append(start_log_mu, start_alpha), xtol=1e-4, ftol=1e-4) + n_llf = model.nloglikeobs(res.params) + return res.params, n_llf + + class Weighted_NegativeBinomial_mix(GenericLikelihoodModel): def __init__(self, endog, exog, weights, exposure, tumor_prop, seed=0, **kwds): super(Weighted_NegativeBinomial_mix, self).__init__(endog, exog, **kwds) @@ -106,6 +116,16 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): **kwds) +def fit_weighted_NegativeBinomial_mix(y, features, weights, exposure, tumor_prop, start_log_mu=None, start_alpha=None): + model = Weighted_NegativeBinomial_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tumor_prop) + if start_log_mu is None: + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + else: + res = model.fit(disp=0, maxiter=1500, start_params=np.append(start_log_mu, start_alpha), xtol=1e-4, ftol=1e-4) + n_llf = model.nloglikeobs(res.params) + return res.params, n_llf + + class Weighted_BetaBinom(GenericLikelihoodModel): """ Beta-binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. @@ -149,6 +169,16 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): **kwds) +def fit_weighted_BetaBinomial(y, features, weights, exposure, start_p_binom=None, start_tau=None): + model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) + if start_p_binom is None: + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + else: + res = model.fit(disp=0, maxiter=1500, start_params=np.append(start_p_binom, start_tau), xtol=1e-4, ftol=1e-4) + n_llf = model.nloglikeobs(res.params) + return res.params, n_llf + + class Weighted_BetaBinom_mix(GenericLikelihoodModel): def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds): super(Weighted_BetaBinom_mix, self).__init__(endog, exog, **kwds) @@ -175,6 +205,16 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): **kwds) +def fit_weighted_BetaBinomial_mix(y, features, weights, exposure, tumor_prop, start_p_binom=None, start_tau=None): + model = Weighted_BetaBinom_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tumor_prop) + if start_p_binom is None: + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + else: + res = model.fit(disp=0, maxiter=1500, start_params=np.append(start_p_binom, start_tau), xtol=1e-4, ftol=1e-4) + n_llf = model.nloglikeobs(res.params) + return res.params, n_llf + + class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): def __init__(self, endog, exog, tau, weights, exposure, **kwds): super(Weighted_BetaBinom_fixdispersion, self).__init__(endog, exog, **kwds) diff --git a/src/calicost/utils_distribution_fitting_gpu.py b/src/calicost/utils_distribution_fitting_gpu.py new file mode 100644 index 0000000..fbb5544 --- /dev/null +++ b/src/calicost/utils_distribution_fitting_gpu.py @@ -0,0 +1,218 @@ +import numpy as np +import scipy + +try: + import torch + import torch.nn as nn + import torch.nn.functional as F + device = 'cuda' if torch.cuda.is_available() else 'cpu' +except ImportError: + device = 'cpu' + + +def nb_llf_allow_float(nb_mean, alphas, y): + """ + Compute the log likelihood of negative binomial distribution allow float values in y. + + Attributes: + nb_mean: torch.tensor. the mean of negative binomial distribution. + alphas: torch.tensor. the inverse-dispersion parameter of negative binomial distribution. variance = mean + alphas * mean^2. + y: torch.tensor. the observed values. 
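    Examples
    --------
    A minimal toy sanity check (the numbers below are made up; assumes ``torch``
    and ``scipy`` are importable). For integer ``y`` the formula reduces to the
    standard negative binomial log-pmf under the conversion ``n = 1/alphas`` and
    ``p = 1/(1 + nb_mean * alphas)``:

    >>> import torch
    >>> from scipy.stats import nbinom
    >>> nb_mean = torch.tensor([2.0]); alphas = torch.tensor([0.5]); y = torch.tensor([3.0])
    >>> ours = nb_llf_allow_float(nb_mean, alphas, y)
    >>> ref = nbinom.logpmf(3, n=1.0 / 0.5, p=1.0 / (1.0 + 2.0 * 0.5))
    >>> bool(torch.isclose(ours, torch.tensor(ref, dtype=torch.float32)))
    True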
+ """ + llf = (y + 1.0/alphas).lgamma() - (1.0/alphas).lgamma() - (y+1).lgamma() - 1.0/alphas*torch.log(1 + nb_mean*alphas) - y * torch.log(1 + nb_mean*alphas) + y * torch.log(alphas) + y * torch.log(nb_mean) + return llf + + +def fit_weighted_NegativeBinomial_gpu(y, features, weights, exposure, start_log_mu=None, start_alpha=None, n_epochs=1000): + # convert to torch + y = torch.from_numpy(y).to(torch.float32).to(device) + features = torch.from_numpy(features).to(torch.float32).to(device) + weights = torch.from_numpy(weights).to(torch.float32).to(device) + exposure = torch.from_numpy(exposure).to(torch.float32).to(device) + + # train + if start_log_mu is None: + log_mu = nn.Parameter(torch.zeros(features.shape[1], device=device), requires_grad=True) + else: + log_mu = nn.Parameter(torch.from_numpy(start_log_mu.flatten()).to(torch.float32).to(device), requires_grad=True) + if start_alpha is None: + log_disp = nn.Parameter(torch.log(0.1 * torch.ones(1, device=device)), requires_grad=True) + else: + log_disp = nn.Parameter(torch.log(start_alpha * torch.ones(1, device=device)), requires_grad=True) + + loss_list = [] + optimizer = torch.optim.AdamW( [log_mu, log_disp], lr=0.1) + scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[350,700], gamma=0.1) + for epoch in range(int(n_epochs / 20)): + small_loss = [] + for small_epoch in range(20): + optimizer.zero_grad() + # negative binomial llf + nb_mean = torch.exp(features @ log_mu) * exposure + nb_var = nb_mean + nb_mean**2 * torch.exp(log_disp) + # convert parameters + p = 1.0 / (1.0 + nb_mean * torch.exp(log_disp)) + n = 1.0 / torch.exp(log_disp) + llf = torch.distributions.negative_binomial.NegativeBinomial(n, 1-p).log_prob(y) + loss = -torch.matmul(llf, weights) + small_loss.append( loss.item() ) + loss.backward() + optimizer.step() + scheduler.step() + loss_list.append( np.mean(small_loss) ) + # decide to terminate + if len(loss_list) > 2 and np.abs(loss_list[-1] - np.min(loss_list[:-1])) < 1e-6 * len(y): + break + + res = log_mu.detach().cpu().numpy().reshape(-1,1) + res = np.append(res, torch.exp(log_disp).detach().cpu().numpy()[0]) + return res, loss_list[-1] + + +def fit_weighted_NegativeBinomial_mix_gpu(y, features, weights, exposure, tumor_prop, start_log_mu=None, start_alpha=None, n_epochs=1000): + # convert to torch + y = torch.from_numpy(y).to(torch.float32).to(device) + features = torch.from_numpy(features).to(torch.float32).to(device) + weights = torch.from_numpy(weights).to(torch.float32).to(device) + exposure = torch.from_numpy(exposure).to(torch.float32).to(device) + tumor_prop = torch.from_numpy(tumor_prop).to(torch.float32).to(device) + + # train + if start_log_mu is None: + log_mu = nn.Parameter(torch.zeros(features.shape[1], device=device), requires_grad=True) + else: + log_mu = nn.Parameter(torch.from_numpy(start_log_mu.flatten()).to(torch.float32).to(device), requires_grad=True) + if start_alpha is None: + log_disp = nn.Parameter(torch.log(0.1 * torch.ones(1, device=device)), requires_grad=True) + else: + log_disp = nn.Parameter(torch.log(start_alpha * torch.ones(1, device=device)), requires_grad=True) + + loss_list = [] + optimizer = torch.optim.AdamW( [log_mu, log_disp], lr=0.1) + scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[350,700], gamma=0.1) + for epoch in range(int(n_epochs / 20)): + small_loss = [] + for small_epoch in range(20): + optimizer.zero_grad() + # negative binomial llf + nb_mean = exposure * (torch.exp(features @ log_mu) * tumor_prop + (1 - tumor_prop)) + # 
nb_var = nb_mean + nb_mean**2 * torch.exp(log_disp) + # convert parameters + p = 1.0 / (1.0 + nb_mean * torch.exp(log_disp)) + n = 1.0 / torch.exp(log_disp) + llf = torch.distributions.negative_binomial.NegativeBinomial(n, 1-p).log_prob(y) + loss = -torch.matmul(llf, weights) + small_loss.append( loss.item() ) + loss.backward() + optimizer.step() + scheduler.step() + loss_list.append( np.mean(small_loss) ) + # decide to terminate + if len(loss_list) > 2 and np.abs(loss_list[-1] - np.min(loss_list[:-1])) < 1e-6 * len(y): + break + + res = log_mu.detach().cpu().numpy().reshape(-1,1) + res = np.append(res, torch.exp(log_disp).detach().cpu().numpy()[0]) + return res, loss_list[-1] + + + +def fit_weighted_BetaBinomial_gpu(y, features, weights, exposure, start_p_binom=None, start_tau=None, min_binom_prob=0.01, max_binom_prob=0.99, + n_epochs=1000, MIN_TAUS = np.log(5), MAX_TAUS = np.log(1000)): + y = torch.from_numpy(y).to(torch.float32).to(device) + features = torch.from_numpy(features).to(torch.float32).to(device) + weights = torch.from_numpy(weights).to(torch.float32).to(device) + exposure = torch.from_numpy(exposure).to(torch.float32).to(device) + + # initialize training parameters + if start_p_binom is None: + logistic_p = nn.Parameter(torch.logit(0.3 * torch.ones(features.shape[1], device=device)), requires_grad=True) + else: + logistic_p = nn.Parameter(torch.logit(torch.from_numpy(start_p_binom.flatten()).to(torch.float32).to(device)), requires_grad=True) + if start_tau is None: + log_taus = nn.Parameter(torch.log(20 * torch.ones(1, device=device)), requires_grad=True) + else: + log_taus = nn.Parameter(torch.log(start_tau * torch.ones(1, device=device)), requires_grad=True) + + MIN_LOGISTIC_P = scipy.special.logit(min_binom_prob) + MAX_LOGISTIC_P = scipy.special.logit(max_binom_prob) + loss_list = [] + optimizer = torch.optim.AdamW( [logistic_p, log_taus], lr=5e-2) + scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[350,700], gamma=0.1) + for epoch in range(int(n_epochs / 20)): + small_loss = [] + for small_epoch in range(20): + optimizer.zero_grad() + # beta binomial llf + p = F.sigmoid(logistic_p) + d_choose_y = (exposure + 1).lgamma() - (y + 1).lgamma() - ((exposure - y) + 1).lgamma() + logbeta_numer = (y + (features @ p) * torch.exp(log_taus)).lgamma() + (exposure - y + (1 - features @ p) * torch.exp(log_taus)).lgamma() - (exposure + torch.exp(log_taus)).lgamma() + logbeta_denom = ((features @ p) * torch.exp(log_taus)).lgamma() + ((1 - features @ p) * torch.exp(log_taus)).lgamma() - torch.exp(log_taus).lgamma() + llf = d_choose_y + logbeta_numer - logbeta_denom + loss = -torch.matmul(llf, weights) + small_loss.append( loss.item() ) + loss.backward() + log_taus.data = torch.clamp(log_taus.data, min=MIN_TAUS, max=MAX_TAUS) + logistic_p.data = torch.clamp(logistic_p.data, min=MIN_LOGISTIC_P, max=MAX_LOGISTIC_P) + optimizer.step() + scheduler.step() + loss_list.append( np.mean(small_loss) ) + if len(loss_list) > 2 and np.abs(loss_list[-1] - np.min(loss_list[:-1])) < 1e-6 * len(y): + break + + res = F.sigmoid(logistic_p).detach().cpu().numpy().reshape(-1,1) + res = np.append(res, torch.exp(log_taus).detach().cpu().numpy()[0]) + return res, loss_list[-1] + + +def fit_weighted_BetaBinomial_mix_gpu(y, features, weights, exposure, tumor_prop, start_p_binom=None, start_tau=None, min_binom_prob=0.01, max_binom_prob=0.99, + n_epochs=1000, MIN_TAUS = np.log(5), MAX_TAUS = np.log(1000)): + y = torch.from_numpy(y).to(torch.float32).to(device) + features = 
torch.from_numpy(features).to(torch.float32).to(device) + weights = torch.from_numpy(weights).to(torch.float32).to(device) + exposure = torch.from_numpy(exposure).to(torch.float32).to(device) + tumor_prop = torch.from_numpy(tumor_prop).to(torch.float32).to(device) + + # initialize training parameters + if start_p_binom is None: + logistic_p = nn.Parameter(torch.logit(0.3 * torch.ones(features.shape[1], device=device)), requires_grad=True) + else: + logistic_p = nn.Parameter(torch.logit(torch.from_numpy(start_p_binom.flatten()).to(torch.float32).to(device)), requires_grad=True) + if start_tau is None: + log_taus = nn.Parameter(torch.log(20 * torch.ones(1, device=device)), requires_grad=True) + else: + log_taus = nn.Parameter(torch.log(start_tau * torch.ones(1, device=device)), requires_grad=True) + + MIN_LOGISTIC_P = scipy.special.logit(min_binom_prob) + MAX_LOGISTIC_P = scipy.special.logit(max_binom_prob) + loss_list = [] + optimizer = torch.optim.AdamW( [logistic_p, log_taus], lr=5e-2) + scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[350,700], gamma=0.1) + for epoch in range(int(n_epochs / 20)): + small_loss = [] + for small_epoch in range(20): + optimizer.zero_grad() + # beta binomial llf + # pure-tumor BAF + p_tmp = F.sigmoid(logistic_p) + # shift BAF to 0.5 by weighted-tumor proportion for each genomic segment (weighted by total copy) + p = ((features @ p_tmp) * tumor_prop + 0.5 * (1 - tumor_prop)) # a vector of length num_genomic_segments + d_choose_y = (exposure + 1).lgamma() - (y + 1).lgamma() - ((exposure - y) + 1).lgamma() + logbeta_numer = (y + p * torch.exp(log_taus)).lgamma() + (exposure - y + (1 - p) * torch.exp(log_taus)).lgamma() - (exposure + torch.exp(log_taus)).lgamma() + logbeta_denom = (p * torch.exp(log_taus)).lgamma() + ((1 - p) * torch.exp(log_taus)).lgamma() - torch.exp(log_taus).lgamma() + llf = d_choose_y + logbeta_numer - logbeta_denom + # weight by weights, and compute gradient + loss = -torch.matmul(llf, weights) + small_loss.append( loss.item() ) + loss.backward() + log_taus.data = torch.clamp(log_taus.data, min=MIN_TAUS, max=MAX_TAUS) + logistic_p.data = torch.clamp(logistic_p.data, min=MIN_LOGISTIC_P, max=MAX_LOGISTIC_P) + optimizer.step() + scheduler.step() + loss_list.append( np.mean(small_loss) ) + if len(loss_list) > 2 and np.abs(loss_list[-1] - np.min(loss_list[:-1])) < 1e-6 * len(y): + break + + res = F.sigmoid(logistic_p).detach().cpu().numpy().reshape(-1,1) + res = np.append(res, torch.exp(log_taus).detach().cpu().numpy()[0]) + return res, loss_list[-1] \ No newline at end of file diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 9c22aaf..2a243d3 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -7,6 +7,141 @@ from calicost.utils_distribution_fitting import * +############################################################# +# Sampling hidden state sequences from posterior distribution +############################################################# + +def multinoulli_sampler(log_probs): + # assume log_probs is already normalized to have logsumexp = 0 + cdf = np.cumsum(np.exp(log_probs)) + cdf[-1] = 1.1 # to avoid numerical issue + u = np.random.uniform() + return np.argmax(cdf > u) + + +def FFBS(num_sequences, log_alpha, log_gamma, log_transmat, lengths): + """ + Attributes + ---------- + num_sequences: number of hidden sequences to draw. + log_alpha: n_states * n_observations per chromosome. Log of forward DP table. alpha[i,t] = P(o_1, ..., o_t, q_t=i | lambda). 
+ log_gamma: n_states * n_observations per chromosome. Log of posterior probability. gamma[i,t] = P(q_t=i | o_1, ..., o_T, lambda + log_transmat: n_states * n_states. Transition probability after log transformation. + log_startprob: n_states. Start probability after log transformation. + lengths: sum of lengths = n_observations. Each length defines a independent HMM sequence + + Returns + ------- + list_h: list of hidden state sequences. The length of list is num_sequences. + """ + def pre_compute_S(log_alpha, log_transmat, lengths): + n_states = log_alpha.shape[0] + n_obs = log_alpha.shape[1] + log_S = np.zeros((n_obs, n_states, n_states)) + cumlen = 0 + for le in lengths: + for t in np.arange(le): + for i in np.arange(n_states): + for j in np.arange(n_states): + if t == le - 1: + log_S[cumlen+t,i,j] = log_gamma[i,cumlen+t] + else: + log_S[cumlen+t,i,j] = log_alpha[i,cumlen+t] + log_transmat[i,j] + cumlen += le + # normalize + log_S = log_S - scipy.special.logsumexp(log_S, axis=1, keepdims=True) + return log_S + # + log_S = pre_compute_S(log_alpha, log_transmat, lengths) + # sample hidden state sequences + list_h = [] + for i in trange(num_sequences): + this_h = [] + for t in np.arange(np.sum(lengths)-1, -1, -1): + if t == np.sum(lengths)-1: + this_h.append( multinoulli_sampler(log_S[t, :, 0]) ) + else: + this_h.append( multinoulli_sampler(log_S[t, :, this_h[-1] ]) ) + list_h.append( np.array(this_h[::-1]) ) + return list_h + + +def multinoulli_sampler_high_dimensional(log_probs): + """ + log_probs: n_obs * n_states. Log probability. Assume log_probs[i,:] is normalized to have logsumexp = 0 + """ + cdf = np.cumsum(np.exp(log_probs), axis=1) + cdf[:, -1] = 1.1 # to avoid numerical issue + u = np.random.uniform(size=log_probs.shape[0]) + return np.sum(cdf <= u.reshape(-1,1), axis=1) + + +def FFBS_faster(num_sequences, log_alpha, log_gamma, log_transmat, lengths): + """ + Attributes + ---------- + num_sequences: number of hidden sequences to draw. + log_alpha: n_states * n_observations per chromosome. Log of forward DP table. alpha[i,t] = P(o_1, ..., o_t, q_t=i | lambda). + log_gamma: n_states * n_observations per chromosome. Log of posterior probability. gamma[i,t] = P(q_t=i | o_1, ..., o_T, lambda + log_transmat: n_states * n_states. Transition probability after log transformation. + log_startprob: n_states. Start probability after log transformation. + lengths: sum of lengths = n_observations. Each length defines a independent HMM sequence + + Returns + ------- + list_h: list of hidden state sequences. The length of list is num_sequences. 
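    Examples
    --------
    Illustrative toy usage for a single chromosome with 2 states; the forward and
    posterior tables below are arbitrary made-up values, so only the array shapes
    matter for this sketch. In the pipeline the inputs come from a fitted HMM, e.g.
    FFBS_faster(hmm_num_draws, res['log_alpha'], res['log_gamma'],
    res['new_log_transmat'], lengths).

    >>> import numpy as np
    >>> n_states, n_obs = 2, 5
    >>> rng = np.random.default_rng(0)
    >>> log_alpha = np.log(rng.dirichlet(np.ones(n_states), size=n_obs).T)
    >>> log_gamma = np.log(rng.dirichlet(np.ones(n_states), size=n_obs).T)
    >>> log_transmat = np.log(np.array([[0.9, 0.1], [0.1, 0.9]]))
    >>> h = FFBS_faster(100, log_alpha, log_gamma, log_transmat, np.array([n_obs]))
    >>> h.shape  # one sampled hidden-state sequence per row
    (100, 5)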
+ """ + def pre_compute_S(log_alpha, log_transmat, lengths): + n_states = log_alpha.shape[0] + n_obs = log_alpha.shape[1] + log_S = np.zeros((n_obs, n_states, n_states)) + cumlen = 0 + for le in lengths: + for t in np.arange(le): + for i in np.arange(n_states): + for j in np.arange(n_states): + if t == le - 1: + log_S[cumlen+t,i,j] = log_gamma[i,cumlen+t] + else: + log_S[cumlen+t,i,j] = log_alpha[i,cumlen+t] + log_transmat[i,j] + cumlen += le + # normalize + log_S = log_S - scipy.special.logsumexp(log_S, axis=1, keepdims=True) + return log_S + # + log_S = pre_compute_S(log_alpha, log_transmat, lengths) + # sample hidden state sequences + list_h = [] + for t in np.arange(np.sum(lengths)-1, -1, -1): + if t == np.sum(lengths)-1: + list_h.append( multinoulli_sampler_high_dimensional(log_S[t, :, np.zeros(num_sequences,dtype=int)]) ) + else: + list_h.append( multinoulli_sampler_high_dimensional(log_S[t, :, list_h[-1] ]) ) + list_h = np.array(list_h).T + # reverse along the second dimension + list_h = list_h[:,::-1] + return list_h + + +def compute_state_prior_prob_highdim(h, log_startprob, log_transmat, lengths): + """ + Attributes + ---------- + h: hidden state sequence of size n_sequences * n_obs. + log_startprob: n_states. Start probability after log transformation. + log_transmat: n_states * n_states. Transition probability after log transformation. + lengths: sum of lengths = n_observations. Each length defines a independent HMM sequence. + """ + num_sequences = h.shape[0] + logprob = np.zeros(num_sequences) + cumlen = 0 + for le in lengths: + logprob += log_startprob[h[:, cumlen]] + logprob += log_transmat[ (h[:, cumlen:(cumlen+le-1)], h[:, (cumlen+1):(cumlen+le)]) ].sum(axis=1) + cumlen += le + return logprob + + @njit def np_max_ax_squeeze(arr, axis=0): assert arr.ndim == 2 @@ -144,7 +279,7 @@ def construct_unique_matrix(obs_count, total_count): if total_count.dtype == int: pairs = np.unique( np.vstack([obs_count[:,s], total_count[:,s]]).T, axis=0 ) else: - pairs = np.unique( np.vstack([obs_count[:,s], total_count[:,s]]).T.round(decimals=4), axis=0 ) + pairs = np.unique( np.vstack([obs_count[:,s].round(decimals=4), total_count[:,s]]).T.round(decimals=4), axis=0 ) unique_values.append( pairs ) pair_index = {(pairs[i,0], pairs[i,1]):i for i in range(pairs.shape[0])} # construct mapping matrix @@ -154,7 +289,7 @@ def construct_unique_matrix(obs_count, total_count): if total_count.dtype == int: tmpidx = pair_index[(obs_count[i,s], total_count[i,s])] else: - tmpidx = pair_index[(obs_count[i,s], total_count[i,s].round(decimals=4))] + tmpidx = pair_index[(obs_count[i,s].round(decimals=4), total_count[i,s].round(decimals=4))] mat_col[i] = tmpidx mapping_matrices.append( scipy.sparse.csr_matrix((np.ones(len(mat_row)), (mat_row, mat_col) )) ) return unique_values, mapping_matrices @@ -370,7 +505,7 @@ def update_transition_sitewise(log_xi, is_diag=False): def update_emission_params_nb_sitewise_uniqvalues(unique_values, mapping_matrices, log_gamma, base_nb_mean, alphas, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2, min_estep_weight=0.1): + fun=fit_weighted_NegativeBinomial, start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2, min_estep_weight=0.1): """ Attributes ---------- @@ -410,18 +545,21 @@ def update_emission_params_nb_sitewise_uniqvalues(unique_values, mapping_matrice tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] 
for i in range(n_states): - model = Weighted_NegativeBinomial(unique_values[s][idx_nonzero,0], \ + res, n_llf = fun(unique_values[s][idx_nonzero,0], \ np.ones(len(idx_nonzero)).reshape(-1,1), \ weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1], \ - penalty=0) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] - new_alphas[i, s] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + exposure=unique_values[s][idx_nonzero,1], + start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + new_log_mu[i, s] = res[0] + new_alphas[i, s] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(unique_values[s][idx_nonzero,0], \ + # np.ones(len(idx_nonzero)).reshape(-1,1), \ + # weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero], \ + # exposure=unique_values[s][idx_nonzero,1], \ + # start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + # new_log_mu[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_alphas[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -449,30 +587,33 @@ def update_emission_params_nb_sitewise_uniqvalues(unique_values, mapping_matrice y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - model = Weighted_NegativeBinomial(y, features, weights=weights, exposure=exposure) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, + start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_alpha=alphas[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_log_mu[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_alphas[:,:] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, + # start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # start_alpha=alphas[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) 
) + # new_log_mu[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_alphas[:,:] = res2[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas def update_emission_params_nb_sitewise_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, base_nb_mean, alphas, tumor_prop, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): + fun=fit_weighted_NegativeBinomial_mix, start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): """ Attributes ---------- @@ -513,18 +654,20 @@ def update_emission_params_nb_sitewise_uniqvalues_mix(unique_values, mapping_mat idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] for i in range(n_states): this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] - model = Weighted_NegativeBinomial_mix(unique_values[s][idx_nonzero,0], \ + res, n_llf = fun(unique_values[s][idx_nonzero,0], \ np.ones(len(idx_nonzero)).reshape(-1,1), \ weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero], exposure=unique_values[s][idx_nonzero,1], \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s], penalty=0) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] - new_alphas[i, s] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + tumor_prop=this_tp, start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + new_log_mu[i, s] = res[0] + new_alphas[i, s] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(unique_values[s][idx_nonzero,0], \ + # np.ones(len(idx_nonzero)).reshape(-1,1), \ + # weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero], exposure=unique_values[s][idx_nonzero,1], \ + # tumor_prop=this_tp, + # start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + # new_log_mu[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_alphas[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -558,30 +701,33 @@ def update_emission_params_nb_sitewise_uniqvalues_mix(unique_values, mapping_mat weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_NegativeBinomial_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp, penalty=0) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp, + start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_alpha=alphas[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for 
s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_log_mu[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_alphas[:,:] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp, + # start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # start_alpha=alphas[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + # new_log_mu[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_alphas[:,:] = res2[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas def update_emission_params_bb_sitewise_uniqvalues(unique_values, mapping_matrices, log_gamma, total_bb_RD, taus, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ + fun=fit_weighted_BetaBinomial, start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): """ Attributes @@ -626,17 +772,21 @@ def update_emission_params_bb_sitewise_uniqvalues(unique_values, mapping_matrice for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) if np.sum(tmp[i,idx_nonzero]) + np.sum(tmp[i+n_states,idx_nonzero]) >= 0.1: - model = Weighted_BetaBinom(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ + res, n_llf = fun(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ np.ones(2*len(idx_nonzero)).reshape(-1,1), \ weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ - exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]) ) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] - new_taus[i, s] = res.params[-1] - if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]), + start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s] ) + new_p_binom[i, s] = res[0] + new_taus[i, s] = res[-1] + # if not (start_p_binom is None): + # res2, n_llf2 = fun(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ + # np.ones(2*len(idx_nonzero)).reshape(-1,1), \ + # weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ + # 
exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]), \ + # start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s]) + # new_p_binom[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_taus[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -664,30 +814,33 @@ def update_emission_params_bb_sitewise_uniqvalues(unique_values, mapping_matrice y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, + start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_tau=taus[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_p_binom[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] - if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_p_binom[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_taus[:,:] = res[-1] + # if not (start_p_binom is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, + # start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # start_tau=taus[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + # new_p_binom[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_taus[:,:] = res2[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob return new_p_binom, new_taus def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, total_bb_RD, taus, tumor_prop, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ + fun=fit_weighted_BetaBinomial_mix, start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): """ Attributes @@ -738,19 +891,22 @@ def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_mat if np.sum(tmp[i,idx_nonzero]) + np.sum(tmp[i+n_states,idx_nonzero]) >= 0.1: this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] assert np.all(this_tp < 1+1e-4) - model = Weighted_BetaBinom_mix(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ + res, n_llf = 
fun(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ np.ones(2*len(idx_nonzero)).reshape(-1,1), \ weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]),\ - tumor_prop=this_tp) - # tumor_prop=tumor_prop ) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] - new_taus[i, s] = res.params[-1] - if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + tumor_prop=this_tp, start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s]) + new_p_binom[i, s] = res[0] + new_taus[i, s] = res[-1] + # if not (start_p_binom is None): + # res2, n_llf2 = fun(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ + # np.ones(2*len(idx_nonzero)).reshape(-1,1), \ + # weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ + # exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]),\ + # tumor_prop=this_tp, + # start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s]) + # new_p_binom[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_taus[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -784,23 +940,26 @@ def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_mat weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_BetaBinom_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp, + start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_tau=taus[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_p_binom[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] - if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_p_binom[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_taus[:,:] = res[-1] + # if not (start_p_binom is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp, + # start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # 
start_tau=taus[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + # new_p_binom[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_taus[:,:] = res2[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob return new_p_binom, new_taus @@ -867,7 +1026,7 @@ def update_transition_nophasing(log_xi, is_diag=False): def update_emission_params_nb_nophasing_uniqvalues(unique_values, mapping_matrices, log_gamma, alphas, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): + fun=fit_weighted_NegativeBinomial, start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): """ Attributes ---------- @@ -907,18 +1066,21 @@ def update_emission_params_nb_nophasing_uniqvalues(unique_values, mapping_matric tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] for i in range(n_states): - model = Weighted_NegativeBinomial(unique_values[s][idx_nonzero,0], \ + res, n_llf = fun(unique_values[s][idx_nonzero,0], \ np.ones(len(idx_nonzero)).reshape(-1,1), \ weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1], \ - penalty=0) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] - new_alphas[i, s] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + exposure=unique_values[s][idx_nonzero,1], + start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + new_log_mu[i, s] = res[0] + new_alphas[i, s] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(unique_values[s][idx_nonzero,0], \ + # np.ones(len(idx_nonzero)).reshape(-1,1), \ + # weights=tmp[i,idx_nonzero], \ + # exposure=unique_values[s][idx_nonzero,1], \ + # start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + # new_log_mu[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_alphas[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -946,30 +1108,33 @@ def update_emission_params_nb_nophasing_uniqvalues(unique_values, mapping_matric y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - model = Weighted_NegativeBinomial(y, features, weights=weights, exposure=exposure) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, + start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_alpha=alphas[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, 
start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_log_mu[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_alphas[:,:] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, + # start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # start_alpha=alphas[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + # new_log_mu[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_alphas[:,:] = res2[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas def update_emission_params_nb_nophasing_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, alphas, tumor_prop, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): + fun=fit_weighted_NegativeBinomial_mix, start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): """ Attributes ---------- @@ -1010,18 +1175,20 @@ def update_emission_params_nb_nophasing_uniqvalues_mix(unique_values, mapping_ma idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] for i in range(n_states): this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] - model = Weighted_NegativeBinomial_mix(unique_values[s][idx_nonzero,0], \ + res, n_llf = fun(unique_values[s][idx_nonzero,0], \ np.ones(len(idx_nonzero)).reshape(-1,1), \ weights=tmp[i,idx_nonzero], exposure=unique_values[s][idx_nonzero,1], \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s], penalty=0) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] - new_alphas[i, s] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + tumor_prop=this_tp, start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + new_log_mu[i, s] = res[0] + new_alphas[i, s] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(unique_values[s][idx_nonzero,0], \ + # np.ones(len(idx_nonzero)).reshape(-1,1), \ + # weights=tmp[i,idx_nonzero], exposure=unique_values[s][idx_nonzero,1], \ + # tumor_prop=this_tp, + # start_log_mu=start_log_mu[i:(i+1), s:(s+1)], start_alpha=alphas[i, s]) + # new_log_mu[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_alphas[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -1055,30 +1222,33 
@@ def update_emission_params_nb_nophasing_uniqvalues_mix(unique_values, mapping_ma weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_NegativeBinomial_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp, penalty=0) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp, + start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_alpha=alphas[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] - if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_log_mu[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_alphas[:,:] = res[-1] + # if not (start_log_mu is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp, + # start_log_mu=np.array([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # start_alpha=alphas[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + # new_log_mu[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_alphas[:,:] = res2[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matrices, log_gamma, taus, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ + fun=fit_weighted_BetaBinomial, start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): """ Attributes @@ -1123,17 +1293,21 @@ def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matric for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) if np.sum(tmp[i,idx_nonzero]) >= 0.1: - model = Weighted_BetaBinom(unique_values[s][idx_nonzero,0], \ + res, n_llf = fun(unique_values[s][idx_nonzero,0], \ np.ones(len(idx_nonzero)).reshape(-1,1), \ weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1] ) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] - new_taus[i, s] = res.params[-1] - if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if 
model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + exposure=unique_values[s][idx_nonzero,1], + start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s] ) + new_p_binom[i, s] = res[0] + new_taus[i, s] = res[-1] + # if not (start_p_binom is None): + # res2, n_llf2 = fun(unique_values[s][idx_nonzero,0], \ + # np.ones(len(idx_nonzero)).reshape(-1,1), \ + # weights=tmp[i,idx_nonzero], \ + # exposure=unique_values[s][idx_nonzero,1], + # start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s]) + # new_p_binom[i, s] = res[0] if n_llf < n_llf2 else res2[0] + # new_taus[i, s] = res[-1] if n_llf < n_llf2 else res2[-1] else: exposure = [] y = [] @@ -1161,23 +1335,26 @@ def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matric y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + res, n_llf = fun(y, features, weights=weights, exposure=exposure, + start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + start_tau=taus[0,s]) for s,idx_state_posweight in enumerate(state_posweights): l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_p_binom[idx_state_posweight, s] = res.params[l1:l2] - if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] - if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) - new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] - if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_p_binom[idx_state_posweight, s] = res[l1:l2] + if res[-1] > 0: + new_taus[:,:] = res[-1] + # if not (start_p_binom is None): + # res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, + # start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]), + # start_tau=taus[0,s]) + # if n_llf2 < n_llf: + # for s,idx_state_posweight in enumerate(state_posweights): + # l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) + # l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + # new_p_binom[idx_state_posweight, s] = res2[l1:l2] + # if res2[-1] > 0: + # new_taus[:,:] = res2[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob @@ -1185,7 +1362,7 @@ def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matric def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, taus, tumor_prop, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ + fun=fit_weighted_BetaBinomial_mix, start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ percent_threshold=0.99, min_binom_prob=0.01, 
+            # if not (start_p_binom is None):
+            #     res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure,
+            #         start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]),
+            #         start_tau=taus[0,s])
+            #     if n_llf2 < n_llf:
+            #         for s,idx_state_posweight in enumerate(state_posweights):
+            #             l1 = int( np.sum([len(x) for x in state_posweights[:s]]) )
+            #             l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) )
+            #             new_p_binom[idx_state_posweight, s] = res2[l1:l2]
+            #         if res2[-1] > 0:
+            #             new_taus[:,:] = res2[-1]
 
     new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob
     new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob
@@ -1185,7 +1362,7 @@ def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matric
 
 
 def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, taus, tumor_prop, \
-    start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \
+    fun=fit_weighted_BetaBinomial_mix, start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \
     percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99):
     """
     Attributes
@@ -1236,19 +1413,22 @@ def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_ma
                     if np.sum(tmp[i,idx_nonzero]) >= 0.1:
                         this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero]
                         assert np.all(this_tp < 1 + 1e-4)
-                        model = Weighted_BetaBinom_mix(unique_values[s][idx_nonzero,0], \
+                        res, n_llf = fun(unique_values[s][idx_nonzero,0], \
                             np.ones(len(idx_nonzero)).reshape(-1,1), \
                             weights=tmp[i,idx_nonzero], \
                             exposure=unique_values[s][idx_nonzero,1], \
-                            tumor_prop=this_tp)
-                            # tumor_prop=tumor_prop[s] )
-                        res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4)
-                        new_p_binom[i, s] = res.params[0]
-                        new_taus[i, s] = res.params[-1]
-                        if not (start_p_binom is None):
-                            res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4)
-                            new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0]
-                            new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1]
+                            tumor_prop=this_tp, start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s])
+                        new_p_binom[i, s] = res[0]
+                        new_taus[i, s] = res[-1]
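+                        # NOTE (editorial): this_tp above is the average tumor proportion of the spots
+                        # that map to each unique (allele count, total count) pair, so the mixture fit
+                        # sees a per-observation tumor purity; start_p_binom / start_tau are presumably
+                        # used as initial values for the optimizer.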
+                        # if not (start_p_binom is None):
+                        #     res2, n_llf2 = fun(unique_values[s][idx_nonzero,0], \
+                        #         np.ones(len(idx_nonzero)).reshape(-1,1), \
+                        #         weights=tmp[i,idx_nonzero], \
+                        #         exposure=unique_values[s][idx_nonzero,1], \
+                        #         tumor_prop=this_tp,
+                        #         start_p_binom=start_p_binom[i:(i+1), s:(s+1)], start_tau=taus[i, s])
+                        #     new_p_binom[i, s] = res[0] if n_llf < n_llf2 else res2[0]
+                        #     new_taus[i, s] = res[-1] if n_llf < n_llf2 else res2[-1]
         else:
             exposure = []
             y = []
@@ -1282,23 +1462,26 @@ def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_ma
             weights = np.concatenate(weights)
             features = scipy.linalg.block_diag(*features)
             tp = np.concatenate(tp)
-            model = Weighted_BetaBinom_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp)
-            res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4)
+            res, n_llf = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp,
+                start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]),
+                start_tau=taus[0,s])
             for s,idx_state_posweight in enumerate(state_posweights):
                 l1 = int( np.sum([len(x) for x in state_posweights[:s]]) )
                 l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) )
-                new_p_binom[idx_state_posweight, s] = res.params[l1:l2]
-            if res.params[-1] > 0:
-                new_taus[:,:] = res.params[-1]
-            if not (start_p_binom is None):
-                res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4)
-                if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params):
-                    for s,idx_state_posweight in enumerate(state_posweights):
-                        l1 = int( np.sum([len(x) for x in state_posweights[:s]]) )
-                        l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) )
-                        new_p_binom[idx_state_posweight, s] = res2.params[l1:l2]
-                    if res2.params[-1] > 0:
-                        new_taus[:,:] = res2.params[-1]
+                new_p_binom[idx_state_posweight, s] = res[l1:l2]
+            if res[-1] > 0:
+                new_taus[:,:] = res[-1]
+            # if not (start_p_binom is None):
+            #     res2, n_llf2 = fun(y, features, weights=weights, exposure=exposure, tumor_prop=tp,
+            #         start_p_binom=np.array([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)]),
+            #         start_tau=taus[0,s])
+            #     if n_llf2 < n_llf:
+            #         for s,idx_state_posweight in enumerate(state_posweights):
+            #             l1 = int( np.sum([len(x) for x in state_posweights[:s]]) )
+            #             l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) )
+            #             new_p_binom[idx_state_posweight, s] = res2[l1:l2]
+            #         if res2[-1] > 0:
+            #             new_taus[:,:] = res2[-1]
     new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob
     new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob
     return new_p_binom, new_taus
diff --git a/src/calicost/utils_hmrf.py b/src/calicost/utils_hmrf.py
index bee9f42..4920fd3 100644
--- a/src/calicost/utils_hmrf.py
+++ b/src/calicost/utils_hmrf.py
@@ -399,11 +399,14 @@ def reorder_results_merged(res, n_obs):
 
 def load_hmrf_last_iteration(filename):
     allres = dict( np.load(filename, allow_pickle=True) )
     r = allres["num_iterations"] - 1
-    res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \
-        "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \
-        "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \
-        "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \
-        "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]}
+    res = {}
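+    # Copy every "round{r}_*" array of the final iteration into res, presumably so that any
+    # newly saved per-round quantity is picked up without editing this loader by hand.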
+    for k,v in allres.items():
+        if k.startswith(f"round{r}_"):
+            res[k.replace(f"round{r}_", "")] = v
+    # special case for assignment
+    del res["assignment"]
+    res['new_assignment'] = allres[f"round{r}_assignment"]
+    res['prev_assignment'] = allres[f"round{r-1}_assignment"]
     if "barcodes" in allres.keys():
         res["barcodes"] = allres["barcodes"]
     return res
@@ -411,11 +414,14 @@ def load_hmrf_last_iteration(filename):
 
 def load_hmrf_given_iteration(filename, r):
     allres = dict( np.load(filename, allow_pickle=True) )
-    res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \
-        "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \
-        "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \
-        "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \
-        "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]}
+    res = {}
+    for k,v in allres.items():
+        if k.startswith(f"round{r}_"):
+            res[k.replace(f"round{r}_", "")] = v
+    # special case for assignment
+    del res["assignment"]
+    res['new_assignment'] = allres[f"round{r}_assignment"]
+    res['prev_assignment'] = allres[f"round{r-1}_assignment"]
     if "barcodes" in allres.keys():
         res["barcodes"] = allres["barcodes"]
     return res
diff --git a/src/calicost/vi_main.py b/src/calicost/vi_main.py
new file mode 100644
index 0000000..9529d52
--- /dev/null
+++ b/src/calicost/vi_main.py
@@ -0,0 +1,109 @@
+import sys
+import numpy as np
+import scipy
+import pandas as pd
+from pathlib import Path
+from sklearn.metrics import adjusted_rand_score
+from sklearn.cluster import KMeans
+import scanpy as sc
+import anndata
+import logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
+logger = logging.getLogger()
+import copy
+from pathlib import Path
+import functools
+import subprocess
+import argparse  # needed for the __main__ CLI block below
+from calicost.arg_parse import *
+from calicost.hmm_NB_BB_phaseswitch import *
+from calicost.utils_distribution_fitting import *
+from calicost.utils_hmrf import *
+from calicost.hmrf import *
+from calicost.phasing import *
+from calicost.utils_IO import *
+from calicost.find_integer_copynumber import *
+from calicost.parse_input import *
+from calicost.utils_plotting import *
+from calicost.sample_posterior_clone import *
+from calicost.hmm_NB_BB_nophasing_float import *
+
+
+def main(configuration_file):
+    try:
+        config = read_configuration_file(configuration_file)
+    except:
+        config = read_joint_configuration_file(configuration_file)
+    print("Configurations:")
+    for k in sorted(list(config.keys())):
+        print(f"\t{k} : {config[k]}")
+
+    lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \
+        barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config)
+
+    single_tumor_prop[np.isnan(single_tumor_prop)] = 0.05
+
+    copy_single_X_rdr = copy.copy(single_X[:,0,:])
+    copy_single_base_nb_mean = copy.copy(single_base_nb_mean)
+    single_X[:,0,:] = 0
+    single_base_nb_mean[:,:] = 0
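+    # NOTE (editorial): the RDR channel is stashed in copy_single_X_rdr / copy_single_base_nb_mean
+    # and then zeroed out, presumably so that this VI/Gibbs run is driven by the BAF signal only.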
+    # # remove spots with tumor purity less than config['tumorprop_threshold']
+    # single_X = single_X[:,:,single_tumor_prop > config['tumorprop_threshold']]
+    # single_base_nb_mean = single_base_nb_mean[:,single_tumor_prop > config['tumorprop_threshold']]
+    # single_total_bb_RD = single_total_bb_RD[:,single_tumor_prop > config['tumorprop_threshold']]
+    # barcodes = barcodes[single_tumor_prop > config['tumorprop_threshold']]
+    # coords = coords[single_tumor_prop > config['tumorprop_threshold'],:]
+    # sample_ids = sample_ids[single_tumor_prop > config['tumorprop_threshold']]
+    # adjacency_mat = adjacency_mat[single_tumor_prop > config['tumorprop_threshold'],:][:,single_tumor_prop > config['tumorprop_threshold']]
+    # smooth_mat = smooth_mat[single_tumor_prop > config['tumorprop_threshold'],:][:,single_tumor_prop > config['tumorprop_threshold']]
+    # single_tumor_prop = single_tumor_prop[single_tumor_prop > config['tumorprop_threshold']]
+
+    # Run HMRF
+    for r_hmrf_initialization in range(config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"]):
+        outdir = f"{config['output_dir']}/clone{config['n_clones']}_rectangle{r_hmrf_initialization}_w{config['spatial_weight']:.1f}"
+        if config["tumorprop_file"] is None:
+            initial_clone_index = rectangle_initialize_initial_clone(coords, config["n_clones"], random_state=r_hmrf_initialization)
+        else:
+            initial_clone_index = rectangle_initialize_initial_clone_mix(coords, config["n_clones"], single_tumor_prop, threshold=config["tumorprop_threshold"], random_state=r_hmrf_initialization)
+
+        # create directory
+        p = subprocess.Popen(f"mkdir -p {outdir}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
+        out,err = p.communicate()
+        # save clone initialization into npz file
+        prefix = "vi_gibbs_hightp_allspots"
+        initial_assignment = np.zeros(single_X.shape[2], dtype=int)
+        for c,idx in enumerate(initial_clone_index):
+            initial_assignment[idx] = c
+
+        posterior_clones = np.zeros((single_X.shape[2], config['n_clones']))
+        posterior_clones[ (np.arange(single_X.shape[2]), initial_assignment) ] = 1
+
+        # run HMRF + HMM
+        X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=config["tumorprop_threshold"])
+        init_log_mu, init_p_binom = initialization_by_gmm(config['n_states'], np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \
+            base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), params='sp', random_state=config['gmm_random_state'], in_log_space=False, only_minor=False)
+
+        list_posterior_clones, list_cna_states, list_emission_llf, list_log_mu, list_alphas, list_p_binom, list_taus, list_log_startprob, list_elbo, list_h, list_labels = infer_all_v2(
+            single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, posterior_clones, config['n_states'],
+            coords, adjacency_mat, config['tumorprop_threshold'], config['spatial_weight'], 'visium', max_iter_outer=20, num_chains=20, burnin=50, sampling_tol=1e-10, temperature=2.0,
+            hmmclass=hmm_nophasing_float, hmm_params='sp', hmm_t=config['t'], hmm_random_state=config['gmm_random_state'], hmm_max_iter=config['max_iter'], hmm_tol=config['tol'], hmm_num_draws=200, fun=block_gibbs_sampling_labels,
+            smooth_mat=smooth_mat, init_p_binom=init_p_binom)
+
+        # save results
+        np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz",
+            list_posterior_clones=list_posterior_clones,
+            list_cna_states=list_cna_states,
+            list_emission_llf=list_emission_llf,
+            list_log_mu=list_log_mu,
+            list_p_binom=list_p_binom,
+            list_elbo=list_elbo,
+            list_h=list_h,
+            list_labels=list_labels)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str)
+    args = parser.parse_args()
+
+    main(args.configfile)
\ No newline at end of file