From 241208c03054b1db1b1af94fed155cb2c690a89a Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 8 Aug 2024 11:36:33 -0400 Subject: [PATCH 001/125] main script with more comments --- src/calicost/calicost_main.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 8990bc8..0b0add0 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -35,9 +35,18 @@ def main(configuration_file): for k in sorted(list(config.keys())): print(f"\t{k} : {config[k]}") + # Assuming the B counts are calculated by the cellsnp-lite and Eagle pipeline + # If assuming each spot contains a mixture of normal/tumor cells, the tumor proportion should be provided in the config file. + # load data + ## If the data is loaded for the first time: infer phasing using phase-switch HMM (hmm_NB_BB_phaseswitch.py and phasing.py) -> output initial_phase.npz, matrices in parsed_inputs folder + ## If the data is already loaded: load the matrices from parsed_inputs folder lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config) + """ + Initial clustering spots using only BAF values. + """ + # setting transcript count to 0, and baseline so that emission probability calculation will ignore them. copy_single_X_rdr = copy.copy(single_X[:,0,:]) copy_single_base_nb_mean = copy.copy(single_base_nb_mean) single_X[:,0,:] = 0 @@ -64,6 +73,8 @@ def main(configuration_file): np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz", **allres) # run HMRF + HMM + # store the results of each iteration of HMRF in a npz file outdir/prefix_nstates{config['n_states']}_sp.npz + # if a specific iteration is computed, hmrf will directly load the results from the file if config["tumorprop_file"] is None: hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states=config["n_states"], \ log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], \ @@ -88,6 +99,8 @@ def main(configuration_file): else: X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"]))], single_tumor_prop, threshold=config["tumorprop_threshold"]) tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) + # merge "similar" clones from the initial number of clones. 
+ # "similar" defined by Neyman Pearson statistics/ Likelihood ratios P(clone A counts | BAF parameters for clone A) / P(clone A counts | BAF parameters for clone B) merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, threshold=config["np_threshold"], minlength=config["np_eventminlen"], params="sp", tumor_prop=tumor_prop, hmmclass=hmm_nophasing_v2) print(f"BAF clone merging after comparing similarity: {merging_groups}") # @@ -99,7 +112,7 @@ def main(configuration_file): n_baf_clones = len(merging_groups) np.savez(f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", **merged_res) - # adjust phasing + # load merged results n_obs = single_X.shape[0] merged_res = dict(np.load(f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", allow_pickle=True)) merged_baf_assignment = copy.copy(merged_res["new_assignment"]) @@ -109,8 +122,12 @@ def main(configuration_file): merged_baf_profiles = np.array([ np.where(pred[c,:] < config["n_states"], merged_res["new_p_binom"][pred[c,:]%config["n_states"], 0], 1-merged_res["new_p_binom"][pred[c,:]%config["n_states"], 0]) \ for c in range(n_baf_clones) ]) + """ + Refined clustering using BAF and RDR values. + """ # adding RDR information if not config["bafonly"]: + # Only used when assuming each spot is pure normal or tumor and if we don't know which spots are normal spots. # select normal spots if (config["normalidx_file"] is None) and (config["tumorprop_file"] is None): EPS_BAF = 0.05 @@ -129,19 +146,24 @@ def main(configuration_file): # single_base_nb_mean has already been added in loading data step. if not config["tumorprop_file"] is None: logger.warning(f"Mixed sources of information for normal spots! Using {config['normalidx_file']}") + + # If tumor purity is provided, we can use it to select normal spots. else: for prop_threshold in np.arange(0.05, 0.6, 0.05): normal_candidate = (single_tumor_prop < prop_threshold) if np.sum(copy_single_X_rdr[:, (normal_candidate==True)]) > single_X.shape[0] * 200: break - # filter bins based on normal + # To avoid allele-specific expression that are not relevant to CNA, filter bins where normal pseudobulk has large |BAF - 0.5| index_normal = np.where(normal_candidate)[0] lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \ single_X, single_base_nb_mean, single_total_bb_RD, config['nu'], config['logphase_shift'], index_normal, config['geneticmap_file']) assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0] df_bininfo = genesnp_to_bininfo(df_gene_snp) copy_single_X_rdr = copy.copy(single_X[:,0,:]) - # filter out high-UMI DE genes, which may bias RDR estimates + + # If a gene has way higher expression than adjacent genes, its transcript count will dominate RDR values + # To avoid the domination, filter out high-UMI DE genes, which may bias RDR estimates + # Assume the remaining genes will still carry the CNA info. 
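+    # The filtered counts are then used to build the per-bin RDR baseline from the normal pseudobulk:
+    # bins whose normal pseudobulk count falls below MIN_NORMAL_COUNT_PERBIN are zeroed in both the counts and the baseline,
+    # so that the emission probability calculation ignores them (see the lines below), as in the BAF-only step above.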
copy_single_X_rdr, _ = filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=sample_list, sample_ids=sample_ids) MIN_NORMAL_COUNT_PERBIN = 20 bidx_inconfident = np.where( np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) < MIN_NORMAL_COUNT_PERBIN )[0] @@ -166,6 +188,7 @@ def main(configuration_file): if np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20: # put a minimum B allele read count on pseudobulk to split clones continue # initialize clone + # write the initialization in a npz file outdir/prefix_nstates{config['n_states']}_smp.npz if config["tumorprop_file"] is None: initial_clone_index = rectangle_initialize_initial_clone(coords[idx_spots], config['n_clones_rdr'], random_state=r_hmrf_initialization) else: From 96e873fa36a54416d96d730789ad9a93db93bd5f Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 8 Aug 2024 13:41:13 -0400 Subject: [PATCH 002/125] add comment to hmrf main --- src/calicost/calicost_main.py | 1 + src/calicost/hmrf.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 0b0add0..8b144df 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -298,6 +298,7 @@ def main(configuration_file): log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) # final re-assignment across all clones using estimated RDR + BAF + # The following step may not be needed because of other improvements. And it may cause mistakes in some cases. if config["tumorprop_file"] is None: if config["nodepotential"] == "max": pred = np.vstack([ np.argmax(res_combine["log_gamma"][:,:,c], axis=0) for c in range(res_combine["log_gamma"].shape[2]) ]).T diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 185ca3a..e8e862f 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -27,11 +27,20 @@ ############################################################ def hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, res, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): + """ + Choosing clones by Iterated Conditional Modes (Forward-backward version): + for which the emission probability is given by the posterior probability of all HMM states at each bin. + Input format assumption: the RDR/BAF vectors are not shared across clones <- after clone refinement with RDR+BAF signals. + + HMRF likelihood: node potential where each node is a spot. And edge potential. + Node potential: likelihood of the data given HMM states of each clone. + Edge potential: Potts model. 
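+
+    Roughly, the ICM update for each spot i is (a sketch in terms of the variables below,
+    where single_llf holds the per-spot, per-clone node potentials):
+        w_node = single_llf[i, :] + log_persample_weights[:, sample_ids[i]]
+        w_edge = np.zeros(n_clones)
+        for j in adjacency_mat[i, :].nonzero()[1]:
+            w_edge[new_assignment[j]] += adjacency_mat[i, j]
+        new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge)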
+ """ N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] n_states = res["new_p_binom"].shape[0] - single_llf = np.zeros((N, n_clones)) + single_llf = np.zeros((N, n_clones)) # node potential new_assignment = copy.copy(prev_assignment) # posterior = np.zeros((N, n_clones)) @@ -72,6 +81,12 @@ def hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_R def aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, res, pred, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): + """ + Choosing clones by Iterated Conditional Modes (Viterbi version): + for which the emission probability of each spot is a single of HMM state sequence. + Input format assumption: the RDR/BAF vectors are not shared across clones <- after clone refinement with RDR+BAF signals. + + """ N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] @@ -116,6 +131,9 @@ def aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, re def hmrf_reassignment_posterior_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, res, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): + """ + Input format assumption: the RDR/BAF vector is shared across all clones <- using only BAF signals, or running for each initial clone + """ N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = np.max(prev_assignment) + 1 From 0cdbcc2e1ccc7d85947a48985d5d9ce8555519e0 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 8 Aug 2024 13:52:14 -0400 Subject: [PATCH 003/125] add versions in setup --- setup.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index 6941a67..c447610 100644 --- a/setup.py +++ b/setup.py @@ -12,20 +12,20 @@ long_description='CalicoST infers allele-specific copy number aberrations and cancer clones in spatially resolved transcriptomics data', url='https://github.com/raphael-group/CalicoST', install_requires=[ - 'numpy', - 'scipy', - 'pandas', - 'scikit-learn', - 'scanpy', - 'anndata', - 'numba', - 'tqdm', - 'statsmodels', - 'networkx', - 'matplotlib', - 'seaborn', - 'pysam', - 'ete3', + 'numpy=1.24.4', + 'scipy=1.11.3', + 'pandas=2.1.1', + 'scikit-learn=1.3.2', + 'scanpy=1.9.6', + 'anndata=0.10.3', + 'numba=0.60.0', + 'tqdm=4.66.1', + 'statsmodels=0.14.0', + 'networkx=3.2.1', + 'matplotlib=3.7.3', + 'seaborn=0.12.2', + 'pysam=0.22.1', + 'ete3=3.1.3', 'ipykernel' ], include_package_data=True From 0b66b45954c77b1b5149e6dbff4e5d822584d908 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Sun, 11 Aug 2024 08:10:22 -0400 Subject: [PATCH 004/125] black formatted codes --- src/calicost/__init__.py | 2 +- src/calicost/allele_starch_generateconfig.py | 220 +- src/calicost/arg_parse.py | 356 +- src/calicost/calicost_main.py | 1181 +++++-- src/calicost/calicost_supervised.py | 1243 +++++-- src/calicost/estimate_tumor_proportion.py | 252 +- src/calicost/find_integer_copynumber.py | 290 +- src/calicost/hmm_NB_BB_nophasing.py | 317 +- src/calicost/hmm_NB_BB_nophasing_v2.py | 396 ++- src/calicost/hmm_NB_BB_phaseswitch.py | 1145 +++++-- src/calicost/hmm_NB_sharedstates.py | 164 +- src/calicost/hmm_gaussian.py | 506 ++- src/calicost/hmrf.py | 1876 +++++++++-- src/calicost/hmrf_normalmixture.py | 1 - src/calicost/joint_allele_generateconfig.py | 229 +- src/calicost/oldcode.py | 938 
++++-- src/calicost/parse_input.py | 410 ++- src/calicost/phasing.py | 185 +- src/calicost/phylogeny_startle.py | 170 +- src/calicost/phylogeography.py | 106 +- src/calicost/simple_sctransform.py | 142 +- src/calicost/utils_IO.py | 1529 +++++++-- src/calicost/utils_distribution_fitting.py | 116 +- src/calicost/utils_hmm.py | 1689 +++++++--- src/calicost/utils_hmrf.py | 609 +++- src/calicost/utils_phase_switch.py | 282 +- src/calicost/utils_plotting.py | 3146 ++++++++++++++---- 27 files changed, 13506 insertions(+), 3994 deletions(-) diff --git a/src/calicost/__init__.py b/src/calicost/__init__.py index 4957a9c..992770f 100644 --- a/src/calicost/__init__.py +++ b/src/calicost/__init__.py @@ -1 +1 @@ -__version__ = 'v1.0.0' +__version__ = "v1.0.0" diff --git a/src/calicost/allele_starch_generateconfig.py b/src/calicost/allele_starch_generateconfig.py index 3444c14..6320216 100644 --- a/src/calicost/allele_starch_generateconfig.py +++ b/src/calicost/allele_starch_generateconfig.py @@ -19,117 +19,119 @@ def read_configuration_file(filename): ##### [Default settings] ##### config = { - "spaceranger_dir" : None, - "snp_dir" : None, - "output_dir" : None, + "spaceranger_dir": None, + "snp_dir": None, + "output_dir": None, # supporting files and preprocessing arguments - "hgtable_file" : None, - "normalidx_file" : None, - "tumorprop_file" : None, - "supervision_clone_file" : None, - "filtergenelist_file" : None, - "filterregion_file" : None, - "binsize" : 1, - "rdrbinsize" : 1, + "hgtable_file": None, + "normalidx_file": None, + "tumorprop_file": None, + "supervision_clone_file": None, + "filtergenelist_file": None, + "filterregion_file": None, + "binsize": 1, + "rdrbinsize": 1, # "secondbinning_min_umi" : 500, - "max_nbins" : 1200, - "avg_umi_perbinspot" : 1.5, - "bafonly" : True, + "max_nbins": 1200, + "avg_umi_perbinspot": 1.5, + "bafonly": True, # phase switch probability - "nu" : 1, - "logphase_shift" : 1, - "npart_phasing" : 2, + "nu": 1, + "logphase_shift": 1, + "npart_phasing": 2, # HMRF configurations - "n_clones" : None, - "n_clones_rdr" : 2, - "min_spots_per_clone" : 100, - "min_avgumi_per_clone" : 10, - "maxspots_pooling" : 7, - "tumorprop_threshold" : 0.5, - "max_iter_outer" : 20, - "nodepotential" : "max", # max or weighted_sum - "initialization_method" : "rectangle", # rectangle or datadrive - "num_hmrf_initialization_start" : 0, - "num_hmrf_initialization_end" : 10, - "spatial_weight" : 2.0, - "construct_adjacency_method" : "hexagon", - "construct_adjacency_w" : 1.0, + "n_clones": None, + "n_clones_rdr": 2, + "min_spots_per_clone": 100, + "min_avgumi_per_clone": 10, + "maxspots_pooling": 7, + "tumorprop_threshold": 0.5, + "max_iter_outer": 20, + "nodepotential": "max", # max or weighted_sum + "initialization_method": "rectangle", # rectangle or datadrive + "num_hmrf_initialization_start": 0, + "num_hmrf_initialization_end": 10, + "spatial_weight": 2.0, + "construct_adjacency_method": "hexagon", + "construct_adjacency_w": 1.0, # HMM configurations - "n_states" : None, - "params" : None, - "t" : None, - "t_phaseing" : 1-1e-4, - "fix_NB_dispersion" : False, - "shared_NB_dispersion" : True, - "fix_BB_dispersion" : False, - "shared_BB_dispersion" : True, - "max_iter" : 30, - "tol" : 1e-3, - "gmm_random_state" : 0, - "np_threshold" : 2.0, - "np_eventminlen" : 10 + "n_states": None, + "params": None, + "t": None, + "t_phaseing": 1 - 1e-4, + "fix_NB_dispersion": False, + "shared_NB_dispersion": True, + "fix_BB_dispersion": False, + "shared_BB_dispersion": True, + "max_iter": 30, + "tol": 
1e-3, + "gmm_random_state": 0, + "np_threshold": 2.0, + "np_eventminlen": 10, } argument_type = { - "spaceranger_dir" : "str", - "snp_dir" : "str", - "output_dir" : "str", + "spaceranger_dir": "str", + "snp_dir": "str", + "output_dir": "str", # supporting files and preprocessing arguments - "hgtable_file" : "str", - "normalidx_file" : "str", - "tumorprop_file" : "str", - "supervision_clone_file" : "str", - "filtergenelist_file" : "str", - "filterregion_file" : "str", - "binsize" : "int", - "rdrbinsize" : "int", + "hgtable_file": "str", + "normalidx_file": "str", + "tumorprop_file": "str", + "supervision_clone_file": "str", + "filtergenelist_file": "str", + "filterregion_file": "str", + "binsize": "int", + "rdrbinsize": "int", # "secondbinning_min_umi" : "int", - "max_nbins" : "int", - "avg_umi_perbinspot" : "float", - "bafonly" : "bool", + "max_nbins": "int", + "avg_umi_perbinspot": "float", + "bafonly": "bool", # phase switch probability - "nu" : "float", - "logphase_shift" : "float", - "npart_phasing" : "int", + "nu": "float", + "logphase_shift": "float", + "npart_phasing": "int", # HMRF configurations - "n_clones" : "int", - "n_clones_rdr" : "int", - "min_spots_per_clone" : "int", - "min_avgumi_per_clone" : "int", - "maxspots_pooling" : "int", - "tumorprop_threshold" : "float", - "max_iter_outer" : "int", - "nodepotential" : "str", - "initialization_method" : "str", - "num_hmrf_initialization_start" : "int", - "num_hmrf_initialization_end" : "int", - "spatial_weight" : "float", - "construct_adjacency_method" : "str", - "construct_adjacency_w" : "float", + "n_clones": "int", + "n_clones_rdr": "int", + "min_spots_per_clone": "int", + "min_avgumi_per_clone": "int", + "maxspots_pooling": "int", + "tumorprop_threshold": "float", + "max_iter_outer": "int", + "nodepotential": "str", + "initialization_method": "str", + "num_hmrf_initialization_start": "int", + "num_hmrf_initialization_end": "int", + "spatial_weight": "float", + "construct_adjacency_method": "str", + "construct_adjacency_w": "float", # HMM configurations - "n_states" : "int", - "params" : "str", - "t" : "eval", - "t_phaseing" : "eval", - "fix_NB_dispersion" : "bool", - "shared_NB_dispersion" : "bool", - "fix_BB_dispersion" : "bool", - "shared_BB_dispersion" : "bool", - "max_iter" : "int", - "tol" : "float", - "gmm_random_state" : "int", - "np_threshold" : "float", - "np_eventminlen" : "int" + "n_states": "int", + "params": "str", + "t": "eval", + "t_phaseing": "eval", + "fix_NB_dispersion": "bool", + "shared_NB_dispersion": "bool", + "fix_BB_dispersion": "bool", + "shared_BB_dispersion": "bool", + "max_iter": "int", + "tol": "float", + "gmm_random_state": "int", + "np_threshold": "float", + "np_eventminlen": "int", } ##### [ read configuration file to update settings ] ##### - with open(filename, 'r') as fp: + with open(filename, "r") as fp: for line in fp: if line.strip() == "" or line[0] == "#": continue # strs = [x.replace(" ", "") for x in line.strip().split(":") if x != ""] strs = [x.strip() for x in line.strip().split(":") if x != ""] - assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" + assert ( + strs[0] in config.keys() + ), f"{strs[0]} is not a valid configuration parameter! 
Configuration parameters are: {list(config.keys())}" if strs[1].upper() == "NONE": config[strs[0]] = None elif argument_type[strs[0]] == "str": @@ -141,7 +143,7 @@ def read_configuration_file(filename): elif argument_type[strs[0]] == "eval": config[strs[0]] = eval(strs[1]) elif argument_type[strs[0]] == "bool": - config[strs[0]] = (strs[1].upper() == "TRUE") + config[strs[0]] = strs[1].upper() == "TRUE" elif argument_type[strs[0]] == "list_str": config[strs[0]] = strs[1].split(" ") # assertions @@ -153,10 +155,9 @@ def read_configuration_file(filename): def write_config_file(outputfilename, config): - list_argument_io = ["spaceranger_dir", - "snp_dir", - "output_dir"] - list_argument_sup = ["hgtable_file", + list_argument_io = ["spaceranger_dir", "snp_dir", "output_dir"] + list_argument_sup = [ + "hgtable_file", "normalidx_file", "tumorprop_file", "supervision_clone_file", @@ -167,11 +168,11 @@ def write_config_file(outputfilename, config): # "secondbinning_min_umi", "max_nbins", "avg_umi_perbinspot", - "bafonly"] - list_argument_phase = ["nu", - "logphase_shift", - "npart_phasing"] - list_argument_hmrf = ["n_clones", + "bafonly", + ] + list_argument_phase = ["nu", "logphase_shift", "npart_phasing"] + list_argument_hmrf = [ + "n_clones", "n_clones_rdr", "min_spots_per_clone", "min_avgumi_per_clone", @@ -180,12 +181,14 @@ def write_config_file(outputfilename, config): "max_iter_outer", "nodepotential", "initialization_method", - "num_hmrf_initialization_start", + "num_hmrf_initialization_start", "num_hmrf_initialization_end", "spatial_weight", "construct_adjacency_method", - "construct_adjacency_w"] - list_argument_hmm = ["n_states", + "construct_adjacency_w", + ] + list_argument_hmm = [ + "n_states", "params", "t", "t_phaseing", @@ -197,8 +200,9 @@ def write_config_file(outputfilename, config): "tol", "gmm_random_state", "np_threshold", - "np_eventminlen"] - with open(outputfilename, 'w') as fp: + "np_eventminlen", + ] + with open(outputfilename, "w") as fp: # for k in list_argument_io: fp.write(f"{k} : {config[k]}\n") @@ -232,10 +236,10 @@ def main(argv): config = read_configuration_file(template_configuration_file) for r in range(hmrf_seed_s, hmrf_seed_t): config["num_hmrf_initialization_start"] = r - config["num_hmrf_initialization_end"] = r+1 + config["num_hmrf_initialization_end"] = r + 1 write_config_file(f"{outputdir}/configfile{r}", config) - + if __name__ == "__main__": if len(sys.argv) > 1: - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/src/calicost/arg_parse.py b/src/calicost/arg_parse.py index 32f5570..8bf796a 100644 --- a/src/calicost/arg_parse.py +++ b/src/calicost/arg_parse.py @@ -3,149 +3,213 @@ import scipy import pandas as pd import logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) logger = logging.getLogger() def load_default_config(): - config_joint = { - "input_filelist" : None, - "alignment_files" : [] - } - config_single = { - "spaceranger_dir" : None - } + config_joint = {"input_filelist": None, "alignment_files": []} + config_single = {"spaceranger_dir": None} config_shared = { - "snp_dir" : None, - "output_dir" : None, + "snp_dir": None, + "output_dir": None, # supporting files and preprocessing arguments - "geneticmap_file" : None, - "hgtable_file" : None, - "normalidx_file" : None, - "tumorprop_file" : None, - 
"supervision_clone_file" : None, - "filtergenelist_file" : None, - "filterregion_file" : None, - "secondary_min_umi" : 300, - "min_snpumi_perspot" : 50, - 'min_percent_expressed_spots' : 0.005, - "bafonly" : False, + "geneticmap_file": None, + "hgtable_file": None, + "normalidx_file": None, + "tumorprop_file": None, + "supervision_clone_file": None, + "filtergenelist_file": None, + "filterregion_file": None, + "secondary_min_umi": 300, + "min_snpumi_perspot": 50, + "min_percent_expressed_spots": 0.005, + "bafonly": False, # phase switch probability - "nu" : 1.0, - "logphase_shift" : -2.0, - "npart_phasing" : 3, + "nu": 1.0, + "logphase_shift": -2.0, + "npart_phasing": 3, # HMRF configurations - "n_clones" : None, - "n_clones_rdr" : 2, - "min_spots_per_clone" : 100, - "min_avgumi_per_clone" : 10, - "maxspots_pooling" : 7, - "tumorprop_threshold" : 0.5, - "max_iter_outer" : 20, - "nodepotential" : "weighted_sum", # max or weighted_sum - "initialization_method" : "rectangle", # rectangle or datadrive - "num_hmrf_initialization_start" : 0, - "num_hmrf_initialization_end" : 10, - "spatial_weight" : 1.0, - "construct_adjacency_method" : "hexagon", - "construct_adjacency_w" : 1.0, + "n_clones": None, + "n_clones_rdr": 2, + "min_spots_per_clone": 100, + "min_avgumi_per_clone": 10, + "maxspots_pooling": 7, + "tumorprop_threshold": 0.5, + "max_iter_outer": 20, + "nodepotential": "weighted_sum", # max or weighted_sum + "initialization_method": "rectangle", # rectangle or datadrive + "num_hmrf_initialization_start": 0, + "num_hmrf_initialization_end": 10, + "spatial_weight": 1.0, + "construct_adjacency_method": "hexagon", + "construct_adjacency_w": 1.0, # HMM configurations - "n_states" : None, - "params" : "smp", - "t" : 1-1e-5, - "t_phaseing" : 1-1e-4, - "fix_NB_dispersion" : False, - "shared_NB_dispersion" : True, - "fix_BB_dispersion" : False, - "shared_BB_dispersion" : True, - "max_iter" : 30, - "tol" : 1e-4, - "gmm_random_state" : 0, - "np_threshold" : 1.0, - "np_eventminlen" : 10, + "n_states": None, + "params": "smp", + "t": 1 - 1e-5, + "t_phaseing": 1 - 1e-4, + "fix_NB_dispersion": False, + "shared_NB_dispersion": True, + "fix_BB_dispersion": False, + "shared_BB_dispersion": True, + "max_iter": 30, + "tol": 1e-4, + "gmm_random_state": 0, + "np_threshold": 1.0, + "np_eventminlen": 10, # integer copy number - "nonbalance_bafdist" : 1.0, - "nondiploid_rdrdist" : 10.0 + "nonbalance_bafdist": 1.0, + "nondiploid_rdrdist": 10.0, } - argtype_joint = { - "input_filelist" : "str", - "alignment_files" : "list_str" - } - argtype_single = { - "spaceranger_dir" : "str" - } + argtype_joint = {"input_filelist": "str", "alignment_files": "list_str"} + argtype_single = {"spaceranger_dir": "str"} argtype_shared = { - "snp_dir" : "str", - "output_dir" : "str", + "snp_dir": "str", + "output_dir": "str", # supporting files and preprocessing arguments - "geneticmap_file" : "str", - "hgtable_file" : "str", - "normalidx_file" : "str", - "tumorprop_file" : "str", - "supervision_clone_file" : "str", - "filtergenelist_file" : "str", - "filterregion_file" : "str", - "secondary_min_umi" : "int", - "min_snpumi_perspot" : "int", - 'min_percent_expressed_spots' : "float", - "bafonly" : "bool", + "geneticmap_file": "str", + "hgtable_file": "str", + "normalidx_file": "str", + "tumorprop_file": "str", + "supervision_clone_file": "str", + "filtergenelist_file": "str", + "filterregion_file": "str", + "secondary_min_umi": "int", + "min_snpumi_perspot": "int", + "min_percent_expressed_spots": "float", + "bafonly": "bool", # phase 
switch probability - "nu" : "float", - "logphase_shift" : "float", - "npart_phasing" : "int", + "nu": "float", + "logphase_shift": "float", + "npart_phasing": "int", # HMRF configurations - "n_clones" : "int", - "n_clones_rdr" : "int", - "min_spots_per_clone" : "int", - "min_avgumi_per_clone" : "int", - "maxspots_pooling" : "int", - "tumorprop_threshold" : "float", - "max_iter_outer" : "int", - "nodepotential" : "str", - "initialization_method" : "str", - "num_hmrf_initialization_start" : "int", - "num_hmrf_initialization_end" : "int", - "spatial_weight" : "float", - "construct_adjacency_method" : "str", - "construct_adjacency_w" : "float", + "n_clones": "int", + "n_clones_rdr": "int", + "min_spots_per_clone": "int", + "min_avgumi_per_clone": "int", + "maxspots_pooling": "int", + "tumorprop_threshold": "float", + "max_iter_outer": "int", + "nodepotential": "str", + "initialization_method": "str", + "num_hmrf_initialization_start": "int", + "num_hmrf_initialization_end": "int", + "spatial_weight": "float", + "construct_adjacency_method": "str", + "construct_adjacency_w": "float", # HMM configurations - "n_states" : "int", - "params" : "str", - "t" : "eval", - "t_phaseing" : "eval", - "fix_NB_dispersion" : "bool", - "shared_NB_dispersion" : "bool", - "fix_BB_dispersion" : "bool", - "shared_BB_dispersion" : "bool", - "max_iter" : "int", - "tol" : "float", - "gmm_random_state" : "int", - "np_threshold" : "float", - "np_eventminlen" : "int", + "n_states": "int", + "params": "str", + "t": "eval", + "t_phaseing": "eval", + "fix_NB_dispersion": "bool", + "shared_NB_dispersion": "bool", + "fix_BB_dispersion": "bool", + "shared_BB_dispersion": "bool", + "max_iter": "int", + "tol": "float", + "gmm_random_state": "int", + "np_threshold": "float", + "np_eventminlen": "int", # integer copy number - "nonbalance_bafdist" : "float", - "nondiploid_rdrdist" : "float" + "nonbalance_bafdist": "float", + "nondiploid_rdrdist": "float", } - category_names = ["", "# supporting files and preprocessing arguments", "# phase switch probability", "# HMRF configurations", "# HMM configurations", "# integer copy number"] - category_elements = [["input_filelist", "spaceranger_dir", "snp_dir", "output_dir"], \ - ["geneticmap_file", "hgtable_file", "normalidx_file", "tumorprop_file", "alignment_files", "supervision_clone_file", "filtergenelist_file", "filterregion_file", "secondary_min_umi", "min_snpumi_perspot", "min_percent_expressed_spots", "bafonly"], \ - ["nu", "logphase_shift", "npart_phasing"], \ - ["n_clones", "n_clones_rdr", "min_spots_per_clone", "min_avgumi_per_clone", "maxspots_pooling", "tumorprop_threshold", "max_iter_outer", "nodepotential", "initialization_method", "num_hmrf_initialization_start", "num_hmrf_initialization_end", "spatial_weight", "construct_adjacency_method", "construct_adjacency_w"], \ - ["n_states", "params", "t", "t_phaseing", "fix_NB_dispersion", "shared_NB_dispersion", "fix_BB_dispersion", "shared_BB_dispersion", "max_iter", "tol", "gmm_random_state", "np_threshold", "np_eventminlen"], \ - ["nonbalance_bafdist", "nondiploid_rdrdist"]] - return config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, category_names, category_elements + category_names = [ + "", + "# supporting files and preprocessing arguments", + "# phase switch probability", + "# HMRF configurations", + "# HMM configurations", + "# integer copy number", + ] + category_elements = [ + ["input_filelist", "spaceranger_dir", "snp_dir", "output_dir"], + [ + "geneticmap_file", + "hgtable_file", 
+ "normalidx_file", + "tumorprop_file", + "alignment_files", + "supervision_clone_file", + "filtergenelist_file", + "filterregion_file", + "secondary_min_umi", + "min_snpumi_perspot", + "min_percent_expressed_spots", + "bafonly", + ], + ["nu", "logphase_shift", "npart_phasing"], + [ + "n_clones", + "n_clones_rdr", + "min_spots_per_clone", + "min_avgumi_per_clone", + "maxspots_pooling", + "tumorprop_threshold", + "max_iter_outer", + "nodepotential", + "initialization_method", + "num_hmrf_initialization_start", + "num_hmrf_initialization_end", + "spatial_weight", + "construct_adjacency_method", + "construct_adjacency_w", + ], + [ + "n_states", + "params", + "t", + "t_phaseing", + "fix_NB_dispersion", + "shared_NB_dispersion", + "fix_BB_dispersion", + "shared_BB_dispersion", + "max_iter", + "tol", + "gmm_random_state", + "np_threshold", + "np_eventminlen", + ], + ["nonbalance_bafdist", "nondiploid_rdrdist"], + ] + return ( + config_shared, + config_joint, + config_single, + argtype_shared, + argtype_joint, + argtype_single, + category_names, + category_elements, + ) def read_configuration_file(filename): ##### [Default settings] ##### - config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() + ( + config_shared, + config_joint, + config_single, + argtype_shared, + argtype_joint, + argtype_single, + _, + _, + ) = load_default_config() config = {**config_shared, **config_single} argument_type = {**argtype_shared, **argtype_single} ##### [ read configuration file to update settings ] ##### - with open(filename, 'r') as fp: + with open(filename, "r") as fp: for line in fp: if line.strip() == "" or line[0] == "#": continue @@ -153,7 +217,9 @@ def read_configuration_file(filename): # assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" if (not strs[0] in config.keys()) and (not strs[0] in config_joint.keys()): # warning that the argument is not a valid configuration parameter and continue - logger.warning(f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}") + logger.warning( + f"{strs[0]} is not a valid configuration parameter! 
Configuration parameters are: {list(config.keys())}" + ) continue if len(strs) == 1: config[strs[0]] = [] @@ -168,7 +234,7 @@ def read_configuration_file(filename): elif argument_type[strs[0]] == "eval": config[strs[0]] = eval(strs[1]) elif argument_type[strs[0]] == "bool": - config[strs[0]] = (strs[1].upper() == "TRUE") + config[strs[0]] = strs[1].upper() == "TRUE" elif argument_type[strs[0]] == "list_str": config[strs[0]] = strs[1].split(" ") # assertions @@ -181,12 +247,21 @@ def read_configuration_file(filename): def read_joint_configuration_file(filename): ##### [Default settings] ##### - config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() + ( + config_shared, + config_joint, + config_single, + argtype_shared, + argtype_joint, + argtype_single, + _, + _, + ) = load_default_config() config = {**config_shared, **config_joint} argument_type = {**argtype_shared, **argtype_joint} ##### [ read configuration file to update settings ] ##### - with open(filename, 'r') as fp: + with open(filename, "r") as fp: for line in fp: if line.strip() == "" or line[0] == "#": continue @@ -194,7 +269,9 @@ def read_joint_configuration_file(filename): # assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" if (not strs[0] in config.keys()) and (not strs[0] in config_single.keys()): # warning that the argument is not a valid configuration parameter and continue - logger.warning(f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}") + logger.warning( + f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" + ) continue if len(strs) == 1: config[strs[0]] = [] @@ -209,7 +286,7 @@ def read_joint_configuration_file(filename): elif argument_type[strs[0]] == "eval": config[strs[0]] = eval(strs[1]) elif argument_type[strs[0]] == "bool": - config[strs[0]] = (strs[1].upper() == "TRUE") + config[strs[0]] = strs[1].upper() == "TRUE" elif argument_type[strs[0]] == "list_str": config[strs[0]] = strs[1].split(" ") # assertions @@ -221,9 +298,18 @@ def read_joint_configuration_file(filename): def write_config_file(outputfilename, config): - _,_,_, argtype_shared, argtype_joint, argtype_single, category_names, category_elements = load_default_config() + ( + _, + _, + _, + argtype_shared, + argtype_joint, + argtype_single, + category_names, + category_elements, + ) = load_default_config() argument_type = {**argtype_shared, **argtype_joint, **argtype_single} - with open(outputfilename, 'w') as fp: + with open(outputfilename, "w") as fp: for i in range(len(category_names)): fp.write(f"{category_names[i]}\n") for k in category_elements[i]: @@ -236,13 +322,31 @@ def write_config_file(outputfilename, config): def get_default_config_single(): - config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() + ( + config_shared, + config_joint, + config_single, + argtype_shared, + argtype_joint, + argtype_single, + _, + _, + ) = load_default_config() config = {**config_shared, **config_single} return config def get_default_config_joint(): - config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() + ( + config_shared, + config_joint, + config_single, + argtype_shared, + argtype_joint, + argtype_single, + _, + _, + ) = load_default_config() config = {**config_shared, 
**config_joint} return config @@ -259,7 +363,7 @@ def main(argv): for r in range(hmrf_seed_s, hmrf_seed_t): config["num_hmrf_initialization_start"] = r - config["num_hmrf_initialization_end"] = r+1 + config["num_hmrf_initialization_end"] = r + 1 write_config_file(f"{outputdir}/configfile{r}", config) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 8b144df..d64c102 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -8,7 +8,12 @@ import scanpy as sc import anndata import logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) logger = logging.getLogger() import copy from pathlib import Path @@ -40,88 +45,225 @@ def main(configuration_file): # load data ## If the data is loaded for the first time: infer phasing using phase-switch HMM (hmm_NB_BB_phaseswitch.py and phasing.py) -> output initial_phase.npz, matrices in parsed_inputs folder ## If the data is already loaded: load the matrices from parsed_inputs folder - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ - barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config) - + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + df_bininfo, + df_gene_snp, + barcodes, + coords, + single_tumor_prop, + sample_list, + sample_ids, + adjacency_mat, + smooth_mat, + exp_counts, + ) = run_parse_n_load(config) + """ Initial clustering spots using only BAF values. """ # setting transcript count to 0, and baseline so that emission probability calculation will ignore them. 
- copy_single_X_rdr = copy.copy(single_X[:,0,:]) + copy_single_X_rdr = copy.copy(single_X[:, 0, :]) copy_single_base_nb_mean = copy.copy(single_base_nb_mean) - single_X[:,0,:] = 0 - single_base_nb_mean[:,:] = 0 - + single_X[:, 0, :] = 0 + single_base_nb_mean[:, :] = 0 + # run HMRF - for r_hmrf_initialization in range(config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"]): + for r_hmrf_initialization in range( + config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"] + ): outdir = f"{config['output_dir']}/clone{config['n_clones']}_rectangle{r_hmrf_initialization}_w{config['spatial_weight']:.1f}" if config["tumorprop_file"] is None: - initial_clone_index = rectangle_initialize_initial_clone(coords, config["n_clones"], random_state=r_hmrf_initialization) + initial_clone_index = rectangle_initialize_initial_clone( + coords, config["n_clones"], random_state=r_hmrf_initialization + ) else: - initial_clone_index = rectangle_initialize_initial_clone_mix(coords, config["n_clones"], single_tumor_prop, threshold=config["tumorprop_threshold"], random_state=r_hmrf_initialization) + initial_clone_index = rectangle_initialize_initial_clone_mix( + coords, + config["n_clones"], + single_tumor_prop, + threshold=config["tumorprop_threshold"], + random_state=r_hmrf_initialization, + ) # create directory - p = subprocess.Popen(f"mkdir -p {outdir}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - out,err = p.communicate() + p = subprocess.Popen( + f"mkdir -p {outdir}", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + ) + out, err = p.communicate() # save clone initialization into npz file prefix = "allspots" if not Path(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz").exists(): initial_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"num_iterations":0, "round-1_assignment":initial_assignment} + allres = {"num_iterations": 0, "round-1_assignment": initial_assignment} np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz", **allres) # run HMRF + HMM # store the results of each iteration of HMRF in a npz file outdir/prefix_nstates{config['n_states']}_sp.npz # if a specific iteration is computed, hmrf will directly load the results from the file if config["tumorprop_file"] is None: - hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"]) + hmrf_concatenate_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat, + adjacency_mat=adjacency_mat, + sample_ids=sample_ids, + max_iter_outer=config["max_iter_outer"], + 
nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="sp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + ) else: - hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"], tumorprop_threshold=config["tumorprop_threshold"]) - + hmrfmix_concatenate_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat, + adjacency_mat=adjacency_mat, + sample_ids=sample_ids, + max_iter_outer=config["max_iter_outer"], + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="sp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + tumorprop_threshold=config["tumorprop_threshold"], + ) + # merge by thresholding BAF profile similarity - res = load_hmrf_last_iteration(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz") + res = load_hmrf_last_iteration( + f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz" + ) n_obs = single_X.shape[0] if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"]))]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"]))], single_tumor_prop, threshold=config["tumorprop_threshold"]) - tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ], + single_tumor_prop, 
+ threshold=config["tumorprop_threshold"], + ) + tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) # merge "similar" clones from the initial number of clones. # "similar" defined by Neyman Pearson statistics/ Likelihood ratios P(clone A counts | BAF parameters for clone A) / P(clone A counts | BAF parameters for clone B) - merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, threshold=config["np_threshold"], minlength=config["np_eventminlen"], params="sp", tumor_prop=tumor_prop, hmmclass=hmm_nophasing_v2) + merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson( + X, + base_nb_mean, + total_bb_RD, + res, + threshold=config["np_threshold"], + minlength=config["np_eventminlen"], + params="sp", + tumor_prop=tumor_prop, + hmmclass=hmm_nophasing_v2, + ) print(f"BAF clone merging after comparing similarity: {merging_groups}") # if config["tumorprop_file"] is None: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD, min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD, + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] * n_obs, + ) else: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD, min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs, single_tumor_prop=single_tumor_prop, threshold=config["tumorprop_threshold"]) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD, + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] * n_obs, + single_tumor_prop=single_tumor_prop, + threshold=config["tumorprop_threshold"], + ) print(f"BAF clone merging after requiring minimum # spots: {merging_groups}") n_baf_clones = len(merging_groups) - np.savez(f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", **merged_res) + np.savez( + f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", **merged_res + ) # load merged results n_obs = single_X.shape[0] - merged_res = dict(np.load(f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", allow_pickle=True)) + merged_res = dict( + np.load( + f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", + allow_pickle=True, + ) + ) merged_baf_assignment = copy.copy(merged_res["new_assignment"]) n_baf_clones = len(np.unique(merged_baf_assignment)) pred = np.argmax(merged_res["log_gamma"], axis=0) - pred = np.array([ pred[(c*n_obs):(c*n_obs+n_obs)] for c in range(n_baf_clones) ]) - merged_baf_profiles = np.array([ np.where(pred[c,:] < config["n_states"], merged_res["new_p_binom"][pred[c,:]%config["n_states"], 0], 1-merged_res["new_p_binom"][pred[c,:]%config["n_states"], 0]) \ - for c in range(n_baf_clones) ]) - + pred = np.array( + [pred[(c * n_obs) : (c * n_obs + n_obs)] for c in range(n_baf_clones)] + ) + merged_baf_profiles = np.array( + [ + np.where( + pred[c, :] < config["n_states"], + merged_res["new_p_binom"][pred[c, :] % config["n_states"], 0], + 1 - merged_res["new_p_binom"][pred[c, :] % config["n_states"], 0], + ) + for c in range(n_baf_clones) + ] + ) + """ Refined clustering using BAF and RDR values. 
""" @@ -129,202 +271,600 @@ def main(configuration_file): if not config["bafonly"]: # Only used when assuming each spot is pure normal or tumor and if we don't know which spots are normal spots. # select normal spots - if (config["normalidx_file"] is None) and (config["tumorprop_file"] is None): + if (config["normalidx_file"] is None) and ( + config["tumorprop_file"] is None + ): EPS_BAF = 0.05 PERCENT_NORMAL = 40 vec_stds = np.std(np.log1p(copy_single_X_rdr @ smooth_mat), axis=0) - id_nearnormal_clone = np.argmin(np.sum( np.maximum(np.abs(merged_baf_profiles - 0.5)-EPS_BAF, 0), axis=1)) + id_nearnormal_clone = np.argmin( + np.sum( + np.maximum(np.abs(merged_baf_profiles - 0.5) - EPS_BAF, 0), + axis=1, + ) + ) while True: - stdthreshold = np.percentile(vec_stds[merged_res["new_assignment"] == id_nearnormal_clone], PERCENT_NORMAL) - normal_candidate = (vec_stds < stdthreshold) & (merged_res["new_assignment"] == id_nearnormal_clone) - if np.sum(copy_single_X_rdr[:, (normal_candidate==True)]) > single_X.shape[0] * 200 or PERCENT_NORMAL == 100: + stdthreshold = np.percentile( + vec_stds[merged_res["new_assignment"] == id_nearnormal_clone], + PERCENT_NORMAL, + ) + normal_candidate = (vec_stds < stdthreshold) & ( + merged_res["new_assignment"] == id_nearnormal_clone + ) + if ( + np.sum(copy_single_X_rdr[:, (normal_candidate == True)]) + > single_X.shape[0] * 200 + or PERCENT_NORMAL == 100 + ): break PERCENT_NORMAL += 10 - pd.Series(barcodes[normal_candidate==True].index).to_csv(f"{outdir}/normal_candidate_barcodes.txt", header=False, index=False) + pd.Series(barcodes[normal_candidate == True].index).to_csv( + f"{outdir}/normal_candidate_barcodes.txt", header=False, index=False + ) - elif (not config["normalidx_file"] is None): + elif not config["normalidx_file"] is None: # single_base_nb_mean has already been added in loading data step. if not config["tumorprop_file"] is None: - logger.warning(f"Mixed sources of information for normal spots! Using {config['normalidx_file']}") - + logger.warning( + f"Mixed sources of information for normal spots! Using {config['normalidx_file']}" + ) + # If tumor purity is provided, we can use it to select normal spots. 
else: for prop_threshold in np.arange(0.05, 0.6, 0.05): - normal_candidate = (single_tumor_prop < prop_threshold) - if np.sum(copy_single_X_rdr[:, (normal_candidate==True)]) > single_X.shape[0] * 200: + normal_candidate = single_tumor_prop < prop_threshold + if ( + np.sum(copy_single_X_rdr[:, (normal_candidate == True)]) + > single_X.shape[0] * 200 + ): break # To avoid allele-specific expression that are not relevant to CNA, filter bins where normal pseudobulk has large |BAF - 0.5| index_normal = np.where(normal_candidate)[0] - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \ - single_X, single_base_nb_mean, single_total_bb_RD, config['nu'], config['logphase_shift'], index_normal, config['geneticmap_file']) + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + df_gene_snp, + ) = bin_selection_basedon_normal( + df_gene_snp, + single_X, + single_base_nb_mean, + single_total_bb_RD, + config["nu"], + config["logphase_shift"], + index_normal, + config["geneticmap_file"], + ) assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0] df_bininfo = genesnp_to_bininfo(df_gene_snp) - copy_single_X_rdr = copy.copy(single_X[:,0,:]) + copy_single_X_rdr = copy.copy(single_X[:, 0, :]) # If a gene has way higher expression than adjacent genes, its transcript count will dominate RDR values # To avoid the domination, filter out high-UMI DE genes, which may bias RDR estimates # Assume the remaining genes will still carry the CNA info. - copy_single_X_rdr, _ = filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=sample_list, sample_ids=sample_ids) + copy_single_X_rdr, _ = filter_de_genes_tri( + exp_counts, + df_bininfo, + normal_candidate, + sample_list=sample_list, + sample_ids=sample_ids, + ) MIN_NORMAL_COUNT_PERBIN = 20 - bidx_inconfident = np.where( np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) < MIN_NORMAL_COUNT_PERBIN )[0] - rdr_normal = np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) + bidx_inconfident = np.where( + np.sum(copy_single_X_rdr[:, (normal_candidate == True)], axis=1) + < MIN_NORMAL_COUNT_PERBIN + )[0] + rdr_normal = np.sum( + copy_single_X_rdr[:, (normal_candidate == True)], axis=1 + ) rdr_normal[bidx_inconfident] = 0 rdr_normal = rdr_normal / np.sum(rdr_normal) - copy_single_X_rdr[bidx_inconfident, :] = 0 # avoid ill-defined distributions if normal has 0 count in that bin. - copy_single_base_nb_mean = rdr_normal.reshape(-1,1) @ np.sum(copy_single_X_rdr, axis=0).reshape(1,-1) - + copy_single_X_rdr[bidx_inconfident, :] = ( + 0 # avoid ill-defined distributions if normal has 0 count in that bin. 
+ ) + copy_single_base_nb_mean = rdr_normal.reshape(-1, 1) @ np.sum( + copy_single_X_rdr, axis=0 + ).reshape(1, -1) + # adding back RDR signal - single_X[:,0,:] = copy_single_X_rdr + single_X[:, 0, :] = copy_single_X_rdr single_base_nb_mean = copy_single_base_nb_mean n_obs = single_X.shape[0] # save binned data - np.savez(f"{outdir}/binned_data.npz", lengths=lengths, single_X=single_X, single_base_nb_mean=single_base_nb_mean, single_total_bb_RD=single_total_bb_RD, log_sitewise_transmat=log_sitewise_transmat, single_tumor_prop=(None if config["tumorprop_file"] is None else single_tumor_prop)) + np.savez( + f"{outdir}/binned_data.npz", + lengths=lengths, + single_X=single_X, + single_base_nb_mean=single_base_nb_mean, + single_total_bb_RD=single_total_bb_RD, + log_sitewise_transmat=log_sitewise_transmat, + single_tumor_prop=( + None if config["tumorprop_file"] is None else single_tumor_prop + ), + ) # run HMRF on each clone individually to further split BAF clone by RDR+BAF signal for bafc in range(n_baf_clones): prefix = f"clone{bafc}" idx_spots = np.where(merged_baf_assignment == bafc)[0] - if np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20: # put a minimum B allele read count on pseudobulk to split clones + if ( + np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20 + ): # put a minimum B allele read count on pseudobulk to split clones continue # initialize clone # write the initialization in a npz file outdir/prefix_nstates{config['n_states']}_smp.npz if config["tumorprop_file"] is None: - initial_clone_index = rectangle_initialize_initial_clone(coords[idx_spots], config['n_clones_rdr'], random_state=r_hmrf_initialization) + initial_clone_index = rectangle_initialize_initial_clone( + coords[idx_spots], + config["n_clones_rdr"], + random_state=r_hmrf_initialization, + ) else: - initial_clone_index = rectangle_initialize_initial_clone_mix(coords[idx_spots], config['n_clones_rdr'], single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"], random_state=r_hmrf_initialization) - if not Path(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz").exists(): + initial_clone_index = rectangle_initialize_initial_clone_mix( + coords[idx_spots], + config["n_clones_rdr"], + single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + random_state=r_hmrf_initialization, + ) + if not Path( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz" + ).exists(): initial_assignment = np.zeros(len(idx_spots), dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"barcodes":barcodes[idx_spots], "num_iterations":0, "round-1_assignment":initial_assignment} - np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", **allres) - + allres = { + "barcodes": barcodes[idx_spots], + "num_iterations": 0, + "round-1_assignment": initial_assignment, + } + np.savez( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", + **allres, + ) + # HMRF + HMM using RDR information copy_slice_sample_ids = copy.copy(sample_ids[idx_spots]) if config["tumorprop_file"] is None: - hmrf_concatenate_pipeline(outdir, prefix, single_X[:,:,idx_spots], lengths, single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat[np.ix_(idx_spots,idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots,idx_spots)], sample_ids=copy_slice_sample_ids, 
max_iter_outer=10, nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"]) + hmrf_concatenate_pipeline( + outdir, + prefix, + single_X[:, :, idx_spots], + lengths, + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], + adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], + sample_ids=copy_slice_sample_ids, + max_iter_outer=10, + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + ) else: - hmrfmix_concatenate_pipeline(outdir, prefix, single_X[:,:,idx_spots], lengths, single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], single_tumor_prop[idx_spots], initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat[np.ix_(idx_spots,idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots,idx_spots)], sample_ids=copy_slice_sample_ids, max_iter_outer=10, nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"], tumorprop_threshold=config["tumorprop_threshold"]) + hmrfmix_concatenate_pipeline( + outdir, + prefix, + single_X[:, :, idx_spots], + lengths, + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + single_tumor_prop[idx_spots], + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], + adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], + sample_ids=copy_slice_sample_ids, + max_iter_outer=10, + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + tumorprop_threshold=config["tumorprop_threshold"], + ) ##### combine results across clones ##### - res_combine = {"prev_assignment":np.zeros(single_X.shape[2], dtype=int)} + res_combine = {"prev_assignment": np.zeros(single_X.shape[2], dtype=int)} offset_clone = 0 for bafc in 
range(n_baf_clones): prefix = f"clone{bafc}" - allres = dict( np.load(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", allow_pickle=True) ) + allres = dict( + np.load( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) r = allres["num_iterations"] - 1 - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} - idx_spots = np.where(barcodes.isin( allres["barcodes"] ))[0] + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } + idx_spots = np.where(barcodes.isin(allres["barcodes"]))[0] if len(np.unique(res["new_assignment"])) == 1: n_merged_clones = 1 c = res["new_assignment"][0] merged_res = copy.copy(res) merged_res["new_assignment"] = np.zeros(len(idx_spots), dtype=int) try: - log_gamma = res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)].reshape((2*config["n_states"], n_obs, 1)) + log_gamma = res["log_gamma"][ + :, (c * n_obs) : (c * n_obs + n_obs) + ].reshape((2 * config["n_states"], n_obs, 1)) except: - log_gamma = res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)].reshape((config["n_states"], n_obs, 1)) - pred_cnv = res["pred_cnv"][ (c*n_obs):(c*n_obs+n_obs) ].reshape((-1,1)) + log_gamma = res["log_gamma"][ + :, (c * n_obs) : (c * n_obs + n_obs) + ].reshape((config["n_states"], n_obs, 1)) + pred_cnv = res["pred_cnv"][ + (c * n_obs) : (c * n_obs + n_obs) + ].reshape((-1, 1)) else: if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"])) ]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"])) ], single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"]) - tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) - merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, threshold=config["np_threshold"], 
minlength=config["np_eventminlen"], params="smp", tumor_prop=tumor_prop, hmmclass=hmm_nophasing_v2) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ], + single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + ) + ) + tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + merging_groups, merged_res = ( + similarity_components_rdrbaf_neymanpearson( + X, + base_nb_mean, + total_bb_RD, + res, + threshold=config["np_threshold"], + minlength=config["np_eventminlen"], + params="smp", + tumor_prop=tumor_prop, + hmmclass=hmm_nophasing_v2, + ) + ) print(f"part {bafc} merging_groups: {merging_groups}") # if config["tumorprop_file"] is None: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD[:,idx_spots], min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD[:, idx_spots], + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] + * n_obs, + ) else: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD[:,idx_spots], min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs, single_tumor_prop=single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"]) - print(f"part {bafc} merging after requiring minimum # spots: {merging_groups}") + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD[:, idx_spots], + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] + * n_obs, + single_tumor_prop=single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + ) + print( + f"part {bafc} merging after requiring minimum # spots: {merging_groups}" + ) # compute posterior using the newly merged pseudobulk n_merged_clones = len(merging_groups) tmp = copy.copy(merged_res["new_assignment"]) if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(merged_res["new_assignment"]==c)[0] for c in range(n_merged_clones)]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(merged_res["new_assignment"] == c)[0] + for c in range(n_merged_clones) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(merged_res["new_assignment"]==c)[0] for c in range(n_merged_clones)], single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"]) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(merged_res["new_assignment"] == c)[0] + for c in range(n_merged_clones) + ], + 
single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + ) + ) # - merged_res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), config["n_states"], \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) if not tumor_prop is None else None, \ - hmmclass=hmm_nophasing_v2, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, init_log_mu=res["new_log_mu"], init_p_binom=res["new_p_binom"], init_alphas=res["new_alphas"], init_taus=res["new_taus"], max_iter=config["max_iter"], tol=config["tol"], lambd=np.sum(base_nb_mean,axis=1)/np.sum(base_nb_mean), sample_length=np.ones(X.shape[2],dtype=int)*X.shape[0]) + merged_res = pipeline_baum_welch( + None, + np.vstack( + [X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")] + ).T.reshape(-1, 2, 1), + np.tile(lengths, X.shape[2]), + config["n_states"], + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + np.tile(log_sitewise_transmat, X.shape[2]), + ( + np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + if not tumor_prop is None + else None + ), + hmmclass=hmm_nophasing_v2, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + init_log_mu=res["new_log_mu"], + init_p_binom=res["new_p_binom"], + init_alphas=res["new_alphas"], + init_taus=res["new_taus"], + max_iter=config["max_iter"], + tol=config["tol"], + lambd=np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean), + sample_length=np.ones(X.shape[2], dtype=int) * X.shape[0], + ) merged_res["new_assignment"] = copy.copy(tmp) - merged_res = combine_similar_states_across_clones(X, base_nb_mean, total_bb_RD, merged_res, params="smp", tumor_prop=np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) if not tumor_prop is None else None, hmmclass=hmm_nophasing_v2, merge_threshold=0.1) - log_gamma = np.stack([ merged_res["log_gamma"][:,(c*n_obs):(c*n_obs+n_obs)] for c in range(n_merged_clones) ], axis=-1) - pred_cnv = np.vstack([ merged_res["pred_cnv"][(c*n_obs):(c*n_obs+n_obs)] for c in range(n_merged_clones) ]).T + merged_res = combine_similar_states_across_clones( + X, + base_nb_mean, + total_bb_RD, + merged_res, + params="smp", + tumor_prop=( + np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + if not tumor_prop is None + else None + ), + hmmclass=hmm_nophasing_v2, + merge_threshold=0.1, + ) + log_gamma = np.stack( + [ + merged_res["log_gamma"][ + :, (c * n_obs) : (c * n_obs + n_obs) + ] + for c in range(n_merged_clones) + ], + axis=-1, + ) + pred_cnv = np.vstack( + [ + merged_res["pred_cnv"][(c * n_obs) : (c * n_obs + n_obs)] + for c in range(n_merged_clones) + ] + ).T # # add to res_combine if len(res_combine) == 1: - res_combine.update({"new_log_mu":np.hstack([ merged_res["new_log_mu"] ] * n_merged_clones), "new_alphas":np.hstack([ merged_res["new_alphas"] ] * n_merged_clones), \ - "new_p_binom":np.hstack([ merged_res["new_p_binom"] ] * n_merged_clones), "new_taus":np.hstack([ 
merged_res["new_taus"] ] * n_merged_clones), \ - "log_gamma":log_gamma, "pred_cnv":pred_cnv}) + res_combine.update( + { + "new_log_mu": np.hstack( + [merged_res["new_log_mu"]] * n_merged_clones + ), + "new_alphas": np.hstack( + [merged_res["new_alphas"]] * n_merged_clones + ), + "new_p_binom": np.hstack( + [merged_res["new_p_binom"]] * n_merged_clones + ), + "new_taus": np.hstack( + [merged_res["new_taus"]] * n_merged_clones + ), + "log_gamma": log_gamma, + "pred_cnv": pred_cnv, + } + ) else: - res_combine.update({"new_log_mu":np.hstack([res_combine["new_log_mu"]] + [ merged_res["new_log_mu"] ] * n_merged_clones), "new_alphas":np.hstack([res_combine["new_alphas"]] + [ merged_res["new_alphas"] ] * n_merged_clones), \ - "new_p_binom":np.hstack([res_combine["new_p_binom"]] + [ merged_res["new_p_binom"] ] * n_merged_clones), "new_taus":np.hstack([res_combine["new_taus"]] + [ merged_res["new_taus"] ] * n_merged_clones), \ - "log_gamma":np.dstack([res_combine["log_gamma"], log_gamma ]), "pred_cnv":np.hstack([res_combine["pred_cnv"], pred_cnv])}) - res_combine["prev_assignment"][idx_spots] = merged_res["new_assignment"] + offset_clone + res_combine.update( + { + "new_log_mu": np.hstack( + [res_combine["new_log_mu"]] + + [merged_res["new_log_mu"]] * n_merged_clones + ), + "new_alphas": np.hstack( + [res_combine["new_alphas"]] + + [merged_res["new_alphas"]] * n_merged_clones + ), + "new_p_binom": np.hstack( + [res_combine["new_p_binom"]] + + [merged_res["new_p_binom"]] * n_merged_clones + ), + "new_taus": np.hstack( + [res_combine["new_taus"]] + + [merged_res["new_taus"]] * n_merged_clones + ), + "log_gamma": np.dstack( + [res_combine["log_gamma"], log_gamma] + ), + "pred_cnv": np.hstack([res_combine["pred_cnv"], pred_cnv]), + } + ) + res_combine["prev_assignment"][idx_spots] = ( + merged_res["new_assignment"] + offset_clone + ) offset_clone += n_merged_clones # temp: make dispersions the same across all clones - res_combine["new_alphas"][:,:] = np.max(res_combine["new_alphas"]) - res_combine["new_taus"][:,:] = np.min(res_combine["new_taus"]) + res_combine["new_alphas"][:, :] = np.max(res_combine["new_alphas"]) + res_combine["new_taus"][:, :] = np.min(res_combine["new_taus"]) # end temp n_final_clones = len(np.unique(res_combine["prev_assignment"])) # per-sample weights across clones log_persample_weights = np.zeros((n_final_clones, len(sample_list))) for sidx in range(len(sample_list)): index = np.where(sample_ids == sidx)[0] - this_persample_weight = np.bincount(res_combine["prev_assignment"][index], minlength=n_final_clones) / len(index) - log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) - log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) + this_persample_weight = np.bincount( + res_combine["prev_assignment"][index], minlength=n_final_clones + ) / len(index) + log_persample_weights[:, sidx] = np.where( + this_persample_weight > 0, np.log(this_persample_weight), -50 + ) + log_persample_weights[:, sidx] = log_persample_weights[ + :, sidx + ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) # final re-assignment across all clones using estimated RDR + BAF # The following step may not be needed because of other improvements. And it may cause mistakes in some cases. 
if config["tumorprop_file"] is None: if config["nodepotential"] == "max": - pred = np.vstack([ np.argmax(res_combine["log_gamma"][:,:,c], axis=0) for c in range(res_combine["log_gamma"].shape[2]) ]).T - new_assignment, single_llf, total_llf, posterior = aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, res_combine, pred, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + pred = np.vstack( + [ + np.argmax(res_combine["log_gamma"][:, :, c], axis=0) + for c in range(res_combine["log_gamma"].shape[2]) + ] + ).T + new_assignment, single_llf, total_llf, posterior = ( + aggr_hmrf_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res_combine, + pred, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) elif config["nodepotential"] == "weighted_sum": - new_assignment, single_llf, total_llf, posterior = hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, res_combine, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + new_assignment, single_llf, total_llf, posterior = ( + hmrf_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res_combine, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) else: if config["nodepotential"] == "max": - pred = np.vstack([ np.argmax(res_combine["log_gamma"][:,:,c], axis=0) for c in range(res_combine["log_gamma"].shape[2]) ]).T - new_assignment, single_llf, total_llf, posterior = aggr_hmrfmix_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res_combine, pred, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + pred = np.vstack( + [ + np.argmax(res_combine["log_gamma"][:, :, c], axis=0) + for c in range(res_combine["log_gamma"].shape[2]) + ] + ).T + new_assignment, single_llf, total_llf, posterior = ( + aggr_hmrfmix_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res_combine, + pred, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) elif config["nodepotential"] == "weighted_sum": - new_assignment, single_llf, total_llf, posterior = hmrfmix_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res_combine, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + new_assignment, single_llf, total_llf, posterior = ( + hmrfmix_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res_combine, + smooth_mat, + adjacency_mat, + 
res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) res_combine["total_llf"] = total_llf res_combine["new_assignment"] = new_assignment # re-order clones such that normal clones are always clone 0 - res_combine, posterior = reorder_results(res_combine, posterior, single_tumor_prop) + res_combine, posterior = reorder_results( + res_combine, posterior, single_tumor_prop + ) # save results - np.savez(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", **res_combine) + np.savez( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + **res_combine, + ) np.save(f"{outdir}/posterior_clone_probability.npy", posterior) - + ##### infer integer copy ##### - res_combine = dict(np.load(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True)) + res_combine = dict( + np.load( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) final_clone_ids = np.sort(np.unique(res_combine["new_assignment"])) nonempty_clone_ids = copy.copy(final_clone_ids) # add clone 0 as normal clone if it doesn't appear in final_clone_ids @@ -332,7 +872,7 @@ def main(configuration_file): final_clone_ids = np.append(0, final_clone_ids) # chr position medfix = ["", "_diploid", "_triploid", "_tetraploid"] - for o,max_medploidy in enumerate([None, 2, 3, 4]): + for o, max_medploidy in enumerate([None, 2, 3, 4]): # A/B copy number per bin allele_specific_copy = [] # A/B copy number per state @@ -340,61 +880,204 @@ def main(configuration_file): df_genelevel_cnv = None if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res_combine["new_assignment"]==cid)[0] for cid in final_clone_ids]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res_combine["new_assignment"] == cid)[0] + for cid in final_clone_ids + ], + ) else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res_combine["new_assignment"]==cid)[0] for cid in final_clone_ids], single_tumor_prop, threshold=config["tumorprop_threshold"]) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res_combine["new_assignment"] == cid)[0] + for cid in final_clone_ids + ], + single_tumor_prop, + threshold=config["tumorprop_threshold"], + ) + ) for s, cid in enumerate(final_clone_ids): - if np.sum(base_nb_mean[:,s]) == 0: + if np.sum(base_nb_mean[:, s]) == 0: continue # adjust log_mu such that sum_bin lambda * np.exp(log_mu) = 1 - lambd = base_nb_mean[:,s] / np.sum(base_nb_mean[:,s]) - this_pred_cnv = res_combine["pred_cnv"][:,s] - adjusted_log_mu = np.log( np.exp(res_combine["new_log_mu"][:,s]) / np.sum(np.exp(res_combine["new_log_mu"][this_pred_cnv,s]) * lambd) ) + lambd = base_nb_mean[:, s] / np.sum(base_nb_mean[:, s]) + this_pred_cnv = res_combine["pred_cnv"][:, s] + adjusted_log_mu = np.log( + np.exp(res_combine["new_log_mu"][:, s]) + / np.sum( + np.exp(res_combine["new_log_mu"][this_pred_cnv, s]) * lambd + ) + ) if not max_medploidy is None: - best_integer_copies, _ = hill_climbing_integer_copynumber_oneclone(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, 
max_medploidy=max_medploidy) + best_integer_copies, _ = ( + hill_climbing_integer_copynumber_oneclone( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + max_medploidy=max_medploidy, + ) + ) else: try: - best_integer_copies, _ = hill_climbing_integer_copynumber_fixdiploid(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, nonbalance_bafdist=config["nonbalance_bafdist"], nondiploid_rdrdist=config["nondiploid_rdrdist"]) + best_integer_copies, _ = ( + hill_climbing_integer_copynumber_fixdiploid( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + nonbalance_bafdist=config["nonbalance_bafdist"], + nondiploid_rdrdist=config["nondiploid_rdrdist"], + ) + ) except: try: - best_integer_copies, _ = hill_climbing_integer_copynumber_fixdiploid(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, nonbalance_bafdist=config["nonbalance_bafdist"], nondiploid_rdrdist=config["nondiploid_rdrdist"], min_prop_threshold=0.02) + best_integer_copies, _ = ( + hill_climbing_integer_copynumber_fixdiploid( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + nonbalance_bafdist=config["nonbalance_bafdist"], + nondiploid_rdrdist=config["nondiploid_rdrdist"], + min_prop_threshold=0.02, + ) + ) except: finding_distate_failed = True continue - print(f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}") + print( + f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}" + ) # - allele_specific_copy.append( pd.DataFrame( best_integer_copies[res_combine["pred_cnv"][:,s], 0].reshape(1,-1), index=[f"clone{cid} A"], columns=np.arange(n_obs) ) ) - allele_specific_copy.append( pd.DataFrame( best_integer_copies[res_combine["pred_cnv"][:,s], 1].reshape(1,-1), index=[f"clone{cid} B"], columns=np.arange(n_obs) ) ) + allele_specific_copy.append( + pd.DataFrame( + best_integer_copies[ + res_combine["pred_cnv"][:, s], 0 + ].reshape(1, -1), + index=[f"clone{cid} A"], + columns=np.arange(n_obs), + ) + ) + allele_specific_copy.append( + pd.DataFrame( + best_integer_copies[ + res_combine["pred_cnv"][:, s], 1 + ].reshape(1, -1), + index=[f"clone{cid} B"], + columns=np.arange(n_obs), + ) + ) # - state_cnv.append( pd.DataFrame( res_combine["new_log_mu"][:,s].reshape(-1,1), columns=[f"clone{cid} logmu"], index=np.arange(config['n_states']) ) ) - state_cnv.append( pd.DataFrame( res_combine["new_p_binom"][:,s].reshape(-1,1), columns=[f"clone{cid} p"], index=np.arange(config['n_states']) ) ) - state_cnv.append( pd.DataFrame( best_integer_copies[:,0].reshape(-1,1), columns=[f"clone{cid} A"], index=np.arange(config['n_states']) ) ) - state_cnv.append( pd.DataFrame( best_integer_copies[:,1].reshape(-1,1), columns=[f"clone{cid} B"], index=np.arange(config['n_states']) ) ) + state_cnv.append( + pd.DataFrame( + res_combine["new_log_mu"][:, s].reshape(-1, 1), + columns=[f"clone{cid} logmu"], + index=np.arange(config["n_states"]), + ) + ) + state_cnv.append( + pd.DataFrame( + res_combine["new_p_binom"][:, s].reshape(-1, 1), + columns=[f"clone{cid} p"], + index=np.arange(config["n_states"]), + ) + ) + state_cnv.append( + pd.DataFrame( + best_integer_copies[:, 0].reshape(-1, 1), + columns=[f"clone{cid} A"], + index=np.arange(config["n_states"]), + ) + ) + state_cnv.append( + pd.DataFrame( + best_integer_copies[:, 1].reshape(-1, 1), + columns=[f"clone{cid} B"], + index=np.arange(config["n_states"]), + ) + ) # # 
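A small sketch of the two bookkeeping steps above, with toy arrays standing in for the HMM outputs: rescaling `log_mu` so that the baseline-weighted RDR sums to one over bins, and expanding per-state integer (A, B) copies to per-bin copies by indexing with the predicted state sequence. The array values are illustrative only:

    import numpy as np

    n_states, n_bins = 3, 5
    log_mu = np.array([-0.2, 0.0, 0.4])        # per-state log RDR (toy)
    pred_cnv = np.array([0, 1, 1, 2, 1])       # HMM state per bin (toy)
    lambd = np.full(n_bins, 1.0 / n_bins)      # baseline expression proportion per bin

    # rescale so that sum_bin lambd * exp(adjusted_log_mu[state of bin]) == 1
    adjusted_log_mu = np.log(np.exp(log_mu) / np.sum(np.exp(log_mu[pred_cnv]) * lambd))
    assert np.isclose(np.sum(lambd * np.exp(adjusted_log_mu[pred_cnv])), 1.0)

    # expand per-state integer A/B copies to per-bin copies via the state sequence
    best_integer_copies = np.array([[1, 1], [2, 1], [3, 1]])   # toy A/B copies per state
    per_bin_A = best_integer_copies[pred_cnv, 0]
    per_bin_B = best_integer_copies[pred_cnv, 1]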
tmpdf = get_genelevel_cnv_oneclone(best_integer_copies[res_combine["pred_cnv"][:,s], 0], best_integer_copies[res_combine["pred_cnv"][:,s], 1], x_gene_list) # tmpdf.columns = [f"clone{s} A", f"clone{s} B"] - bin_Acopy_mappers = {i:x for i,x in enumerate(best_integer_copies[res_combine["pred_cnv"][:,s], 0])} - bin_Bcopy_mappers = {i:x for i,x in enumerate(best_integer_copies[res_combine["pred_cnv"][:,s], 1])} - tmpdf = pd.DataFrame({"gene":df_gene_snp[df_gene_snp.is_interval].gene, f"clone{s} A":df_gene_snp[df_gene_snp.is_interval]['bin_id'].map(bin_Acopy_mappers), \ - f"clone{s} B":df_gene_snp[df_gene_snp.is_interval]['bin_id'].map(bin_Bcopy_mappers)}).set_index('gene') + bin_Acopy_mappers = { + i: x + for i, x in enumerate( + best_integer_copies[res_combine["pred_cnv"][:, s], 0] + ) + } + bin_Bcopy_mappers = { + i: x + for i, x in enumerate( + best_integer_copies[res_combine["pred_cnv"][:, s], 1] + ) + } + tmpdf = pd.DataFrame( + { + "gene": df_gene_snp[df_gene_snp.is_interval].gene, + f"clone{s} A": df_gene_snp[df_gene_snp.is_interval][ + "bin_id" + ].map(bin_Acopy_mappers), + f"clone{s} B": df_gene_snp[df_gene_snp.is_interval][ + "bin_id" + ].map(bin_Bcopy_mappers), + } + ).set_index("gene") if df_genelevel_cnv is None: - df_genelevel_cnv = copy.copy( tmpdf[~tmpdf[f"clone{s} A"].isnull()].astype(int) ) + df_genelevel_cnv = copy.copy( + tmpdf[~tmpdf[f"clone{s} A"].isnull()].astype(int) + ) else: - df_genelevel_cnv = df_genelevel_cnv.join( tmpdf[~tmpdf[f"clone{s} A"].isnull()].astype(int) ) + df_genelevel_cnv = df_genelevel_cnv.join( + tmpdf[~tmpdf[f"clone{s} A"].isnull()].astype(int) + ) if len(state_cnv) == 0: continue # output gene-level copy number - df_genelevel_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_genelevel.tsv", header=True, index=True, sep="\t") + df_genelevel_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_genelevel.tsv", + header=True, + index=True, + sep="\t", + ) # output segment-level copy number allele_specific_copy = pd.concat(allele_specific_copy) - df_seglevel_cnv = pd.DataFrame({"CHR":df_bininfo.CHR.values, "START":df_bininfo.START.values, "END":df_bininfo.END.values }) - df_seglevel_cnv = df_seglevel_cnv.join( allele_specific_copy.T ) - df_seglevel_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_seglevel.tsv", header=True, index=False, sep="\t") + df_seglevel_cnv = pd.DataFrame( + { + "CHR": df_bininfo.CHR.values, + "START": df_bininfo.START.values, + "END": df_bininfo.END.values, + } + ) + df_seglevel_cnv = df_seglevel_cnv.join(allele_specific_copy.T) + df_seglevel_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_seglevel.tsv", + header=True, + index=False, + sep="\t", + ) # output per-state copy number - state_cnv = functools.reduce(lambda left,right: pd.merge(left,right, left_index=True, right_index=True, how='inner'), state_cnv) - state_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_perstate.tsv", header=True, index=False, sep="\t") + state_cnv = functools.reduce( + lambda left, right: pd.merge( + left, right, left_index=True, right_index=True, how="inner" + ), + state_cnv, + ) + state_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_perstate.tsv", + header=True, + index=False, + sep="\t", + ) # # # # posterior using integer-copy numbers # log_persample_weights = np.zeros((len(nonempty_clone_ids), len(sample_list))) @@ -407,12 +1090,16 @@ def main(configuration_file): # df_posterior = clonelabel_posterior_withinteger(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, state_cnv, res_combine, pred, \ # smooth_mat, adjacency_mat, res_combine["new_assignment"], sample_ids, base_nb_mean, 
log_persample_weights, config["spatial_weight"], hmmclass=hmm_nophasing_v2) # df_posterior.to_pickle(f"{outdir}/posterior{medfix[o]}.pkl") - + ##### output clone label ##### - df_clone_label = pd.DataFrame({"clone_label":res_combine["new_assignment"]}, index=barcodes) + df_clone_label = pd.DataFrame( + {"clone_label": res_combine["new_assignment"]}, index=barcodes + ) if not config["tumorprop_file"] is None: df_clone_label["tumor_proportion"] = single_tumor_prop - df_clone_label.to_csv(f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t") + df_clone_label.to_csv( + f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t" + ) ##### plotting ##### # make a directory for plots @@ -421,48 +1108,158 @@ def main(configuration_file): # plot RDR and BAF cn_file = f"{outdir}/cnv_diploid_seglevel.tsv" - fig = plot_rdr_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, remove_xticks=True, rdr_ylim=5, chrtext_shift=-0.3, base_height=3.2, pointsize=30, palette="tab10") - fig.savefig(f"{outdir}/plots/rdr_baf_defaultcolor.pdf", transparent=True, bbox_inches="tight") + fig = plot_rdr_baf( + configuration_file, + r_hmrf_initialization, + cn_file, + clone_ids=None, + remove_xticks=True, + rdr_ylim=5, + chrtext_shift=-0.3, + base_height=3.2, + pointsize=30, + palette="tab10", + ) + fig.savefig( + f"{outdir}/plots/rdr_baf_defaultcolor.pdf", + transparent=True, + bbox_inches="tight", + ) # plot allele-specific copy number - for o,max_medploidy in enumerate([None, 2, 3, 4]): + for o, max_medploidy in enumerate([None, 2, 3, 4]): cn_file = f"{outdir}/cnv{medfix[o]}_seglevel.tsv" if not Path(cn_file).exists(): continue df_cnv = pd.read_csv(cn_file, header=0, sep="\t") df_cnv = expand_df_cnv(df_cnv) - df_cnv = df_cnv[~df_cnv.iloc[:,-1].isnull()] - fig, axes = plt.subplots(1, 1, figsize=(15, 0.9*len(final_clone_ids) + 0.6), dpi=200, facecolor="white") - axes = plot_acn_from_df_anotherscheme(df_cnv, axes, chrbar_pos='top', chrbar_thickness=0.3, add_legend=False, remove_xticks=True) + df_cnv = df_cnv[~df_cnv.iloc[:, -1].isnull()] + fig, axes = plt.subplots( + 1, + 1, + figsize=(15, 0.9 * len(final_clone_ids) + 0.6), + dpi=200, + facecolor="white", + ) + axes = plot_acn_from_df_anotherscheme( + df_cnv, + axes, + chrbar_pos="top", + chrbar_thickness=0.3, + add_legend=False, + remove_xticks=True, + ) fig.tight_layout() - fig.savefig(f"{outdir}/plots/acn_genome{medfix[o]}.pdf", transparent=True, bbox_inches="tight") + fig.savefig( + f"{outdir}/plots/acn_genome{medfix[o]}.pdf", + transparent=True, + bbox_inches="tight", + ) # additionally plot the allele-specific copy number per region if not config["supervision_clone_file"] is None: - fig, axes = plt.subplots(1, 1, figsize=(15, 0.6*len(unique_clone_ids) + 0.4), dpi=200, facecolor="white") + fig, axes = plt.subplots( + 1, + 1, + figsize=(15, 0.6 * len(unique_clone_ids) + 0.4), + dpi=200, + facecolor="white", + ) merged_df_cnv = pd.read_csv(cn_file, header=0, sep="\t") df_cnv = merged_df_cnv[["CHR", "START", "END"]] - df_cnv = df_cnv.join( pd.DataFrame({f"clone{x} A":merged_df_cnv[f"clone{res_combine['new_assignment'][i]} A"] for i,x in enumerate(unique_clone_ids)}) ) - df_cnv = df_cnv.join( pd.DataFrame({f"clone{x} B":merged_df_cnv[f"clone{res_combine['new_assignment'][i]} B"] for i,x in enumerate(unique_clone_ids)}) ) + df_cnv = df_cnv.join( + pd.DataFrame( + { + f"clone{x} A": merged_df_cnv[ + f"clone{res_combine['new_assignment'][i]} A" + ] + for i, x in enumerate(unique_clone_ids) + } + ) + ) + df_cnv = df_cnv.join( + 
pd.DataFrame( + { + f"clone{x} B": merged_df_cnv[ + f"clone{res_combine['new_assignment'][i]} B" + ] + for i, x in enumerate(unique_clone_ids) + } + ) + ) df_cnv = expand_df_cnv(df_cnv) - clone_ids = np.concatenate([ unique_clone_ids[res_combine["new_assignment"]==c].astype(str) for c in final_clone_ids ]) - axes = plot_acn_from_df(df_cnv, axes, clone_ids=clone_ids, clone_names=[f"region {x}" for x in clone_ids], add_chrbar=True, add_arrow=False, chrbar_thickness=0.4/(0.6*len(unique_clone_ids) + 0.4), add_legend=True, remove_xticks=True) + clone_ids = np.concatenate( + [ + unique_clone_ids[res_combine["new_assignment"] == c].astype( + str + ) + for c in final_clone_ids + ] + ) + axes = plot_acn_from_df( + df_cnv, + axes, + clone_ids=clone_ids, + clone_names=[f"region {x}" for x in clone_ids], + add_chrbar=True, + add_arrow=False, + chrbar_thickness=0.4 / (0.6 * len(unique_clone_ids) + 0.4), + add_legend=True, + remove_xticks=True, + ) fig.tight_layout() - fig.savefig(f"{outdir}/plots/acn_genome{medfix[o]}_per_region.pdf", transparent=True, bbox_inches="tight") + fig.savefig( + f"{outdir}/plots/acn_genome{medfix[o]}_per_region.pdf", + transparent=True, + bbox_inches="tight", + ) # plot clones in space if not config["supervision_clone_file"] is None: before_assignments = pd.Series([None] * before_coords.shape[0]) - for i,c in enumerate(unique_clone_ids): - before_assignments.iloc[before_df_clones.clone_id.isin([c])] = f"clone {res_combine['new_assignment'][i]}" - fig = plot_clones_in_space(before_coords, before_assignments, sample_list, before_sample_ids, palette="Set2", labels=unique_clone_ids, label_coords=coords, label_sample_ids=sample_ids) - fig.savefig(f"{outdir}/plots/clone_spatial.pdf", transparent=True, bbox_inches="tight") + for i, c in enumerate(unique_clone_ids): + before_assignments.iloc[before_df_clones.clone_id.isin([c])] = ( + f"clone {res_combine['new_assignment'][i]}" + ) + fig = plot_clones_in_space( + before_coords, + before_assignments, + sample_list, + before_sample_ids, + palette="Set2", + labels=unique_clone_ids, + label_coords=coords, + label_sample_ids=sample_ids, + ) + fig.savefig( + f"{outdir}/plots/clone_spatial.pdf", + transparent=True, + bbox_inches="tight", + ) else: - assignment = pd.Series([f"clone {x}" for x in res_combine["new_assignment"]]) - fig = plot_individual_spots_in_space(coords, assignment, single_tumor_prop, sample_list=sample_list, sample_ids=sample_ids) - fig.savefig(f"{outdir}/plots/clone_spatial.pdf", transparent=True, bbox_inches="tight") - + assignment = pd.Series( + [f"clone {x}" for x in res_combine["new_assignment"]] + ) + fig = plot_individual_spots_in_space( + coords, + assignment, + single_tumor_prop, + sample_list=sample_list, + sample_ids=sample_ids, + ) + fig.savefig( + f"{outdir}/plots/clone_spatial.pdf", + transparent=True, + bbox_inches="tight", + ) + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str) + parser.add_argument( + "-c", + "--configfile", + help="configuration file of CalicoST", + required=True, + type=str, + ) args = parser.parse_args() - main(args.configfile) \ No newline at end of file + main(args.configfile) diff --git a/src/calicost/calicost_supervised.py b/src/calicost/calicost_supervised.py index d872cae..a881fff 100644 --- a/src/calicost/calicost_supervised.py +++ b/src/calicost/calicost_supervised.py @@ -8,7 +8,12 @@ import scanpy as sc import anndata import logging 
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) logger = logging.getLogger() import copy from pathlib import Path @@ -29,9 +34,11 @@ from matplotlib.lines import Line2D import matplotlib.patches as mpatches import seaborn -plt.rcParams.update({'font.size': 14}) + +plt.rcParams.update({"font.size": 14}) import mkl + mkl.set_num_threads(1) @@ -44,125 +51,304 @@ def main(configuration_file): for k in sorted(list(config.keys())): print(f"\t{k} : {config[k]}") - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, x_gene_list, \ - barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config) - + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + df_bininfo, + x_gene_list, + barcodes, + coords, + single_tumor_prop, + sample_list, + sample_ids, + adjacency_mat, + smooth_mat, + exp_counts, + ) = run_parse_n_load(config) + # normal baseline expression if tumorprop_file is provided if not config["tumorprop_file"] is None: EXPECTED_NORMAL_PROP = 0.05 - q = np.sort(single_tumor_prop)[ int(EXPECTED_NORMAL_PROP * len(barcodes)) ] - normal_candidate = ( single_tumor_prop <= q ) - + q = np.sort(single_tumor_prop)[int(EXPECTED_NORMAL_PROP * len(barcodes))] + normal_candidate = single_tumor_prop <= q + # copy_single_X_rdr,_ = filter_de_genes(exp_counts, x_gene_list, normal_candidate, sample_list=sample_list, sample_ids=sample_ids) - copy_single_X_rdr, _ = filter_de_genes_tri(exp_counts, x_gene_list, normal_candidate, sample_list=sample_list, sample_ids=sample_ids) + copy_single_X_rdr, _ = filter_de_genes_tri( + exp_counts, + x_gene_list, + normal_candidate, + sample_list=sample_list, + sample_ids=sample_ids, + ) MIN_NORMAL_COUNT_PERBIN = 20 - bidx_inconfident = np.where( np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) < MIN_NORMAL_COUNT_PERBIN )[0] - rdr_normal = np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) + bidx_inconfident = np.where( + np.sum(copy_single_X_rdr[:, (normal_candidate == True)], axis=1) + < MIN_NORMAL_COUNT_PERBIN + )[0] + rdr_normal = np.sum(copy_single_X_rdr[:, (normal_candidate == True)], axis=1) rdr_normal[bidx_inconfident] = 0 rdr_normal = rdr_normal / np.sum(rdr_normal) - copy_single_X_rdr[bidx_inconfident, :] = 0 # avoid ill-defined distributions if normal has 0 count in that bin. - copy_single_base_nb_mean = rdr_normal.reshape(-1,1) @ np.sum(copy_single_X_rdr, axis=0).reshape(1,-1) + copy_single_X_rdr[bidx_inconfident, :] = ( + 0 # avoid ill-defined distributions if normal has 0 count in that bin. 
+ ) + copy_single_base_nb_mean = rdr_normal.reshape(-1, 1) @ np.sum( + copy_single_X_rdr, axis=0 + ).reshape(1, -1) # adding back RDR signal - single_X[:,0,:] = copy_single_X_rdr + single_X[:, 0, :] = copy_single_X_rdr single_base_nb_mean = copy_single_base_nb_mean # make each cluster in supervision_clone_file a pseudospot if not config["supervision_clone_file"] is None: - tmp_df_clones = pd.read_csv(config["supervision_clone_file"], header=0, index_col=0, sep="\t") - df_clones = pd.DataFrame({"barcodes":barcodes.values}, index=barcodes.values).join(tmp_df_clones) + tmp_df_clones = pd.read_csv( + config["supervision_clone_file"], header=0, index_col=0, sep="\t" + ) + df_clones = pd.DataFrame( + {"barcodes": barcodes.values}, index=barcodes.values + ).join(tmp_df_clones) df_clones.columns = ["barcodes", "clone_id"] - - unique_clone_ids = np.unique( df_clones["clone_id"][~df_clones["clone_id"].isnull()].values ) - clone_index = [np.where(df_clones["clone_id"] == c)[0] for c in unique_clone_ids] + + unique_clone_ids = np.unique( + df_clones["clone_id"][~df_clones["clone_id"].isnull()].values + ) + clone_index = [ + np.where(df_clones["clone_id"] == c)[0] for c in unique_clone_ids + ] if config["tumorprop_file"] is None: - single_X, single_base_nb_mean, single_total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index) + single_X, single_base_nb_mean, single_total_bb_RD = ( + merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index + ) + ) single_tumor_prop = None else: - single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, clone_index, single_tumor_prop, threshold=config["tumorprop_threshold"]) + single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + threshold=config["tumorprop_threshold"], + ) + ) before_coords = copy.copy(coords) before_df_clones = copy.copy(df_clones) before_sample_ids = copy.copy(sample_ids) - coords = np.array([ np.mean(coords[idx,:],axis=0) for idx in clone_index ]) + coords = np.array([np.mean(coords[idx, :], axis=0) for idx in clone_index]) smooth_mat = scipy.sparse.csr_matrix(np.eye(coords.shape[0])) adjacency_mat = scipy.sparse.csr_matrix(np.eye(coords.shape[0])) barcodes = pd.Series(unique_clone_ids) sample_ids = np.array([sample_ids[idx][0] for idx in clone_index]) # clear values in RDR to first infer clones using BAF signal only - copy_single_X_rdr = copy.copy(single_X[:,0,:]) + copy_single_X_rdr = copy.copy(single_X[:, 0, :]) copy_single_base_nb_mean = copy.copy(single_base_nb_mean) - single_X[:,0,:] = 0 - single_base_nb_mean[:,:] = 0 - + single_X[:, 0, :] = 0 + single_base_nb_mean[:, :] = 0 + # run HMRF - for r_hmrf_initialization in range(config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"]): + for r_hmrf_initialization in range( + config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"] + ): outdir = f"{config['output_dir']}/clone{config['n_clones']}_rectangle{r_hmrf_initialization}_w{config['spatial_weight']:.1f}" if config["initialization_method"] == "rectangle": if config["tumorprop_file"] is None: - initial_clone_index = rectangle_initialize_initial_clone(coords, min(coords.shape[0],config["n_clones"]), random_state=r_hmrf_initialization) + initial_clone_index = 
rectangle_initialize_initial_clone( + coords, + min(coords.shape[0], config["n_clones"]), + random_state=r_hmrf_initialization, + ) else: - initial_clone_index = rectangle_initialize_initial_clone_mix(coords, min(coords.shape[0],config["n_clones"]), single_tumor_prop, threshold=config["tumorprop_threshold"], random_state=r_hmrf_initialization) + initial_clone_index = rectangle_initialize_initial_clone_mix( + coords, + min(coords.shape[0], config["n_clones"]), + single_tumor_prop, + threshold=config["tumorprop_threshold"], + random_state=r_hmrf_initialization, + ) else: - kmeans = KMeans(n_clusters = config["n_clones"], max_iter=1, init="random", random_state=config["num_hmrf_initialization_start"]).fit(coords) - initial_clone_index = [np.where(kmeans.labels_ == i)[0] for i in range(config["n_clones"])] + kmeans = KMeans( + n_clusters=config["n_clones"], + max_iter=1, + init="random", + random_state=config["num_hmrf_initialization_start"], + ).fit(coords) + initial_clone_index = [ + np.where(kmeans.labels_ == i)[0] for i in range(config["n_clones"]) + ] # create directory - p = subprocess.Popen(f"mkdir -p {outdir}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - out,err = p.communicate() + p = subprocess.Popen( + f"mkdir -p {outdir}", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + ) + out, err = p.communicate() # save clone initialization into npz file prefix = "allspots" if not Path(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz").exists(): initial_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"num_iterations":0, "round-1_assignment":initial_assignment} + allres = {"num_iterations": 0, "round-1_assignment": initial_assignment} np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz", **allres) # run HMRF + HMM if config["tumorprop_file"] is None: - hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"]) + hmrf_concatenate_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat, + adjacency_mat=adjacency_mat, + sample_ids=sample_ids, + max_iter_outer=config["max_iter_outer"], + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="sp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + ) 
else: - hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"], tumorprop_threshold=config["tumorprop_threshold"]) - + hmrfmix_concatenate_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat, + adjacency_mat=adjacency_mat, + sample_ids=sample_ids, + max_iter_outer=config["max_iter_outer"], + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="sp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + tumorprop_threshold=config["tumorprop_threshold"], + ) + # merge by thresholding BAF profile similarity - res = load_hmrf_last_iteration(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz") + res = load_hmrf_last_iteration( + f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz" + ) n_obs = single_X.shape[0] if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"]))]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"]))], single_tumor_prop, threshold=config["tumorprop_threshold"]) - tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) - merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, threshold=config["np_threshold"], minlength=config["np_eventminlen"], params="sp", tumor_prop=tumor_prop, hmmclass=hmm_nophasing_v2) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ], + single_tumor_prop, + threshold=config["tumorprop_threshold"], + ) + tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson( + X, + base_nb_mean, + total_bb_RD, + 
res, + threshold=config["np_threshold"], + minlength=config["np_eventminlen"], + params="sp", + tumor_prop=tumor_prop, + hmmclass=hmm_nophasing_v2, + ) print(f"BAF clone merging after comparing similarity: {merging_groups}") # if config["tumorprop_file"] is None: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD, min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD, + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] * n_obs, + ) else: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD, min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs, single_tumor_prop=single_tumor_prop) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD, + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] * n_obs, + single_tumor_prop=single_tumor_prop, + ) print(f"BAF clone merging after requiring minimum # spots: {merging_groups}") n_baf_clones = len(merging_groups) - np.savez(f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", **merged_res) + np.savez( + f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", **merged_res + ) # adjust phasing n_obs = single_X.shape[0] - merged_res = dict(np.load(f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", allow_pickle=True)) + merged_res = dict( + np.load( + f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", + allow_pickle=True, + ) + ) merged_baf_assignment = copy.copy(merged_res["new_assignment"]) n_baf_clones = len(np.unique(merged_baf_assignment)) pred = np.argmax(merged_res["log_gamma"], axis=0) - pred = np.array([ pred[(c*n_obs):(c*n_obs+n_obs)] for c in range(n_baf_clones) ]) - merged_baf_profiles = np.array([ np.where(pred[c,:] < config["n_states"], merged_res["new_p_binom"][pred[c,:]%config["n_states"], 0], 1-merged_res["new_p_binom"][pred[c,:]%config["n_states"], 0]) \ - for c in range(n_baf_clones) ]) + pred = np.array( + [pred[(c * n_obs) : (c * n_obs + n_obs)] for c in range(n_baf_clones)] + ) + merged_baf_profiles = np.array( + [ + np.where( + pred[c, :] < config["n_states"], + merged_res["new_p_binom"][pred[c, :] % config["n_states"], 0], + 1 - merged_res["new_p_binom"][pred[c, :] % config["n_states"], 0], + ) + for c in range(n_baf_clones) + ] + ) # EPS_BAF = 0.05 # merged_baf_profiles[np.abs(merged_baf_profiles - 0.5) < EPS_BAF] = 0.5 # population_baf = np.mean(merged_baf_profiles[merged_res["new_assignment"], :], axis=0) if config["tumorprop_file"] is None else np.mean(merged_baf_profiles[merged_res["new_assignment"][single_tumor_prop > config["tumorprop_threshold"]], :], axis=0) @@ -171,199 +357,607 @@ def main(configuration_file): # adding RDR information if not config["bafonly"]: # select normal spots - if (config["normalidx_file"] is None) and (config["tumorprop_file"] is None): + if (config["normalidx_file"] is None) and ( + config["tumorprop_file"] is None + ): EPS_BAF = 0.05 PERCENT_NORMAL = 40 - vec_stds = np.std(np.log1p(copy_single_X_rdr), axis=0) # TBD: whether to smooth by multiplying smooth_mat - id_nearnormal_clone = np.argmin(np.sum( np.maximum(np.abs(merged_baf_profiles 
- 0.5)-EPS_BAF, 0), axis=1)) + vec_stds = np.std( + np.log1p(copy_single_X_rdr), axis=0 + ) # TBD: whether to smooth by multiplying smooth_mat + id_nearnormal_clone = np.argmin( + np.sum( + np.maximum(np.abs(merged_baf_profiles - 0.5) - EPS_BAF, 0), + axis=1, + ) + ) while True: - stdthreshold = np.percentile(vec_stds[merged_res["new_assignment"] == id_nearnormal_clone], PERCENT_NORMAL) - normal_candidate = (vec_stds < stdthreshold) & (merged_res["new_assignment"] == id_nearnormal_clone) - if np.sum(copy_single_X_rdr[:, (normal_candidate==True)]) > single_X.shape[0] * 200 or PERCENT_NORMAL == 100: + stdthreshold = np.percentile( + vec_stds[merged_res["new_assignment"] == id_nearnormal_clone], + PERCENT_NORMAL, + ) + normal_candidate = (vec_stds < stdthreshold) & ( + merged_res["new_assignment"] == id_nearnormal_clone + ) + if ( + np.sum(copy_single_X_rdr[:, (normal_candidate == True)]) + > single_X.shape[0] * 200 + or PERCENT_NORMAL == 100 + ): break PERCENT_NORMAL += 10 # copy_single_X_rdr, _ = filter_de_genes(exp_counts, x_gene_list, normal_candidate) - copy_single_X_rdr, _ = filter_de_genes_tri(exp_counts, x_gene_list, normal_candidate, sample_list=sample_list, sample_ids=sample_ids) + copy_single_X_rdr, _ = filter_de_genes_tri( + exp_counts, + x_gene_list, + normal_candidate, + sample_list=sample_list, + sample_ids=sample_ids, + ) MIN_NORMAL_COUNT_PERBIN = 20 - bidx_inconfident = np.where( np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) < MIN_NORMAL_COUNT_PERBIN )[0] - rdr_normal = np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) + bidx_inconfident = np.where( + np.sum(copy_single_X_rdr[:, (normal_candidate == True)], axis=1) + < MIN_NORMAL_COUNT_PERBIN + )[0] + rdr_normal = np.sum( + copy_single_X_rdr[:, (normal_candidate == True)], axis=1 + ) rdr_normal[bidx_inconfident] = 0 rdr_normal = rdr_normal / np.sum(rdr_normal) - copy_single_X_rdr[bidx_inconfident, :] = 0 # avoid ill-defined distributions if normal has 0 count in that bin. - copy_single_base_nb_mean = rdr_normal.reshape(-1,1) @ np.sum(copy_single_X_rdr, axis=0).reshape(1,-1) - pd.Series(barcodes[normal_candidate==True].index).to_csv(f"{outdir}/normal_candidate_barcodes.txt", header=False, index=False) + copy_single_X_rdr[bidx_inconfident, :] = ( + 0 # avoid ill-defined distributions if normal has 0 count in that bin. 
+ ) + copy_single_base_nb_mean = rdr_normal.reshape(-1, 1) @ np.sum( + copy_single_X_rdr, axis=0 + ).reshape(1, -1) + pd.Series(barcodes[normal_candidate == True].index).to_csv( + f"{outdir}/normal_candidate_barcodes.txt", header=False, index=False + ) # index_normal = np.where(normal_candidate)[0] - sorted_chr_pos = list(zip(df_bininfo.CHR.values, df_bininfo.START.values)) - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, sorted_chr_pos, _, x_gene_list, index_remaining = bin_selection_basedon_normal(single_X, \ - single_base_nb_mean, single_total_bb_RD, sorted_chr_pos, sorted_chr_pos, x_gene_list, config["nu"], config["logphase_shift"], index_normal) + sorted_chr_pos = list( + zip(df_bininfo.CHR.values, df_bininfo.START.values) + ) + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + sorted_chr_pos, + _, + x_gene_list, + index_remaining, + ) = bin_selection_basedon_normal( + single_X, + single_base_nb_mean, + single_total_bb_RD, + sorted_chr_pos, + sorted_chr_pos, + x_gene_list, + config["nu"], + config["logphase_shift"], + index_normal, + ) assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0] df_bininfo = df_bininfo.iloc[index_remaining, :] copy_single_X_rdr = copy_single_X_rdr[index_remaining, :] copy_single_base_nb_mean = copy_single_base_nb_mean[index_remaining, :] - elif (not config["normalidx_file"] is None): + elif not config["normalidx_file"] is None: # single_base_nb_mean has already been added in loading data step. if not config["tumorprop_file"] is None: - logger.warning(f"Mixed sources of information for normal spots! Using {config['normalidx_file']}") - + logger.warning( + f"Mixed sources of information for normal spots! Using {config['normalidx_file']}" + ) + # adding back RDR signal - single_X[:,0,:] = copy_single_X_rdr + single_X[:, 0, :] = copy_single_X_rdr single_base_nb_mean = copy_single_base_nb_mean n_obs = single_X.shape[0] # save binned data - np.savez(f"{outdir}/binned_data.npz", lengths=lengths, single_X=single_X, single_base_nb_mean=single_base_nb_mean, single_total_bb_RD=single_total_bb_RD, log_sitewise_transmat=log_sitewise_transmat, single_tumor_prop=(None if config["tumorprop_file"] is None else single_tumor_prop)) + np.savez( + f"{outdir}/binned_data.npz", + lengths=lengths, + single_X=single_X, + single_base_nb_mean=single_base_nb_mean, + single_total_bb_RD=single_total_bb_RD, + log_sitewise_transmat=log_sitewise_transmat, + single_tumor_prop=( + None if config["tumorprop_file"] is None else single_tumor_prop + ), + ) # run HMRF on each clone individually to further split BAF clone by RDR+BAF signal for bafc in range(n_baf_clones): prefix = f"clone{bafc}" idx_spots = np.where(merged_baf_assignment == bafc)[0] - if np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20: # put a minimum B allele read count on pseudobulk to split clones + if ( + np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20 + ): # put a minimum B allele read count on pseudobulk to split clones continue # initialize clone if config["tumorprop_file"] is None: - initial_clone_index = rectangle_initialize_initial_clone(coords[idx_spots], min(len(idx_spots),config['n_clones_rdr']), random_state=r_hmrf_initialization) + initial_clone_index = rectangle_initialize_initial_clone( + coords[idx_spots], + min(len(idx_spots), config["n_clones_rdr"]), + random_state=r_hmrf_initialization, + ) else: - initial_clone_index = rectangle_initialize_initial_clone_mix(coords[idx_spots], 
min(len(idx_spots),config['n_clones_rdr']), single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"], random_state=r_hmrf_initialization) - if not Path(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz").exists(): + initial_clone_index = rectangle_initialize_initial_clone_mix( + coords[idx_spots], + min(len(idx_spots), config["n_clones_rdr"]), + single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + random_state=r_hmrf_initialization, + ) + if not Path( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz" + ).exists(): initial_assignment = np.zeros(len(idx_spots), dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"barcodes":barcodes[idx_spots], "num_iterations":0, "round-1_assignment":initial_assignment} - np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", **allres) - + allres = { + "barcodes": barcodes[idx_spots], + "num_iterations": 0, + "round-1_assignment": initial_assignment, + } + np.savez( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", + **allres, + ) + # HMRF + HMM using RDR information copy_slice_sample_ids = copy.copy(sample_ids[idx_spots]) if config["tumorprop_file"] is None: - hmrf_concatenate_pipeline(outdir, prefix, single_X[:,:,idx_spots], lengths, single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat[np.ix_(idx_spots,idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots,idx_spots)], sample_ids=copy_slice_sample_ids, max_iter_outer=10, nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"]) + hmrf_concatenate_pipeline( + outdir, + prefix, + single_X[:, :, idx_spots], + lengths, + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], + adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], + sample_ids=copy_slice_sample_ids, + max_iter_outer=10, + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + ) else: - hmrfmix_concatenate_pipeline(outdir, prefix, single_X[:,:,idx_spots], lengths, single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], single_tumor_prop[idx_spots], initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat[np.ix_(idx_spots,idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots,idx_spots)], sample_ids=copy_slice_sample_ids, max_iter_outer=10, nodepotential=config["nodepotential"], \ - 
hmmclass=hmm_nophasing_v2, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"], tumorprop_threshold=config["tumorprop_threshold"]) + hmrfmix_concatenate_pipeline( + outdir, + prefix, + single_X[:, :, idx_spots], + lengths, + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + single_tumor_prop[idx_spots], + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], + adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], + sample_ids=copy_slice_sample_ids, + max_iter_outer=10, + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + tumorprop_threshold=config["tumorprop_threshold"], + ) ##### combine results across clones ##### - res_combine = {"prev_assignment":-1 * np.ones(single_X.shape[2], dtype=int)} + res_combine = { + "prev_assignment": -1 * np.ones(single_X.shape[2], dtype=int) + } offset_clone = 0 for bafc in range(n_baf_clones): prefix = f"clone{bafc}" - if not Path(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz").exists(): + if not Path( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz" + ).exists(): # we skipped the BAF clone in the previous step because of low SNP-covering UMI counts.
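# Note: those clones were skipped in the refinement loop above when their pseudobulk had
# fewer than roughly 20 SNP-covering UMIs per bin on average (the
# np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20 check), so no *_smp.npz
# file exists for them; their spots remain -1 in res_combine["prev_assignment"] and are
# reassigned to the clone with the fewest spots further below.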
continue - allres = dict( np.load(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", allow_pickle=True) ) + allres = dict( + np.load( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) r = allres["num_iterations"] - 1 - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} - idx_spots = np.where(barcodes.isin( allres["barcodes"] ))[0] + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } + idx_spots = np.where(barcodes.isin(allres["barcodes"]))[0] if len(np.unique(res["new_assignment"])) == 1: n_merged_clones = 1 c = res["new_assignment"][0] merged_res = copy.copy(res) merged_res["new_assignment"] = np.zeros(len(idx_spots), dtype=int) try: - log_gamma = res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)].reshape((2*config["n_states"], n_obs, 1)) + log_gamma = res["log_gamma"][ + :, (c * n_obs) : (c * n_obs + n_obs) + ].reshape((2 * config["n_states"], n_obs, 1)) except: - log_gamma = res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)].reshape((config["n_states"], n_obs, 1)) - pred_cnv = res["pred_cnv"][ (c*n_obs):(c*n_obs+n_obs) ].reshape((-1,1)) + log_gamma = res["log_gamma"][ + :, (c * n_obs) : (c * n_obs + n_obs) + ].reshape((config["n_states"], n_obs, 1)) + pred_cnv = res["pred_cnv"][ + (c * n_obs) : (c * n_obs + n_obs) + ].reshape((-1, 1)) else: if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(res["new_assignment"]==c)[0] for c in range(config['n_clones_rdr'])]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(res["new_assignment"] == c)[0] + for c in range(config["n_clones_rdr"]) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(res["new_assignment"]==c)[0] for c in range(config['n_clones_rdr'])], single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"]) - tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) - merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, threshold=config["np_threshold"], minlength=config["np_eventminlen"], params="smp", tumor_prop=tumor_prop, hmmclass=hmm_nophasing_v2) + 
X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(res["new_assignment"] == c)[0] + for c in range(config["n_clones_rdr"]) + ], + single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + ) + ) + tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + merging_groups, merged_res = ( + similarity_components_rdrbaf_neymanpearson( + X, + base_nb_mean, + total_bb_RD, + res, + threshold=config["np_threshold"], + minlength=config["np_eventminlen"], + params="smp", + tumor_prop=tumor_prop, + hmmclass=hmm_nophasing_v2, + ) + ) print(f"part {bafc} merging_groups: {merging_groups}") # if config["tumorprop_file"] is None: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD[:,idx_spots], min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD[:, idx_spots], + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] + * n_obs, + ) else: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD[:,idx_spots], min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs, single_tumor_prop=single_tumor_prop[idx_spots]) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD[:, idx_spots], + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] + * n_obs, + single_tumor_prop=single_tumor_prop[idx_spots], + ) # compute posterior using the newly merged pseudobulk n_merged_clones = len(merging_groups) tmp = copy.copy(merged_res["new_assignment"]) if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(merged_res["new_assignment"]==c)[0] for c in range(n_merged_clones)]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(merged_res["new_assignment"] == c)[0] + for c in range(n_merged_clones) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(merged_res["new_assignment"]==c)[0] for c in range(n_merged_clones)], single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"]) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(merged_res["new_assignment"] == c)[0] + for c in range(n_merged_clones) + ], + single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + ) + ) # - merged_res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), config["n_states"], \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), 
np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) if not tumor_prop is None else None, \ - hmmclass=hmm_nophasing_v2, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, init_log_mu=res["new_log_mu"], init_p_binom=res["new_p_binom"], init_alphas=res["new_alphas"], init_taus=res["new_taus"], max_iter=config["max_iter"], tol=config["tol"], lambd=np.sum(base_nb_mean,axis=1)/np.sum(base_nb_mean), sample_length=np.ones(X.shape[2],dtype=int)*X.shape[0]) + merged_res = pipeline_baum_welch( + None, + np.vstack( + [X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")] + ).T.reshape(-1, 2, 1), + np.tile(lengths, X.shape[2]), + config["n_states"], + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + np.tile(log_sitewise_transmat, X.shape[2]), + ( + np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + if not tumor_prop is None + else None + ), + hmmclass=hmm_nophasing_v2, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + init_log_mu=res["new_log_mu"], + init_p_binom=res["new_p_binom"], + init_alphas=res["new_alphas"], + init_taus=res["new_taus"], + max_iter=config["max_iter"], + tol=config["tol"], + lambd=np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean), + sample_length=np.ones(X.shape[2], dtype=int) * X.shape[0], + ) merged_res["new_assignment"] = copy.copy(tmp) - merged_res = combine_similar_states_across_clones(X, base_nb_mean, total_bb_RD, merged_res, params="smp", tumor_prop=np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) if not tumor_prop is None else None, hmmclass=hmm_nophasing_v2, merge_threshold=0.1) - log_gamma = np.stack([ merged_res["log_gamma"][:,(c*n_obs):(c*n_obs+n_obs)] for c in range(n_merged_clones) ], axis=-1) - pred_cnv = np.vstack([ merged_res["pred_cnv"][(c*n_obs):(c*n_obs+n_obs)] for c in range(n_merged_clones) ]).T - + merged_res = combine_similar_states_across_clones( + X, + base_nb_mean, + total_bb_RD, + merged_res, + params="smp", + tumor_prop=( + np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + if not tumor_prop is None + else None + ), + hmmclass=hmm_nophasing_v2, + merge_threshold=0.1, + ) + log_gamma = np.stack( + [ + merged_res["log_gamma"][ + :, (c * n_obs) : (c * n_obs + n_obs) + ] + for c in range(n_merged_clones) + ], + axis=-1, + ) + pred_cnv = np.vstack( + [ + merged_res["pred_cnv"][(c * n_obs) : (c * n_obs + n_obs)] + for c in range(n_merged_clones) + ] + ).T + # add to res_combine if len(res_combine) == 1: - res_combine.update({"new_log_mu":np.hstack([ merged_res["new_log_mu"] ] * n_merged_clones), "new_alphas":np.hstack([ merged_res["new_alphas"] ] * n_merged_clones), \ - "new_p_binom":np.hstack([ merged_res["new_p_binom"] ] * n_merged_clones), "new_taus":np.hstack([ merged_res["new_taus"] ] * n_merged_clones), \ - "log_gamma":log_gamma, "pred_cnv":pred_cnv}) + res_combine.update( + { + "new_log_mu": np.hstack( + [merged_res["new_log_mu"]] * n_merged_clones + ), + "new_alphas": np.hstack( + [merged_res["new_alphas"]] * n_merged_clones + ), + "new_p_binom": np.hstack( + [merged_res["new_p_binom"]] * n_merged_clones + ), + "new_taus": np.hstack( + 
[merged_res["new_taus"]] * n_merged_clones + ), + "log_gamma": log_gamma, + "pred_cnv": pred_cnv, + } + ) else: - res_combine.update({"new_log_mu":np.hstack([res_combine["new_log_mu"]] + [ merged_res["new_log_mu"] ] * n_merged_clones), "new_alphas":np.hstack([res_combine["new_alphas"]] + [ merged_res["new_alphas"] ] * n_merged_clones), \ - "new_p_binom":np.hstack([res_combine["new_p_binom"]] + [ merged_res["new_p_binom"] ] * n_merged_clones), "new_taus":np.hstack([res_combine["new_taus"]] + [ merged_res["new_taus"] ] * n_merged_clones), \ - "log_gamma":np.dstack([res_combine["log_gamma"], log_gamma ]), "pred_cnv":np.hstack([res_combine["pred_cnv"], pred_cnv])}) - res_combine["prev_assignment"][idx_spots] = merged_res["new_assignment"] + offset_clone + res_combine.update( + { + "new_log_mu": np.hstack( + [res_combine["new_log_mu"]] + + [merged_res["new_log_mu"]] * n_merged_clones + ), + "new_alphas": np.hstack( + [res_combine["new_alphas"]] + + [merged_res["new_alphas"]] * n_merged_clones + ), + "new_p_binom": np.hstack( + [res_combine["new_p_binom"]] + + [merged_res["new_p_binom"]] * n_merged_clones + ), + "new_taus": np.hstack( + [res_combine["new_taus"]] + + [merged_res["new_taus"]] * n_merged_clones + ), + "log_gamma": np.dstack( + [res_combine["log_gamma"], log_gamma] + ), + "pred_cnv": np.hstack([res_combine["pred_cnv"], pred_cnv]), + } + ) + res_combine["prev_assignment"][idx_spots] = ( + merged_res["new_assignment"] + offset_clone + ) offset_clone += n_merged_clones # assign un-assigned spots to the clone with smallest number of spots unassigned_spots = np.where(res_combine["prev_assignment"] == -1)[0] - res_combine["prev_assignment"][unassigned_spots] = np.argmin(np.bincount(res_combine["prev_assignment"][res_combine["prev_assignment"]>=0])) + res_combine["prev_assignment"][unassigned_spots] = np.argmin( + np.bincount( + res_combine["prev_assignment"][res_combine["prev_assignment"] >= 0] + ) + ) # temp: make dispersions the same across all clones - res_combine["new_alphas"][:,:] = np.max(res_combine["new_alphas"]) - res_combine["new_taus"][:,:] = np.min(res_combine["new_taus"]) + res_combine["new_alphas"][:, :] = np.max(res_combine["new_alphas"]) + res_combine["new_taus"][:, :] = np.min(res_combine["new_taus"]) # end temp n_final_clones = len(np.unique(res_combine["prev_assignment"])) # compute HMRF log likelihood log_persample_weights = np.zeros((n_final_clones, len(sample_list))) for sidx in range(len(sample_list)): index = np.where(sample_ids == sidx)[0] - this_persample_weight = np.bincount(res_combine["prev_assignment"][index], minlength=n_final_clones) / len(index) - log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) - log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) + this_persample_weight = np.bincount( + res_combine["prev_assignment"][index], minlength=n_final_clones + ) / len(index) + log_persample_weights[:, sidx] = np.where( + this_persample_weight > 0, np.log(this_persample_weight), -50 + ) + log_persample_weights[:, sidx] = log_persample_weights[ + :, sidx + ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) # final re-assignment across all clones using estimated RDR + BAF if config["tumorprop_file"] is None: if config["nodepotential"] == "max": - pred = np.vstack([ np.argmax(res_combine["log_gamma"][:,:,c], axis=0) for c in range(res_combine["log_gamma"].shape[2]) ]).T - new_assignment, single_llf, total_llf, posterior = 
aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, res_combine, pred, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + pred = np.vstack( + [ + np.argmax(res_combine["log_gamma"][:, :, c], axis=0) + for c in range(res_combine["log_gamma"].shape[2]) + ] + ).T + new_assignment, single_llf, total_llf, posterior = ( + aggr_hmrf_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res_combine, + pred, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) elif config["nodepotential"] == "weighted_sum": - new_assignment, single_llf, total_llf, posterior = hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, res_combine, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + new_assignment, single_llf, total_llf, posterior = ( + hmrf_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res_combine, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) else: if config["nodepotential"] == "max": - pred = np.vstack([ np.argmax(res_combine["log_gamma"][:,:,c], axis=0) for c in range(res_combine["log_gamma"].shape[2]) ]).T - new_assignment, single_llf, total_llf, posterior = aggr_hmrfmix_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res_combine, pred, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + pred = np.vstack( + [ + np.argmax(res_combine["log_gamma"][:, :, c], axis=0) + for c in range(res_combine["log_gamma"].shape[2]) + ] + ).T + new_assignment, single_llf, total_llf, posterior = ( + aggr_hmrfmix_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res_combine, + pred, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) elif config["nodepotential"] == "weighted_sum": - new_assignment, single_llf, total_llf, posterior = hmrfmix_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res_combine, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + new_assignment, single_llf, total_llf, posterior = ( + hmrfmix_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res_combine, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) res_combine["total_llf"] = total_llf res_combine["new_assignment"] 
= new_assignment # res_combine = dict(np.load(f"{outdir}/original_rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True)) # posterior = np.load(f"{outdir}/original_posterior_clone_probability.npy") # re-order clones such that normal clones are always clone 0 - res_combine, posterior = reorder_results(res_combine, posterior, single_tumor_prop) + res_combine, posterior = reorder_results( + res_combine, posterior, single_tumor_prop + ) # save results - np.savez(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", **res_combine) + np.savez( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + **res_combine, + ) np.save(f"{outdir}/posterior_clone_probability.npy", posterior) - + ##### infer integer copy ##### - res_combine = dict(np.load(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True)) + res_combine = dict( + np.load( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) final_clone_ids = np.sort(np.unique(res_combine["new_assignment"])) nonempty_clone_ids = copy.copy(final_clone_ids) # add clone 0 as normal clone if it doesn't appear in final_clone_ids @@ -371,7 +965,7 @@ def main(configuration_file): final_clone_ids = np.append(0, final_clone_ids) # chr position medfix = ["", "_diploid", "_triploid", "_tetraploid"] - for o,max_medploidy in enumerate([None, 2, 3, 4]): + for o, max_medploidy in enumerate([None, 2, 3, 4]): # A/B copy number per bin allele_specific_copy = [] # A/B copy number per state @@ -379,41 +973,139 @@ def main(configuration_file): df_genelevel_cnv = None if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res_combine["new_assignment"]==cid)[0] for cid in final_clone_ids]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res_combine["new_assignment"] == cid)[0] + for cid in final_clone_ids + ], + ) else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res_combine["new_assignment"]==cid)[0] for cid in final_clone_ids], single_tumor_prop, threshold=config["tumorprop_threshold"]) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res_combine["new_assignment"] == cid)[0] + for cid in final_clone_ids + ], + single_tumor_prop, + threshold=config["tumorprop_threshold"], + ) + ) - finding_distate_failed=False + finding_distate_failed = False for s, cid in enumerate(final_clone_ids): - if np.sum(base_nb_mean[:,s]) == 0: + if np.sum(base_nb_mean[:, s]) == 0: continue # adjust log_mu such that sum_bin lambda * np.exp(log_mu) = 1 - lambd = base_nb_mean[:,s] / np.sum(base_nb_mean[:,s]) - this_pred_cnv = res_combine["pred_cnv"][:,s] - adjusted_log_mu = np.log( np.exp(res_combine["new_log_mu"][:,s]) / np.sum(np.exp(res_combine["new_log_mu"][this_pred_cnv,s]) * lambd) ) + lambd = base_nb_mean[:, s] / np.sum(base_nb_mean[:, s]) + this_pred_cnv = res_combine["pred_cnv"][:, s] + adjusted_log_mu = np.log( + np.exp(res_combine["new_log_mu"][:, s]) + / np.sum( + np.exp(res_combine["new_log_mu"][this_pred_cnv, s]) * lambd + ) + ) if not max_medploidy is None: - best_integer_copies, _ = hill_climbing_integer_copynumber_oneclone(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, 
max_medploidy=max_medploidy) + best_integer_copies, _ = ( + hill_climbing_integer_copynumber_oneclone( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + max_medploidy=max_medploidy, + ) + ) else: try: - best_integer_copies, _ = hill_climbing_integer_copynumber_fixdiploid(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, nonbalance_bafdist=config["nonbalance_bafdist"], nondiploid_rdrdist=config["nondiploid_rdrdist"]) + best_integer_copies, _ = ( + hill_climbing_integer_copynumber_fixdiploid( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + nonbalance_bafdist=config["nonbalance_bafdist"], + nondiploid_rdrdist=config["nondiploid_rdrdist"], + ) + ) except: try: - best_integer_copies, _ = hill_climbing_integer_copynumber_fixdiploid(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, nonbalance_bafdist=config["nonbalance_bafdist"], nondiploid_rdrdist=config["nondiploid_rdrdist"], min_prop_threshold=0.05) + best_integer_copies, _ = ( + hill_climbing_integer_copynumber_fixdiploid( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + nonbalance_bafdist=config["nonbalance_bafdist"], + nondiploid_rdrdist=config["nondiploid_rdrdist"], + min_prop_threshold=0.05, + ) + ) except: finding_distate_failed = True continue - print(f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}") - - allele_specific_copy.append( pd.DataFrame( best_integer_copies[res_combine["pred_cnv"][:,s], 0].reshape(1,-1), index=[f"clone{cid} A"], columns=np.arange(n_obs) ) ) - allele_specific_copy.append( pd.DataFrame( best_integer_copies[res_combine["pred_cnv"][:,s], 1].reshape(1,-1), index=[f"clone{cid} B"], columns=np.arange(n_obs) ) ) + print( + f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}" + ) + + allele_specific_copy.append( + pd.DataFrame( + best_integer_copies[ + res_combine["pred_cnv"][:, s], 0 + ].reshape(1, -1), + index=[f"clone{cid} A"], + columns=np.arange(n_obs), + ) + ) + allele_specific_copy.append( + pd.DataFrame( + best_integer_copies[ + res_combine["pred_cnv"][:, s], 1 + ].reshape(1, -1), + index=[f"clone{cid} B"], + columns=np.arange(n_obs), + ) + ) # - state_cnv.append( pd.DataFrame( res_combine["new_log_mu"][:,s].reshape(-1,1), columns=[f"clone{cid} logmu"], index=np.arange(config['n_states']) ) ) - state_cnv.append( pd.DataFrame( res_combine["new_p_binom"][:,s].reshape(-1,1), columns=[f"clone{cid} p"], index=np.arange(config['n_states']) ) ) - state_cnv.append( pd.DataFrame( best_integer_copies[:,0].reshape(-1,1), columns=[f"clone{cid} A"], index=np.arange(config['n_states']) ) ) - state_cnv.append( pd.DataFrame( best_integer_copies[:,1].reshape(-1,1), columns=[f"clone{cid} B"], index=np.arange(config['n_states']) ) ) + state_cnv.append( + pd.DataFrame( + res_combine["new_log_mu"][:, s].reshape(-1, 1), + columns=[f"clone{cid} logmu"], + index=np.arange(config["n_states"]), + ) + ) + state_cnv.append( + pd.DataFrame( + res_combine["new_p_binom"][:, s].reshape(-1, 1), + columns=[f"clone{cid} p"], + index=np.arange(config["n_states"]), + ) + ) + state_cnv.append( + pd.DataFrame( + best_integer_copies[:, 0].reshape(-1, 1), + columns=[f"clone{cid} A"], + index=np.arange(config["n_states"]), + ) + ) + state_cnv.append( + pd.DataFrame( + best_integer_copies[:, 1].reshape(-1, 1), + columns=[f"clone{cid} B"], + index=np.arange(config["n_states"]), + ) + ) # - 
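# A minimal, self-contained sketch (with made-up values) of the log_mu rescaling used above:
# after the adjustment, the baseline-weighted mean RDR over bins equals 1, matching the
# "sum_bin lambda * np.exp(log_mu) = 1" comment. All arrays here are hypothetical.
import numpy as np

log_mu = np.array([-0.5, 0.0, 0.4])                            # hypothetical per-state log RDR
pred_cnv = np.array([0, 1, 1, 2, 2, 2])                        # hypothetical decoded state per bin
base_nb_mean = np.array([10.0, 20.0, 30.0, 15.0, 5.0, 20.0])   # hypothetical per-bin baseline
lambd = base_nb_mean / np.sum(base_nb_mean)
adjusted_log_mu = np.log(np.exp(log_mu) / np.sum(np.exp(log_mu[pred_cnv]) * lambd))
assert np.isclose(np.sum(np.exp(adjusted_log_mu[pred_cnv]) * lambd), 1.0)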
tmpdf = get_genelevel_cnv_oneclone(best_integer_copies[res_combine["pred_cnv"][:,s], 0], best_integer_copies[res_combine["pred_cnv"][:,s], 1], x_gene_list) + tmpdf = get_genelevel_cnv_oneclone( + best_integer_copies[res_combine["pred_cnv"][:, s], 0], + best_integer_copies[res_combine["pred_cnv"][:, s], 1], + x_gene_list, + ) tmpdf.columns = [f"clone{s} A", f"clone{s} B"] if df_genelevel_cnv is None: df_genelevel_cnv = copy.copy(tmpdf) @@ -424,24 +1116,62 @@ def main(configuration_file): continue # output gene-level copy number - df_genelevel_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_genelevel.tsv", header=True, index=True, sep="\t") + df_genelevel_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_genelevel.tsv", + header=True, + index=True, + sep="\t", + ) # output segment-level copy number allele_specific_copy = pd.concat(allele_specific_copy) - df_seglevel_cnv = pd.DataFrame({"CHR":df_bininfo.CHR.values, "START":df_bininfo.START.values, "END":df_bininfo.END.values }) - df_seglevel_cnv = df_seglevel_cnv.join( allele_specific_copy.T ) - df_seglevel_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_seglevel.tsv", header=True, index=False, sep="\t") + df_seglevel_cnv = pd.DataFrame( + { + "CHR": df_bininfo.CHR.values, + "START": df_bininfo.START.values, + "END": df_bininfo.END.values, + } + ) + df_seglevel_cnv = df_seglevel_cnv.join(allele_specific_copy.T) + df_seglevel_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_seglevel.tsv", + header=True, + index=False, + sep="\t", + ) # output per-state copy number - state_cnv = functools.reduce(lambda left,right: pd.merge(left,right, left_index=True, right_index=True, how='inner'), state_cnv) - state_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_perstate.tsv", header=True, index=False, sep="\t") - # summarize to cna events - df_event = summary_events(f"{outdir}/cnv{medfix[o]}_seglevel.tsv", f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz") - df_event.to_csv(f"{outdir}/cnv{medfix[o]}_event.tsv", header=True, index=False, sep="\t") - + state_cnv = functools.reduce( + lambda left, right: pd.merge( + left, right, left_index=True, right_index=True, how="inner" + ), + state_cnv, + ) + state_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_perstate.tsv", + header=True, + index=False, + sep="\t", + ) + # summarize to cna events + df_event = summary_events( + f"{outdir}/cnv{medfix[o]}_seglevel.tsv", + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + ) + df_event.to_csv( + f"{outdir}/cnv{medfix[o]}_event.tsv", + header=True, + index=False, + sep="\t", + ) + ##### output clone label ##### - df_clone_label = pd.DataFrame({"clone_label":res_combine["new_assignment"]}, index=barcodes) + df_clone_label = pd.DataFrame( + {"clone_label": res_combine["new_assignment"]}, index=barcodes + ) if not config["tumorprop_file"] is None: df_clone_label["tumor_proportion"] = single_tumor_prop - df_clone_label.to_csv(f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t") + df_clone_label.to_csv( + f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t" + ) ##### plotting ##### # make a directory for plots @@ -450,44 +1180,143 @@ def main(configuration_file): # plot RDR and BAF cn_file = f"{outdir}/cnv_diploid_seglevel.tsv" - fig = plot_rdr_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, remove_xticks=True, rdr_ylim=5, chrtext_shift=-0.3, base_height=3.2, pointsize=30, palette="tab10") - fig.savefig(f"{outdir}/plots/rdr_baf_defaultcolor.pdf", transparent=True, bbox_inches="tight") + fig = plot_rdr_baf( + configuration_file, + r_hmrf_initialization, + cn_file, + 
clone_ids=None, + remove_xticks=True, + rdr_ylim=5, + chrtext_shift=-0.3, + base_height=3.2, + pointsize=30, + palette="tab10", + ) + fig.savefig( + f"{outdir}/plots/rdr_baf_defaultcolor.pdf", + transparent=True, + bbox_inches="tight", + ) # plot allele-specific copy number - for o,max_medploidy in enumerate([None, 2, 3, 4]): + for o, max_medploidy in enumerate([None, 2, 3, 4]): cn_file = f"{outdir}/cnv{medfix[o]}_seglevel.tsv" if not Path(cn_file).exists(): continue df_cnv = pd.read_csv(cn_file, header=0, sep="\t") df_cnv = expand_df_cnv(df_cnv) - fig, axes = plt.subplots(1, 1, figsize=(15, 0.9*len(final_clone_ids) + 0.6), dpi=200, facecolor="white") - axes = plot_acn_from_df(df_cnv, axes, add_chrbar=True, add_arrow=True, chrbar_thickness=0.4/(0.6*len(final_clone_ids) + 0.4), add_legend=True, remove_xticks=True) + fig, axes = plt.subplots( + 1, + 1, + figsize=(15, 0.9 * len(final_clone_ids) + 0.6), + dpi=200, + facecolor="white", + ) + axes = plot_acn_from_df( + df_cnv, + axes, + add_chrbar=True, + add_arrow=True, + chrbar_thickness=0.4 / (0.6 * len(final_clone_ids) + 0.4), + add_legend=True, + remove_xticks=True, + ) fig.tight_layout() - fig.savefig(f"{outdir}/plots/acn_genome{medfix[o]}.pdf", transparent=True, bbox_inches="tight") + fig.savefig( + f"{outdir}/plots/acn_genome{medfix[o]}.pdf", + transparent=True, + bbox_inches="tight", + ) # additionally plot the allele-specific copy number per region if not config["supervision_clone_file"] is None: - fig, axes = plt.subplots(1, 1, figsize=(15, 0.6*len(unique_clone_ids) + 0.4), dpi=200, facecolor="white") + fig, axes = plt.subplots( + 1, + 1, + figsize=(15, 0.6 * len(unique_clone_ids) + 0.4), + dpi=200, + facecolor="white", + ) merged_df_cnv = pd.read_csv(cn_file, header=0, sep="\t") df_cnv = merged_df_cnv[["CHR", "START", "END"]] - df_cnv = df_cnv.join( pd.DataFrame({f"clone{x} A":merged_df_cnv[f"clone{res_combine['new_assignment'][i]} A"] for i,x in enumerate(unique_clone_ids)}) ) - df_cnv = df_cnv.join( pd.DataFrame({f"clone{x} B":merged_df_cnv[f"clone{res_combine['new_assignment'][i]} B"] for i,x in enumerate(unique_clone_ids)}) ) + df_cnv = df_cnv.join( + pd.DataFrame( + { + f"clone{x} A": merged_df_cnv[ + f"clone{res_combine['new_assignment'][i]} A" + ] + for i, x in enumerate(unique_clone_ids) + } + ) + ) + df_cnv = df_cnv.join( + pd.DataFrame( + { + f"clone{x} B": merged_df_cnv[ + f"clone{res_combine['new_assignment'][i]} B" + ] + for i, x in enumerate(unique_clone_ids) + } + ) + ) df_cnv = expand_df_cnv(df_cnv) - clone_ids = np.concatenate([ unique_clone_ids[res_combine["new_assignment"]==c].astype(str) for c in final_clone_ids ]) - axes = plot_acn_from_df(df_cnv, axes, clone_ids=clone_ids, clone_names=[f"region {x}" for x in clone_ids], add_chrbar=True, add_arrow=False, chrbar_thickness=0.4/(0.6*len(unique_clone_ids) + 0.4), add_legend=True, remove_xticks=True) + clone_ids = np.concatenate( + [ + unique_clone_ids[res_combine["new_assignment"] == c].astype( + str + ) + for c in final_clone_ids + ] + ) + axes = plot_acn_from_df( + df_cnv, + axes, + clone_ids=clone_ids, + clone_names=[f"region {x}" for x in clone_ids], + add_chrbar=True, + add_arrow=False, + chrbar_thickness=0.4 / (0.6 * len(unique_clone_ids) + 0.4), + add_legend=True, + remove_xticks=True, + ) fig.tight_layout() - fig.savefig(f"{outdir}/plots/acn_genome{medfix[o]}_per_region.pdf", transparent=True, bbox_inches="tight") + fig.savefig( + f"{outdir}/plots/acn_genome{medfix[o]}_per_region.pdf", + transparent=True, + bbox_inches="tight", + ) # plot clones in space if 
not config["supervision_clone_file"] is None: before_assignments = pd.Series([None] * before_coords.shape[0]) - for i,c in enumerate(unique_clone_ids): - before_assignments.iloc[before_df_clones.clone_id.isin([c])] = f"clone {res_combine['new_assignment'][i]}" - fig = plot_clones_in_space(before_coords, before_assignments, sample_list, before_sample_ids, palette="Set2", labels=unique_clone_ids, label_coords=coords, label_sample_ids=sample_ids) - fig.savefig(f"{outdir}/plots/clone_spatial.pdf", transparent=True, bbox_inches="tight") + for i, c in enumerate(unique_clone_ids): + before_assignments.iloc[before_df_clones.clone_id.isin([c])] = ( + f"clone {res_combine['new_assignment'][i]}" + ) + fig = plot_clones_in_space( + before_coords, + before_assignments, + sample_list, + before_sample_ids, + palette="Set2", + labels=unique_clone_ids, + label_coords=coords, + label_sample_ids=sample_ids, + ) + fig.savefig( + f"{outdir}/plots/clone_spatial.pdf", + transparent=True, + bbox_inches="tight", + ) else: - assignment = pd.Series([f"clone {x}" for x in res_combine["new_assignment"]]) + assignment = pd.Series( + [f"clone {x}" for x in res_combine["new_assignment"]] + ) fig = plot_clones_in_space(coords, assignment, axes, palette="Set2") - fig.savefig(f"{outdir}/plots/clone_spatial.pdf", transparent=True, bbox_inches="tight") + fig.savefig( + f"{outdir}/plots/clone_spatial.pdf", + transparent=True, + bbox_inches="tight", + ) if __name__ == "__main__": if len(sys.argv) > 1: - main(sys.argv[1]) \ No newline at end of file + main(sys.argv[1]) diff --git a/src/calicost/estimate_tumor_proportion.py b/src/calicost/estimate_tumor_proportion.py index 06d4caa..e61a795 100644 --- a/src/calicost/estimate_tumor_proportion.py +++ b/src/calicost/estimate_tumor_proportion.py @@ -4,7 +4,12 @@ import pandas as pd from pathlib import Path import logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) logger = logging.getLogger() import copy import functools @@ -22,99 +27,234 @@ def main(configuration_file): except: config = read_joint_configuration_file(configuration_file) - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ - barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config) - - single_base_nb_mean[:,:] = 0 + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + df_bininfo, + df_gene_snp, + barcodes, + coords, + single_tumor_prop, + sample_list, + sample_ids, + adjacency_mat, + smooth_mat, + exp_counts, + ) = run_parse_n_load(config) + + single_base_nb_mean[:, :] = 0 n_states_for_tumorprop = 5 n_clones_for_tumorprop = 3 - n_rdrclones_for_tumorprop = 3 #2 + n_rdrclones_for_tumorprop = 3 # 2 max_outer_iter_for_tumorprop = 10 max_iter_for_tumorprop = 20 MIN_PROP_UNCERTAINTY = 0.05 - initial_clone_index = rectangle_initialize_initial_clone(coords, n_clones_for_tumorprop, random_state=0) + initial_clone_index = rectangle_initialize_initial_clone( + coords, n_clones_for_tumorprop, random_state=0 + ) # save clone initialization into npz file prefix = "initialhmm" - if not Path(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz").exists(): + if not Path( + 
f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz" + ).exists(): initial_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"num_iterations":0, "round-1_assignment":initial_assignment} - np.savez(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz", **allres) + allres = {"num_iterations": 0, "round-1_assignment": initial_assignment} + np.savez( + f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz", + **allres, + ) + + hmrf_concatenate_pipeline( + config["output_dir"], + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + n_states=n_states_for_tumorprop, + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat, + adjacency_mat=adjacency_mat, + sample_ids=sample_ids, + max_iter_outer=max_outer_iter_for_tumorprop, + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="sp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=max_iter_for_tumorprop, + tol=config["tol"], + spatial_weight=config["spatial_weight"], + ) - hmrf_concatenate_pipeline(config['output_dir'], prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states=n_states_for_tumorprop, \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=max_outer_iter_for_tumorprop, nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=max_iter_for_tumorprop, tol=config["tol"], spatial_weight=config["spatial_weight"]) - - res = load_hmrf_last_iteration(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz") - merging_groups, merged_res = merge_by_minspots(res["new_assignment"], res, single_total_bb_RD, min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*single_X.shape[0]) + res = load_hmrf_last_iteration( + f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz" + ) + merging_groups, merged_res = merge_by_minspots( + res["new_assignment"], + res, + single_total_bb_RD, + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] * single_X.shape[0], + ) # further refine clones - combined_assignment = copy.copy(merged_res['new_assignment']) + combined_assignment = copy.copy(merged_res["new_assignment"]) offset_clone = 0 combined_p_binom = [] offset_state = 0 combined_pred_cnv = [] for bafc in range(len(merging_groups)): prefix = f"initialhmm_clone{bafc}" - idx_spots = np.where(merged_res['new_assignment'] == bafc)[0] + idx_spots = np.where(merged_res["new_assignment"] == bafc)[0] total_allele_count = np.sum(single_total_bb_RD[:, idx_spots]) - if total_allele_count < single_X.shape[0] * 50: # put a minimum B allele read count on pseudobulk to split 
clones + if ( + total_allele_count < single_X.shape[0] * 50 + ): # put a minimum B allele read count on pseudobulk to split clones combined_assignment[idx_spots] = offset_clone offset_clone += 1 - combined_p_binom.append(merged_res['new_p_binom']) - combined_pred_cnv.append(merged_res['pred_cnv'] + offset_state) - offset_state += merged_res['new_p_binom'].shape[0] + combined_p_binom.append(merged_res["new_p_binom"]) + combined_pred_cnv.append(merged_res["pred_cnv"] + offset_state) + offset_state += merged_res["new_p_binom"].shape[0] continue # initialize clone - initial_clone_index = rectangle_initialize_initial_clone(coords[idx_spots], n_rdrclones_for_tumorprop, random_state=0) + initial_clone_index = rectangle_initialize_initial_clone( + coords[idx_spots], n_rdrclones_for_tumorprop, random_state=0 + ) # save clone initialization into npz file - if not Path(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz").exists(): + if not Path( + f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz" + ).exists(): initial_assignment = np.zeros(len(idx_spots), dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"barcodes":barcodes[idx_spots], "num_iterations":0, "round-1_assignment":initial_assignment} - np.savez(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz", **allres) - + allres = { + "barcodes": barcodes[idx_spots], + "num_iterations": 0, + "round-1_assignment": initial_assignment, + } + np.savez( + f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz", + **allres, + ) + copy_slice_sample_ids = copy.copy(sample_ids[idx_spots]) - hmrf_concatenate_pipeline(config['output_dir'], prefix, single_X[:,:,idx_spots], lengths, single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], initial_clone_index, n_states=n_states_for_tumorprop, \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat[np.ix_(idx_spots,idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots,idx_spots)], sample_ids=copy_slice_sample_ids, max_iter_outer=10, nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=max_iter_for_tumorprop, tol=config["tol"], spatial_weight=config["spatial_weight"]) - - cloneres = load_hmrf_last_iteration(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz") - combined_assignment[idx_spots] = cloneres['new_assignment'] + offset_clone - offset_clone += np.max(cloneres['new_assignment']) + 1 - combined_p_binom.append(cloneres['new_p_binom']) - combined_pred_cnv.append(cloneres['pred_cnv'] + offset_state) - offset_state += cloneres['new_p_binom'].shape[0] + hmrf_concatenate_pipeline( + config["output_dir"], + prefix, + single_X[:, :, idx_spots], + lengths, + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + initial_clone_index, + n_states=n_states_for_tumorprop, + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], + adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], + sample_ids=copy_slice_sample_ids, + max_iter_outer=10, + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + 
params="sp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=max_iter_for_tumorprop, + tol=config["tol"], + spatial_weight=config["spatial_weight"], + ) + + cloneres = load_hmrf_last_iteration( + f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz" + ) + combined_assignment[idx_spots] = cloneres["new_assignment"] + offset_clone + offset_clone += np.max(cloneres["new_assignment"]) + 1 + combined_p_binom.append(cloneres["new_p_binom"]) + combined_pred_cnv.append(cloneres["pred_cnv"] + offset_state) + offset_state += cloneres["new_p_binom"].shape[0] combined_p_binom = np.vstack(combined_p_binom) combined_pred_cnv = np.concatenate(combined_pred_cnv) - normal_candidate = identify_normal_spots(single_X, single_total_bb_RD, merged_res['new_assignment'], merged_res['pred_cnv'], merged_res['new_p_binom'], min_count=single_X.shape[0] * 200) - loh_states, is_B_lost, rdr_values, clones_hightumor = identify_loh_per_clone(single_X, combined_assignment, combined_pred_cnv, combined_p_binom, normal_candidate, single_total_bb_RD) - assignments = pd.DataFrame({'coarse':merged_res['new_assignment'], 'combined':combined_assignment}) + normal_candidate = identify_normal_spots( + single_X, + single_total_bb_RD, + merged_res["new_assignment"], + merged_res["pred_cnv"], + merged_res["new_p_binom"], + min_count=single_X.shape[0] * 200, + ) + loh_states, is_B_lost, rdr_values, clones_hightumor = identify_loh_per_clone( + single_X, + combined_assignment, + combined_pred_cnv, + combined_p_binom, + normal_candidate, + single_total_bb_RD, + ) + assignments = pd.DataFrame( + {"coarse": merged_res["new_assignment"], "combined": combined_assignment} + ) # pool across adjacency spot to increase the UMIs covering LOH region - _, tp_smooth_mat = multislice_adjacency(sample_ids, sample_list, coords, single_total_bb_RD, exp_counts, - across_slice_adjacency_mat=None, construct_adjacency_method=config['construct_adjacency_method'], - maxspots_pooling=7, construct_adjacency_w=config['construct_adjacency_w']) - single_tumor_prop, _ = estimator_tumor_proportion(single_X, single_total_bb_RD, assignments, combined_pred_cnv, loh_states, is_B_lost, rdr_values, clones_hightumor, smooth_mat=tp_smooth_mat) + _, tp_smooth_mat = multislice_adjacency( + sample_ids, + sample_list, + coords, + single_total_bb_RD, + exp_counts, + across_slice_adjacency_mat=None, + construct_adjacency_method=config["construct_adjacency_method"], + maxspots_pooling=7, + construct_adjacency_w=config["construct_adjacency_w"], + ) + single_tumor_prop, _ = estimator_tumor_proportion( + single_X, + single_total_bb_RD, + assignments, + combined_pred_cnv, + loh_states, + is_B_lost, + rdr_values, + clones_hightumor, + smooth_mat=tp_smooth_mat, + ) # post-processing to remove negative tumor proportions - single_tumor_prop = np.where(single_tumor_prop < MIN_PROP_UNCERTAINTY, MIN_PROP_UNCERTAINTY, single_tumor_prop) + single_tumor_prop = np.where( + single_tumor_prop < MIN_PROP_UNCERTAINTY, + MIN_PROP_UNCERTAINTY, + single_tumor_prop, + ) single_tumor_prop[normal_candidate] = 0 # save single_tumor_prop to file - pd.DataFrame({"Tumor":single_tumor_prop}, index=barcodes).to_csv(f"{config['output_dir']}/loh_estimator_tumor_prop.tsv", header=True, sep="\t") + pd.DataFrame({"Tumor": single_tumor_prop}, 
index=barcodes).to_csv( + f"{config['output_dir']}/loh_estimator_tumor_prop.tsv", header=True, sep="\t" + ) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str) + parser.add_argument( + "-c", + "--configfile", + help="configuration file of CalicoST", + required=True, + type=str, + ) args = parser.parse_args() - main(args.configfile) \ No newline at end of file + main(args.configfile) diff --git a/src/calicost/find_integer_copynumber.py b/src/calicost/find_integer_copynumber.py index b1e41f6..020065b 100644 --- a/src/calicost/find_integer_copynumber.py +++ b/src/calicost/find_integer_copynumber.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd import scipy + # import gurobipy as gp # from gurobipy import GRB import copy @@ -70,23 +71,44 @@ # return best_integer_copies, best_obj -def find_diploid_balanced_state(new_log_mu, new_p_binom, pred_cnv, min_prop_threshold, EPS_BAF): +def find_diploid_balanced_state( + new_log_mu, new_p_binom, pred_cnv, min_prop_threshold, EPS_BAF +): n_states = len(new_log_mu) # find candidate diploid balanced state under the criteria that (1) #bins in that state > 0.1 * total #bins and (2) BAF is close to 0.5 by EPS_BAF distance - candidate = np.where( (np.bincount(pred_cnv, minlength=n_states) >= min_prop_threshold*len(pred_cnv)) & (np.abs(new_p_binom - 0.5) <= EPS_BAF) )[0] + candidate = np.where( + ( + np.bincount(pred_cnv, minlength=n_states) + >= min_prop_threshold * len(pred_cnv) + ) + & (np.abs(new_p_binom - 0.5) <= EPS_BAF) + )[0] if len(candidate) == 0: raise ValueError("No candidate diploid balanced state found!") else: # the diploid balanced states is the one in candidate with smallest new_log_mu - return candidate[ np.argmin(new_log_mu[candidate]) ] - - -def hill_climbing_integer_copynumber_fixdiploid(new_log_mu, base_nb_mean, new_p_binom, pred_cnv, max_allele_copy=5, max_total_copy=6, max_medploidy=4, \ - min_prop_threshold=0.1, EPS_BAF=0.05, nonbalance_bafdist=None, nondiploid_rdrdist=None, enforce_states={}): + return candidate[np.argmin(new_log_mu[candidate])] + + +def hill_climbing_integer_copynumber_fixdiploid( + new_log_mu, + base_nb_mean, + new_p_binom, + pred_cnv, + max_allele_copy=5, + max_total_copy=6, + max_medploidy=4, + min_prop_threshold=0.1, + EPS_BAF=0.05, + nonbalance_bafdist=None, + nondiploid_rdrdist=None, + enforce_states={}, +): n_states = len(new_log_mu) lambd = base_nb_mean / np.sum(base_nb_mean) - weight_per_state = np.array([ np.sum(lambd[pred_cnv == s]) for s in range(n_states)]) + weight_per_state = np.array([np.sum(lambd[pred_cnv == s]) for s in range(n_states)]) mu = np.exp(new_log_mu) + # def is_nondiploidnormal(k): if not nonbalance_bafdist is None: @@ -96,23 +118,37 @@ def is_nondiploidnormal(k): if np.abs(mu[k] - 1) > nondiploid_rdrdist: return True return False + # EPS_POINTS = 0.1 + def f(params, ploidy, scalefactor): # params of size (n_states, 2) - if np.any( np.sum(params, axis=1) == 0 ): + if np.any(np.sum(params, axis=1) == 0): return len(pred_cnv) * 1e6 frac_rdr = np.sum(params, axis=1) / scalefactor - frac_baf = params[:,0] / np.sum(params, axis=1) - points_per_state = np.bincount(pred_cnv, minlength=params.shape[0] ) + EPS_POINTS + frac_baf = params[:, 0] / np.sum(params, axis=1) + points_per_state = np.bincount(pred_cnv, minlength=params.shape[0]) + EPS_POINTS ### temp penalty ### mu_threshold = 0.3 - crucial_ordered_pairs_1 = (mu[:,None] - mu[None,:] > mu_threshold) * (np.sum(params, 
axis=1)[:,None] - np.sum(params, axis=1)[None,:] < 0) - crucial_ordered_pairs_2 = (mu[:,None] - mu[None,:] < -mu_threshold) * (np.sum(params, axis=1)[:,None] - np.sum(params, axis=1)[None,:] > 0) + crucial_ordered_pairs_1 = (mu[:, None] - mu[None, :] > mu_threshold) * ( + np.sum(params, axis=1)[:, None] - np.sum(params, axis=1)[None, :] < 0 + ) + crucial_ordered_pairs_2 = (mu[:, None] - mu[None, :] < -mu_threshold) * ( + np.sum(params, axis=1)[:, None] - np.sum(params, axis=1)[None, :] > 0 + ) # penalty on ploidy - derived_ploidy = np.sum(params, axis=1).dot(points_per_state) / np.sum(points_per_state, axis=0) - return np.square(0.3 * (mu - frac_rdr)).dot(points_per_state) + np.square(new_p_binom - frac_baf).dot(points_per_state) + \ - np.sum(crucial_ordered_pairs_1) * len(pred_cnv) + np.sum(crucial_ordered_pairs_2) * len(pred_cnv) + np.sum(derived_ploidy > ploidy + 0.5) * len(pred_cnv) + derived_ploidy = np.sum(params, axis=1).dot(points_per_state) / np.sum( + points_per_state, axis=0 + ) + return ( + np.square(0.3 * (mu - frac_rdr)).dot(points_per_state) + + np.square(new_p_binom - frac_baf).dot(points_per_state) + + np.sum(crucial_ordered_pairs_1) * len(pred_cnv) + + np.sum(crucial_ordered_pairs_2) * len(pred_cnv) + + np.sum(derived_ploidy > ploidy + 0.5) * len(pred_cnv) + ) + # def hill_climb(initial_params, ploidy, idx_diploid_normal, max_iter=10): scalefactor = 2.0 / mu[idx_diploid_normal] @@ -125,35 +161,51 @@ def hill_climb(initial_params, ploidy, idx_diploid_normal, max_iter=10): if k == idx_diploid_normal or k in enforce_states: continue this_best_obj = best_obj - this_best_k = copy.copy(params[k,:]) + this_best_k = copy.copy(params[k, :]) for candi in candidates: if is_nondiploidnormal(k) and candi[0] == 1 and candi[1] == 1: continue - params[k,:] = candi + params[k, :] = candi obj = f(params, ploidy, scalefactor) if obj < this_best_obj: this_best_obj = obj this_best_k = candi - increased = (increased | (this_best_obj < best_obj)) - params[k,:] = this_best_k + increased = increased | (this_best_obj < best_obj) + params[k, :] = this_best_k best_obj = this_best_obj if not increased: break return params, best_obj + # diploid normal state - idx_diploid_normal = find_diploid_balanced_state(new_log_mu, new_p_binom, pred_cnv, min_prop_threshold=min_prop_threshold, EPS_BAF=EPS_BAF) + idx_diploid_normal = find_diploid_balanced_state( + new_log_mu, + new_p_binom, + pred_cnv, + min_prop_threshold=min_prop_threshold, + EPS_BAF=EPS_BAF, + ) # candidate integer copy states - candidates = np.array([ [i,j] for i in range(max_allele_copy + 1) for j in range(max_allele_copy+1) if (not (i == 0 and j == 0)) and (i + j <= max_total_copy)]) + candidates = np.array( + [ + [i, j] + for i in range(max_allele_copy + 1) + for j in range(max_allele_copy + 1) + if (not (i == 0 and j == 0)) and (i + j <= max_total_copy) + ] + ) # find the best copy number states starting from various ploidy best_obj = np.inf best_integer_copies = np.zeros((n_states, 2), dtype=int) - for ploidy in range(1, max_medploidy+1): + for ploidy in range(1, max_medploidy + 1): # initial_params = np.array([ [1,1] if not is_nondiploidnormal(k) else [1,0] for k in range(n_states)], dtype=int) np.random.seed(0) for r in range(20): - initial_params = candidates[ np.random.randint(low=0, high=candidates.shape[0], size=n_states), : ] - initial_params[idx_diploid_normal] = np.array([1,1]) - for k,v in enforce_states.items(): + initial_params = candidates[ + np.random.randint(low=0, high=candidates.shape[0], size=n_states), : + ] + 
initial_params[idx_diploid_normal] = np.array([1, 1]) + for k, v in enforce_states.items(): initial_params[k] = v params, obj = hill_climb(initial_params, ploidy, idx_diploid_normal) if obj < best_obj: @@ -162,38 +214,66 @@ def hill_climb(initial_params, ploidy, idx_diploid_normal, max_iter=10): return best_integer_copies, best_obj -def hill_climbing_integer_copynumber_oneclone(new_log_mu, base_nb_mean, new_p_binom, pred_cnv, max_allele_copy=5, max_total_copy=6, max_medploidy=4, enforce_states={}, EPS_BAF=0.05): +def hill_climbing_integer_copynumber_oneclone( + new_log_mu, + base_nb_mean, + new_p_binom, + pred_cnv, + max_allele_copy=5, + max_total_copy=6, + max_medploidy=4, + enforce_states={}, + EPS_BAF=0.05, +): n_states = len(new_log_mu) lambd = base_nb_mean / np.sum(base_nb_mean) - weight_per_state = np.array([ np.sum(lambd[pred_cnv == s]) for s in range(n_states)]) + weight_per_state = np.array([np.sum(lambd[pred_cnv == s]) for s in range(n_states)]) mu = np.exp(new_log_mu) # EPS_POINTS = 0.1 + def f(params, ploidy): # params of size (n_states, 2) - if np.any( np.sum(params, axis=1) == 0 ): + if np.any(np.sum(params, axis=1) == 0): return len(pred_cnv) * 1e6 - denom = weight_per_state.dot( np.sum(params, axis=1) ) + denom = weight_per_state.dot(np.sum(params, axis=1)) frac_rdr = np.sum(params, axis=1) / denom - frac_baf = params[:,0] / np.sum(params, axis=1) - points_per_state = np.bincount(pred_cnv, minlength=params.shape[0] ) + EPS_POINTS + frac_baf = params[:, 0] / np.sum(params, axis=1) + points_per_state = np.bincount(pred_cnv, minlength=params.shape[0]) + EPS_POINTS ### temp penalty ### mu_threshold = 0.3 - crucial_ordered_pairs_1 = (mu[:,None] - mu[None,:] > mu_threshold) * (np.sum(params, axis=1)[:,None] - np.sum(params, axis=1)[None,:] < 0) - crucial_ordered_pairs_2 = (mu[:,None] - mu[None,:] < -mu_threshold) * (np.sum(params, axis=1)[:,None] - np.sum(params, axis=1)[None,:] > 0) + crucial_ordered_pairs_1 = (mu[:, None] - mu[None, :] > mu_threshold) * ( + np.sum(params, axis=1)[:, None] - np.sum(params, axis=1)[None, :] < 0 + ) + crucial_ordered_pairs_2 = (mu[:, None] - mu[None, :] < -mu_threshold) * ( + np.sum(params, axis=1)[:, None] - np.sum(params, axis=1)[None, :] > 0 + ) # penalty on setting unbalanced states when BAF is close to 0.5 - if np.sum(params[:,0] == params[:,1]) > 0: - baf_threshold = max(EPS_BAF, np.max(np.abs(new_p_binom[(params[:,0]==params[:,1])] - 0.5))) + if np.sum(params[:, 0] == params[:, 1]) > 0: + baf_threshold = max( + EPS_BAF, + np.max(np.abs(new_p_binom[(params[:, 0] == params[:, 1])] - 0.5)), + ) else: baf_threshold = EPS_BAF - unbalanced_penalty = (params[:,0] != params[:,1]).dot(np.abs(new_p_binom - 0.5) < baf_threshold) + unbalanced_penalty = (params[:, 0] != params[:, 1]).dot( + np.abs(new_p_binom - 0.5) < baf_threshold + ) # penalty on ploidy - derived_ploidy = np.sum(params, axis=1).dot(points_per_state) / np.sum(points_per_state, axis=0) - return np.square(0.3 * (mu - frac_rdr)).dot(points_per_state) + np.square(new_p_binom - frac_baf).dot(points_per_state) + \ - np.sum(crucial_ordered_pairs_1) * len(pred_cnv) + np.sum(crucial_ordered_pairs_2) * len(pred_cnv) + np.sum(derived_ploidy > ploidy + 0.5) * len(pred_cnv) + \ - unbalanced_penalty * len(pred_cnv) + derived_ploidy = np.sum(params, axis=1).dot(points_per_state) / np.sum( + points_per_state, axis=0 + ) + return ( + np.square(0.3 * (mu - frac_rdr)).dot(points_per_state) + + np.square(new_p_binom - frac_baf).dot(points_per_state) + + np.sum(crucial_ordered_pairs_1) * len(pred_cnv) + + 
np.sum(crucial_ordered_pairs_2) * len(pred_cnv) + + np.sum(derived_ploidy > ploidy + 0.5) * len(pred_cnv) + + unbalanced_penalty * len(pred_cnv) + ) ### end temp penalty ### # return np.abs(mu - frac_rdr).dot(points_per_state) + 5 * np.abs(new_p_binom - frac_baf).dot(points_per_state) + def hill_climb(initial_params, ploidy, max_iter=10): best_obj = f(initial_params, ploidy) params = copy.copy(initial_params) @@ -204,29 +284,37 @@ def hill_climb(initial_params, ploidy, max_iter=10): if k in enforce_states: continue this_best_obj = best_obj - this_best_k = copy.copy(params[k,:]) + this_best_k = copy.copy(params[k, :]) for candi in candidates: - params[k,:] = candi + params[k, :] = candi obj = f(params, ploidy) if obj < this_best_obj: # print(k, candi, obj, this_best_obj, ploidy+1, 0.1 * np.maximum(0, np.sum(params[k,:]) - ploidy-1) * np.sum(pred_cnv==k)) this_best_obj = obj this_best_k = candi - increased = (increased | (this_best_obj < best_obj)) - params[k,:] = this_best_k + increased = increased | (this_best_obj < best_obj) + params[k, :] = this_best_k best_obj = this_best_obj if not increased: break return params, best_obj + # candidate integer copy states - candidates = np.array([ [i,j] for i in range(max_allele_copy + 1) for j in range(max_allele_copy+1) if (not (i == 0 and j == 0)) and (i + j <= max_total_copy)]) + candidates = np.array( + [ + [i, j] + for i in range(max_allele_copy + 1) + for j in range(max_allele_copy + 1) + if (not (i == 0 and j == 0)) and (i + j <= max_total_copy) + ] + ) # find the best copy number states starting from various ploidy best_obj = np.inf best_integer_copies = np.zeros((n_states, 2), dtype=int) - for ploidy in range(1, max_medploidy+1): + for ploidy in range(1, max_medploidy + 1): initial_params = np.ones((n_states, 2), dtype=int) * int(ploidy / 2) initial_params[:, 1] = ploidy - initial_params[:, 0] - for k,v in enforce_states.items(): + for k, v in enforce_states.items(): initial_params[k] = v params, obj = hill_climb(initial_params, ploidy) if obj < best_obj: @@ -235,10 +323,18 @@ def hill_climb(initial_params, ploidy, max_iter=10): return best_integer_copies, best_obj -def hill_climbing_integer_copynumber_joint(new_log_mu, base_nb_mean, new_p_binom, pred_cnv, max_allele_copy=5, max_total_copy=6, max_medploidy=4): +def hill_climbing_integer_copynumber_joint( + new_log_mu, + base_nb_mean, + new_p_binom, + pred_cnv, + max_allele_copy=5, + max_total_copy=6, + max_medploidy=4, +): """ Jointly infer copy numbers across multiple clones, given they share the same set of new_log_mu and new_p_binom parameters. 
- + Attributes: ---------- new_log_mu : array of size (n_states, n_clones) @@ -255,27 +351,55 @@ def hill_climbing_integer_copynumber_joint(new_log_mu, base_nb_mean, new_p_binom """ n_states = new_log_mu.shape[0] n_clones = base_nb_mean.shape[1] - lambd = np.sum(base_nb_mean,axis=1) / np.sum(base_nb_mean) - weight_per_state = np.array([[ np.sum(lambd[pred_cnv[:,c] == s]) for s in range(n_states)] for c in range(n_clones)]).T # size of (n_states, n_clones) + lambd = np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean) + weight_per_state = np.array( + [ + [np.sum(lambd[pred_cnv[:, c] == s]) for s in range(n_states)] + for c in range(n_clones) + ] + ).T # size of (n_states, n_clones) mu = np.exp(new_log_mu) + def f(params, ploidy): # params of size (n_states, 2) - if np.any( np.sum(params, axis=1) == 0 ): + if np.any(np.sum(params, axis=1) == 0): return len(pred_cnv) * 1e6 - denom = weight_per_state.T.dot( np.sum(params, axis=1) ) # size of (n_clones,) - frac_rdr = np.sum(params, axis=1).reshape(-1,1) / denom.reshape(1,-1) # size of (n_states, n_clones) - frac_baf = params[:,0] / np.sum(params, axis=1) - points_per_state = np.vstack([ np.bincount(pred_cnv[:,c], minlength=params.shape[0]) for c in range(n_clones) ]).T # size of (n_states, n_clones) + denom = weight_per_state.T.dot(np.sum(params, axis=1)) # size of (n_clones,) + frac_rdr = np.sum(params, axis=1).reshape(-1, 1) / denom.reshape( + 1, -1 + ) # size of (n_states, n_clones) + frac_baf = params[:, 0] / np.sum(params, axis=1) + points_per_state = np.vstack( + [ + np.bincount(pred_cnv[:, c], minlength=params.shape[0]) + for c in range(n_clones) + ] + ).T # size of (n_states, n_clones) ### temp penalty ### mu_threshold = 0.3 - crucial_ordered_pairs_1 = (mu[:,0][:,None] - mu[:,0][None,:] > mu_threshold) * (np.sum(params, axis=1)[:,None] - np.sum(params, axis=1)[None,:] < 0) - crucial_ordered_pairs_2 = (mu[:,0][:,None] - mu[:,0][None,:] < -mu_threshold) * (np.sum(params, axis=1)[:,None] - np.sum(params, axis=1)[None,:] > 0) + crucial_ordered_pairs_1 = ( + mu[:, 0][:, None] - mu[:, 0][None, :] > mu_threshold + ) * (np.sum(params, axis=1)[:, None] - np.sum(params, axis=1)[None, :] < 0) + crucial_ordered_pairs_2 = ( + mu[:, 0][:, None] - mu[:, 0][None, :] < -mu_threshold + ) * (np.sum(params, axis=1)[:, None] - np.sum(params, axis=1)[None, :] > 0) # penalty on ploidy - derived_ploidy = np.median(np.sum(params, axis=1).dot(points_per_state) / np.sum(points_per_state, axis=0)) - return np.sum(np.square(0.3 * (mu - frac_rdr) * points_per_state)) + np.sum(np.square((new_p_binom - frac_baf).reshape(-1,1) * points_per_state)) + \ - np.sum(crucial_ordered_pairs_1) * np.prod(pred_cnv.shape) + np.sum(crucial_ordered_pairs_2) * np.prod(pred_cnv.shape) + np.sum(derived_ploidy > ploidy + 0.5) * np.prod(pred_cnv.shape) + derived_ploidy = np.median( + np.sum(params, axis=1).dot(points_per_state) + / np.sum(points_per_state, axis=0) + ) + return ( + np.sum(np.square(0.3 * (mu - frac_rdr) * points_per_state)) + + np.sum( + np.square((new_p_binom - frac_baf).reshape(-1, 1) * points_per_state) + ) + + np.sum(crucial_ordered_pairs_1) * np.prod(pred_cnv.shape) + + np.sum(crucial_ordered_pairs_2) * np.prod(pred_cnv.shape) + + np.sum(derived_ploidy > ploidy + 0.5) * np.prod(pred_cnv.shape) + ) ### end temp penalty ### # return np.abs(mu - frac_rdr).dot(points_per_state) + 5 * np.abs(new_p_binom - frac_baf).dot(points_per_state) + def hill_climb(initial_params, ploidy, max_iter=10): best_obj = f(initial_params, ploidy) params = copy.copy(initial_params) @@ -284,29 
+408,37 @@ def hill_climb(initial_params, ploidy, max_iter=10): increased = False for k in range(params.shape[0]): this_best_obj = best_obj - this_best_k = copy.copy(params[k,:]) + this_best_k = copy.copy(params[k, :]) for candi in candidates: - params[k,:] = candi + params[k, :] = candi obj = f(params, ploidy) if obj < this_best_obj: # print(k, candi, obj, this_best_obj, ploidy+1, 0.1 * np.maximum(0, np.sum(params[k,:]) - ploidy-1) * np.sum(pred_cnv==k)) this_best_obj = obj this_best_k = candi - increased = (increased | (this_best_obj < best_obj)) - params[k,:] = this_best_k + increased = increased | (this_best_obj < best_obj) + params[k, :] = this_best_k best_obj = this_best_obj if not increased: break return params, best_obj + # candidate integer copy states - candidates = np.array([ [i,j] for i in range(max_allele_copy + 1) for j in range(max_allele_copy+1) if (not (i == 0 and j == 0)) and (i + j <= max_total_copy)]) + candidates = np.array( + [ + [i, j] + for i in range(max_allele_copy + 1) + for j in range(max_allele_copy + 1) + if (not (i == 0 and j == 0)) and (i + j <= max_total_copy) + ] + ) # find the best copy number states starting from various ploidy best_obj = np.inf best_integer_copies = np.zeros((n_states, 2), dtype=int) # fix the genomic bin with the median new_log_mu to have exactly ploidy genomes # bidx_med = np.argsort(np.concatenate([ new_log_mu[pred_cnv[:,c],c] for c in range(n_clones) ]))[ int(len(pred_cnv.flatten())/2) ] # idx_med = pred_cnv.flatten(order="F")[bidx_med] - for ploidy in range(1, max_medploidy+1): + for ploidy in range(1, max_medploidy + 1): initial_params = np.ones((n_states, 2), dtype=int) * int(ploidy / 2) initial_params[:, 1] = ploidy - initial_params[:, 0] params, obj = hill_climb(initial_params, ploidy) @@ -318,17 +450,19 @@ def hill_climb(initial_params, ploidy, max_iter=10): def get_genelevel_cnv_oneclone(A_copy, B_copy, x_gene_list): map_gene_bin = {} - for i,x in enumerate(x_gene_list): + for i, x in enumerate(x_gene_list): this_genes = [z for z in x.split(" ") if z != ""] for g in this_genes: map_gene_bin[g] = i gene_list = np.sort(np.array(list(map_gene_bin.keys()))) - gene_level_copies = np.zeros( (len(gene_list), 2), dtype=int ) - for i,g in enumerate(gene_list): + gene_level_copies = np.zeros((len(gene_list), 2), dtype=int) + for i, g in enumerate(gene_list): idx = map_gene_bin[g] gene_level_copies[i, 0] = A_copy[idx] gene_level_copies[i, 1] = B_copy[idx] - return pd.DataFrame({"A":gene_level_copies[:,0], "B":gene_level_copies[:,1]}, index=gene_list) + return pd.DataFrame( + {"A": gene_level_copies[:, 0], "B": gene_level_copies[:, 1]}, index=gene_list + ) def convert_copy_to_states(A_copy, B_copy): @@ -336,11 +470,11 @@ def convert_copy_to_states(A_copy, B_copy): tmp = tmp[~np.isnan(tmp)] base_ploidy = np.median(tmp) coarse_states = np.array(["neutral"] * A_copy.shape[0]) - coarse_states[ (A_copy + B_copy < base_ploidy) & (A_copy != B_copy) ] = "del" - coarse_states[ (A_copy + B_copy < base_ploidy) & (A_copy == B_copy) ] = "bdel" - coarse_states[ (A_copy + B_copy > base_ploidy) & (A_copy != B_copy) ] = "amp" - coarse_states[ (A_copy + B_copy > base_ploidy) & (A_copy == B_copy) ] = "bamp" - coarse_states[ (A_copy + B_copy == base_ploidy) & (A_copy != B_copy) ] = "loh" + coarse_states[(A_copy + B_copy < base_ploidy) & (A_copy != B_copy)] = "del" + coarse_states[(A_copy + B_copy < base_ploidy) & (A_copy == B_copy)] = "bdel" + coarse_states[(A_copy + B_copy > base_ploidy) & (A_copy != B_copy)] = "amp" + coarse_states[(A_copy + B_copy > 
base_ploidy) & (A_copy == B_copy)] = "bamp" + coarse_states[(A_copy + B_copy == base_ploidy) & (A_copy != B_copy)] = "loh" coarse_states[coarse_states == "neutral"] = "neu" return coarse_states @@ -677,4 +811,4 @@ def composite_hmm_eval_objective(base_nb_mean, total_bb_RD, new_log_mu, new_scal # except AttributeError: # print('Encountered an attribute error') -""" \ No newline at end of file +""" diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index 2611340..2a262aa 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -19,8 +19,9 @@ # whole inference ############################################################ + class hmm_nophasing(object): - def __init__(self, params="stmp", t=1-1e-4): + def __init__(self, params="stmp", t=1 - 1e-4): """ Attributes ---------- @@ -32,9 +33,12 @@ def __init__(self, params="stmp", t=1-1e-4): """ self.params = params self.t = t + # @staticmethod - def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus): + def compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ): """ Attributes ---------- @@ -58,7 +62,7 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, t taus : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. - + Returns ---------- log_emission : array, shape (n_states, n_obs, n_spots) @@ -74,20 +78,40 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, t for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution - idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: - nb_mean = base_nb_mean[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + nb_mean = base_nb_mean[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( + X[idx_nonzero_rdr, 0, s], n, p + ) # AF from BetaBinom distribution - idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: - log_emission_baf[i, idx_nonzero_baf, s] = scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], p_binom[i, s] * taus[i, s], (1-p_binom[i, s]) * taus[i, s]) + log_emission_baf[i, idx_nonzero_baf, s] = ( + scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + p_binom[i, s] * taus[i, s], + (1 - p_binom[i, s]) * taus[i, s], + ) + ) return log_emission_rdr, log_emission_baf + # @staticmethod - def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs): + def compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + **kwargs, + ): """ Attributes ---------- @@ -111,7 +135,7 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alpha taus : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. 
- + Returns ---------- log_emission : array, shape (n_states, n_obs, n_spots) @@ -127,27 +151,47 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alpha for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution - idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: # nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[s]) - nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[idx_nonzero_rdr,s]) + nb_mean = base_nb_mean[idx_nonzero_rdr, s] * ( + tumor_prop[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) + + 1 + - tumor_prop[idx_nonzero_rdr, s] + ) nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( + X[idx_nonzero_rdr, 0, s], n, p + ) # AF from BetaBinom distribution - idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: # mix_p_A = p_binom[i, s] * tumor_prop[s] + 0.5 * (1 - tumor_prop[s]) # mix_p_B = (1 - p_binom[i, s]) * tumor_prop[s] + 0.5 * (1 - tumor_prop[s]) - mix_p_A = p_binom[i, s] * tumor_prop[idx_nonzero_baf,s] + 0.5 * (1 - tumor_prop[idx_nonzero_baf,s]) - mix_p_B = (1 - p_binom[i, s]) * tumor_prop[idx_nonzero_baf,s] + 0.5 * (1 - tumor_prop[idx_nonzero_baf,s]) - log_emission_baf[i, idx_nonzero_baf, s] += scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_A * taus[i, s], mix_p_B * taus[i, s]) + mix_p_A = p_binom[i, s] * tumor_prop[idx_nonzero_baf, s] + 0.5 * ( + 1 - tumor_prop[idx_nonzero_baf, s] + ) + mix_p_B = (1 - p_binom[i, s]) * tumor_prop[ + idx_nonzero_baf, s + ] + 0.5 * (1 - tumor_prop[idx_nonzero_baf, s]) + log_emission_baf[ + i, idx_nonzero_baf, s + ] += scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + mix_p_A * taus[i, s], + mix_p_B * taus[i, s], + ) return log_emission_rdr, log_emission_baf + # @staticmethod - @njit - def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' + @njit + def forward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ): + """ Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. Input lengths: sum of lengths = n_observations. @@ -156,32 +200,43 @@ def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_site log_emission: n_states * n_observations * n_spots. Log probability. Output log_alpha: size n_states * n_observations. log alpha[j, t] = log P(o_1, ... o_t, q_t = j | lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = log_emission.shape[0] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" 
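The forward pass that follows is the standard log-space recursion, log_alpha[j, t] = logsumexp_i(log_alpha[i, t-1] + log_transmat[i, j]) + sum_s log_emission[j, t, s], run independently over each segment in lengths. As a rough vectorized sketch of the same recursion for a single segment (illustrative only, not part of this patch; the function name is made up):

    import numpy as np
    from scipy.special import logsumexp

    def forward_log_alpha_sketch(log_startprob, log_transmat, log_emission):
        # log_emission: (n_states, n_obs, n_spots); spots are summed, mirroring
        # the per-spot sum done inside forward_lattice.
        n_states, n_obs, _ = log_emission.shape
        log_em = log_emission.sum(axis=2)
        log_alpha = np.zeros((n_states, n_obs))
        log_alpha[:, 0] = log_startprob + log_em[:, 0]
        for t in range(1, n_obs):
            # logsumexp over the previous state i, for every next state j
            log_alpha[:, t] = logsumexp(
                log_alpha[:, t - 1][:, None] + log_transmat, axis=0
            ) + log_em[:, t]
        return log_alpha

The @njit version in the patch keeps explicit loops and the mylogsumexp / np_sum_ax_squeeze helpers instead of relying on scipy.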
# initialize log_alpha log_alpha = np.zeros((log_emission.shape[0], n_obs)) buf = np.zeros(log_emission.shape[0]) cumlen = 0 for le in lengths: # start prob - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. - log_alpha[:, cumlen] = log_startprob + np_sum_ax_squeeze(log_emission[:, cumlen, :], axis=1) + log_alpha[:, cumlen] = log_startprob + np_sum_ax_squeeze( + log_emission[:, cumlen, :], axis=1 + ) for t in np.arange(1, le): for j in np.arange(log_emission.shape[0]): for i in np.arange(log_emission.shape[0]): buf[i] = log_alpha[i, (cumlen + t - 1)] + log_transmat[i, j] - log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum(log_emission[j, (cumlen + t), :]) + log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum( + log_emission[j, (cumlen + t), :] + ) cumlen += le return log_alpha + # @staticmethod - @njit - def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' + @njit + def backward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ): + """ Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. Input X: size n_observations * n_components * n_spots. @@ -191,33 +246,61 @@ def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sit log_emission: n_states * n_observations * n_spots. Log probability. Output log_beta: size 2*n_states * n_observations. log beta[i, t] = log P(o_{t+1}, ..., o_T | q_t = i, lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = log_emission.shape[0] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" # initialize log_beta log_beta = np.zeros((log_emission.shape[0], n_obs)) buf = np.zeros(log_emission.shape[0]) cumlen = 0 for le in lengths: # start prob - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. 
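The concern raised in the comment above can be made concrete with a back-of-the-envelope check (a rough illustration, not part of this patch; the per-spot gain is a made-up number): log emissions are summed over spots, so their contribution grows with the number of spots, while the log transition penalty stays fixed at roughly -log(1 - t).

    import numpy as np

    t = 1 - 1e-4                     # default self-transition probability
    switch_penalty = -np.log(1 - t)  # ~9.2 nats to leave the current state
    per_spot_gain = 0.3              # hypothetical per-spot log-emission advantage
    print(int(np.ceil(switch_penalty / per_spot_gain)))  # ~31 spots already override the prior

Once a few dozen spots are pooled, even weak per-spot evidence for a different state can outweigh the sticky transition prior.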
log_beta[:, (cumlen + le - 1)] = 0 - for t in np.arange(le-2, -1, -1): + for t in np.arange(le - 2, -1, -1): for i in np.arange(log_emission.shape[0]): for j in np.arange(log_emission.shape[0]): - buf[j] = log_beta[j, (cumlen + t + 1)] + log_transmat[i, j] + np.sum(log_emission[j, (cumlen + t + 1), :]) + buf[j] = ( + log_beta[j, (cumlen + t + 1)] + + log_transmat[i, j] + + np.sum(log_emission[j, (cumlen + t + 1), :]) + ) log_beta[i, (cumlen + t)] = mylogsumexp(buf) cumlen += le return log_beta # - def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat=None, tumor_prop=None, tp_weight_by_mu=None, \ - fix_NB_dispersion=False, shared_NB_dispersion=False, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - is_diag=False, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=100, tol=1e-4, **kwargs): - ''' + def run_baum_welch_nb_bb( + self, + X, + lengths, + n_states, + base_nb_mean, + total_bb_RD, + log_sitewise_transmat=None, + tumor_prop=None, + tp_weight_by_mu=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + is_diag=False, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + max_iter=100, + tol=1e-4, + **kwargs, + ): + """ Input X: size n_observations * n_components * n_spots. lengths: sum of lengths = n_observations. @@ -226,41 +309,84 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, Intermediate log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. alpha: size of n_states. Dispersioon parameter of each HMM state. - ''' + """ n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 # initialize NB logmean shift and BetaBinom prob - log_mu = np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu - p_binom = np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom + log_mu = ( + np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T + if init_log_mu is None + else init_log_mu + ) + p_binom = ( + np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T + if init_p_binom is None + else init_p_binom + ) # initialize (inverse of) dispersion param in NB and BetaBinom - alphas = 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + alphas = ( + 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus # initialize start probability and emission probability - log_startprob = np.log( np.ones(n_states) / n_states ) + log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: - transmat = np.ones((n_states, n_states)) * (1-self.t) / (n_states-1) + transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: - log_transmat = np.zeros((1,1)) + log_transmat = np.zeros((1, 1)) # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) - unique_values_nb, mapping_matrices_nb = construct_unique_matrix(X[:,0,:], base_nb_mean) - unique_values_bb, mapping_matrices_bb = construct_unique_matrix(X[:,1,:], total_bb_RD) + unique_values_nb, mapping_matrices_nb = construct_unique_matrix( + X[:, 0, :], base_nb_mean + ) + unique_values_bb, 
mapping_matrices_bb = construct_unique_matrix( + X[:, 1, :], total_bb_RD + ) # EM algorithm for r in trange(max_iter): # E step if tumor_prop is None: - log_emission_rdr, log_emission_baf = hmm_nophasing.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission_rdr, log_emission_baf = ( + hmm_nophasing.compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ) + ) log_emission = log_emission_rdr + log_emission_baf else: - log_emission_rdr, log_emission_baf = hmm_nophasing.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop) + log_emission_rdr, log_emission_baf = ( + hmm_nophasing.compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + ) + ) log_emission = log_emission_rdr + log_emission_baf - log_alpha = hmm_nophasing.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) - log_beta = hmm_nophasing.backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_alpha = hmm_nophasing.forward_lattice( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) + log_beta = hmm_nophasing.backward_lattice( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, log_emission) + log_xi = compute_posterior_transition_nophasing( + log_alpha, log_beta, log_transmat, log_emission + ) # M step if "s" in self.params: new_log_startprob = update_startprob_nophasing(lengths, log_gamma) @@ -273,32 +399,75 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, new_log_transmat = log_transmat if "m" in self.params: if tumor_prop is None: - new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + new_log_mu, new_alphas = ( + update_emission_params_nb_nophasing_uniqvalues( + unique_values_nb, + mapping_matrices_nb, + log_gamma, + alphas, + start_log_mu=log_mu, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + ) + ) else: - new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + new_log_mu, new_alphas = ( + update_emission_params_nb_nophasing_uniqvalues_mix( + unique_values_nb, + mapping_matrices_nb, + log_gamma, + alphas, + tumor_prop, + start_log_mu=log_mu, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + ) + ) else: new_log_mu = log_mu new_alphas = alphas if "p" in self.params: if tumor_prop is None: - new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + new_p_binom, new_taus = ( + update_emission_params_bb_nophasing_uniqvalues( + unique_values_bb, + mapping_matrices_bb, + log_gamma, + taus, + start_p_binom=p_binom, + 
fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + ) + ) else: - new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, tumor_prop, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + new_p_binom, new_taus = ( + update_emission_params_bb_nophasing_uniqvalues_mix( + unique_values_bb, + mapping_matrices_bb, + log_gamma, + taus, + tumor_prop, + start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + ) + ) else: new_p_binom = p_binom new_taus = taus # check convergence - print( np.mean(np.abs( np.exp(new_log_startprob) - np.exp(log_startprob) )), \ - np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )), \ - np.mean(np.abs(new_log_mu - log_mu)),\ - np.mean(np.abs(new_p_binom - p_binom)) ) - print( np.hstack([new_log_mu, new_p_binom]) ) - if np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )) < tol and \ - np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol: + print( + np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), + np.mean(np.abs(new_log_mu - log_mu)), + np.mean(np.abs(new_p_binom - p_binom)), + ) + print(np.hstack([new_log_mu, new_p_binom])) + if ( + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol + and np.mean(np.abs(new_log_mu - log_mu)) < tol + and np.mean(np.abs(new_p_binom - p_binom)) < tol + ): break log_startprob = new_log_startprob log_transmat = new_log_transmat @@ -306,6 +475,12 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, alphas = new_alphas p_binom = new_p_binom taus = new_taus - return new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma - - + return ( + new_log_mu, + new_alphas, + new_p_binom, + new_taus, + new_log_startprob, + new_log_transmat, + log_gamma, + ) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index d5a9145..2563834 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -22,8 +22,9 @@ # whole inference ############################################################ + class hmm_nophasing_v2(object): - def __init__(self, params="stmp", t=1-1e-4): + def __init__(self, params="stmp", t=1 - 1e-4): """ Attributes ---------- @@ -35,9 +36,12 @@ def __init__(self, params="stmp", t=1-1e-4): """ self.params = params self.t = t + # @staticmethod - def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus): + def compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ): """ Attributes ---------- @@ -61,7 +65,7 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, t taus : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. 
- + Returns ---------- log_emission : array, shape (n_states, n_obs, n_spots) @@ -77,20 +81,40 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, t for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution - idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: - nb_mean = base_nb_mean[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + nb_mean = base_nb_mean[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( + X[idx_nonzero_rdr, 0, s], n, p + ) # AF from BetaBinom distribution - idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: - log_emission_baf[i, idx_nonzero_baf, s] = scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], p_binom[i, s] * taus[i, s], (1-p_binom[i, s]) * taus[i, s]) + log_emission_baf[i, idx_nonzero_baf, s] = ( + scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + p_binom[i, s] * taus[i, s], + (1 - p_binom[i, s]) * taus[i, s], + ) + ) return log_emission_rdr, log_emission_baf + # @staticmethod - def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs): + def compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + **kwargs, + ): """ Attributes ---------- @@ -114,7 +138,7 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alpha taus : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. 
- + Returns ---------- log_emission : array, shape (n_states, n_obs, n_spots) @@ -130,34 +154,63 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alpha for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution - idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: # nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[s]) - nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[idx_nonzero_rdr,s]) + nb_mean = base_nb_mean[idx_nonzero_rdr, s] * ( + tumor_prop[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) + + 1 + - tumor_prop[idx_nonzero_rdr, s] + ) nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( + X[idx_nonzero_rdr, 0, s], n, p + ) # AF from BetaBinom distribution if ("logmu_shift" in kwargs) and ("sample_length" in kwargs): this_weighted_tp = [] for c in range(len(kwargs["sample_length"])): range_s = np.sum(kwargs["sample_length"][:c]) - range_t = np.sum(kwargs["sample_length"][:(c+1)]) - this_weighted_tp.append( tumor_prop[range_s:range_t,s] * np.exp(log_mu[i, s] - kwargs["logmu_shift"][c,s]) / (tumor_prop[range_s:range_t,s] * np.exp(log_mu[i, s] - kwargs["logmu_shift"][c,s]) + 1 - tumor_prop[range_s:range_t,s]) ) + range_t = np.sum(kwargs["sample_length"][: (c + 1)]) + this_weighted_tp.append( + tumor_prop[range_s:range_t, s] + * np.exp(log_mu[i, s] - kwargs["logmu_shift"][c, s]) + / ( + tumor_prop[range_s:range_t, s] + * np.exp(log_mu[i, s] - kwargs["logmu_shift"][c, s]) + + 1 + - tumor_prop[range_s:range_t, s] + ) + ) this_weighted_tp = np.concatenate(this_weighted_tp) else: - this_weighted_tp = tumor_prop[:,s] - idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + this_weighted_tp = tumor_prop[:, s] + idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: - mix_p_A = p_binom[i, s] * this_weighted_tp[idx_nonzero_baf] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf]) - mix_p_B = (1 - p_binom[i, s]) * this_weighted_tp[idx_nonzero_baf] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf]) - log_emission_baf[i, idx_nonzero_baf, s] += scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_A * taus[i, s], mix_p_B * taus[i, s]) + mix_p_A = p_binom[i, s] * this_weighted_tp[ + idx_nonzero_baf + ] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf]) + mix_p_B = (1 - p_binom[i, s]) * this_weighted_tp[ + idx_nonzero_baf + ] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf]) + log_emission_baf[ + i, idx_nonzero_baf, s + ] += scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + mix_p_A * taus[i, s], + mix_p_B * taus[i, s], + ) return log_emission_rdr, log_emission_baf + # @staticmethod - @njit - def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' + @njit + def forward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ): + """ Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. Input lengths: sum of lengths = n_observations. 
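In compute_emission_probability_nb_betabinom_mix above, the spot-level tumor proportion is first converted to a read-level weight using the state's RDR shift, w = tp * exp(log_mu - logmu_shift) / (tp * exp(log_mu - logmu_shift) + 1 - tp), and the BetaBinomial success probability of the mixture is then p * w + 0.5 * (1 - w). A minimal single-bin sketch of that mixing (illustrative only, not part of this patch; names and example numbers are made up):

    import numpy as np

    def mixture_baf_sketch(p_binom, tumor_prop, log_mu=0.0, logmu_shift=0.0):
        # Read-weighted tumor proportion: tumor reads scale with exp(log_mu - logmu_shift).
        w = tumor_prop * np.exp(log_mu - logmu_shift)
        w = w / (w + 1.0 - tumor_prop)
        # Convex combination of the tumor BAF and the normal BAF of 0.5.
        mix_p_A = p_binom * w + 0.5 * (1.0 - w)
        mix_p_B = (1.0 - p_binom) * w + 0.5 * (1.0 - w)
        return mix_p_A, mix_p_B

    # e.g. a single-copy gain (tumor BAF ~ 1/3) in a spot with 60% tumor cells,
    # ignoring the RDR re-weighting (log_mu == logmu_shift):
    a, b = mixture_baf_sketch(1 / 3, 0.6)
    print(round(a, 2), round(b, 2))  # 0.4 0.6

This is what pulls observed BAFs toward 0.5 in low-purity spots, which is why the tumor proportion enters both the NB mean and the BetaBinomial probabilities in this emission model.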
@@ -166,32 +219,43 @@ def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_site log_emission: n_states * n_observations * n_spots. Log probability. Output log_alpha: size n_states * n_observations. log alpha[j, t] = log P(o_1, ... o_t, q_t = j | lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = log_emission.shape[0] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" # initialize log_alpha log_alpha = np.zeros((log_emission.shape[0], n_obs)) buf = np.zeros(log_emission.shape[0]) cumlen = 0 for le in lengths: # start prob - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. - log_alpha[:, cumlen] = log_startprob + np_sum_ax_squeeze(log_emission[:, cumlen, :], axis=1) + log_alpha[:, cumlen] = log_startprob + np_sum_ax_squeeze( + log_emission[:, cumlen, :], axis=1 + ) for t in np.arange(1, le): for j in np.arange(log_emission.shape[0]): for i in np.arange(log_emission.shape[0]): buf[i] = log_alpha[i, (cumlen + t - 1)] + log_transmat[i, j] - log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum(log_emission[j, (cumlen + t), :]) + log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum( + log_emission[j, (cumlen + t), :] + ) cumlen += le return log_alpha + # @staticmethod - @njit - def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' + @njit + def backward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ): + """ Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. Input X: size n_observations * n_components * n_spots. @@ -201,33 +265,60 @@ def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sit log_emission: n_states * n_observations * n_spots. Log probability. Output log_beta: size 2*n_states * n_observations. log beta[i, t] = log P(o_{t+1}, ..., o_T | q_t = i, lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = log_emission.shape[0] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" # initialize log_beta log_beta = np.zeros((log_emission.shape[0], n_obs)) buf = np.zeros(log_emission.shape[0]) cumlen = 0 for le in lengths: # start prob - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. 
# But adding too many spots may lead to a higher weight of the emission rather then transition prob. log_beta[:, (cumlen + le - 1)] = 0 - for t in np.arange(le-2, -1, -1): + for t in np.arange(le - 2, -1, -1): for i in np.arange(log_emission.shape[0]): for j in np.arange(log_emission.shape[0]): - buf[j] = log_beta[j, (cumlen + t + 1)] + log_transmat[i, j] + np.sum(log_emission[j, (cumlen + t + 1), :]) + buf[j] = ( + log_beta[j, (cumlen + t + 1)] + + log_transmat[i, j] + + np.sum(log_emission[j, (cumlen + t + 1), :]) + ) log_beta[i, (cumlen + t)] = mylogsumexp(buf) cumlen += le return log_beta # - def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat=None, tumor_prop=None, \ - fix_NB_dispersion=False, shared_NB_dispersion=False, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - is_diag=False, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=100, tol=1e-4, **kwargs): - ''' + def run_baum_welch_nb_bb( + self, + X, + lengths, + n_states, + base_nb_mean, + total_bb_RD, + log_sitewise_transmat=None, + tumor_prop=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + is_diag=False, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + max_iter=100, + tol=1e-4, + **kwargs, + ): + """ Input X: size n_observations * n_components * n_spots. lengths: sum of lengths = n_observations. @@ -236,52 +327,125 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, Intermediate log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. alpha: size of n_states. Dispersioon parameter of each HMM state. - ''' + """ n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 # initialize NB logmean shift and BetaBinom prob - log_mu = np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu - p_binom = np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom + log_mu = ( + np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T + if init_log_mu is None + else init_log_mu + ) + p_binom = ( + np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T + if init_p_binom is None + else init_p_binom + ) # initialize (inverse of) dispersion param in NB and BetaBinom - alphas = 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + alphas = ( + 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus # initialize start probability and emission probability - log_startprob = np.log( np.ones(n_states) / n_states ) + log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: - transmat = np.ones((n_states, n_states)) * (1-self.t) / (n_states-1) + transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: - log_transmat = np.zeros((1,1)) + log_transmat = np.zeros((1, 1)) # initialize log_gamma log_gamma = kwargs["log_gamma"] if "log_gamma" in kwargs else None # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) - unique_values_nb, mapping_matrices_nb = construct_unique_matrix(X[:,0,:], base_nb_mean) - unique_values_bb, mapping_matrices_bb = 
construct_unique_matrix(X[:,1,:], total_bb_RD) + unique_values_nb, mapping_matrices_nb = construct_unique_matrix( + X[:, 0, :], base_nb_mean + ) + unique_values_bb, mapping_matrices_bb = construct_unique_matrix( + X[:, 1, :], total_bb_RD + ) # EM algorithm for r in trange(max_iter): # E step if tumor_prop is None: - log_emission_rdr, log_emission_baf = hmm_nophasing_v2.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission_rdr, log_emission_baf = ( + hmm_nophasing_v2.compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ) + ) log_emission = log_emission_rdr + log_emission_baf else: # compute mu as adjusted RDR if ((not log_gamma is None) or (r > 0)) and ("m" in self.params): logmu_shift = [] for c in range(len(kwargs["sample_length"])): - this_pred_cnv = np.argmax(log_gamma[:,np.sum(kwargs["sample_length"][:c]):np.sum(kwargs["sample_length"][:(c+1)])], axis=0)%n_states - logmu_shift.append( scipy.special.logsumexp(log_mu[this_pred_cnv,:] + np.log(kwargs["lambd"]).reshape(-1,1), axis=0) ) + this_pred_cnv = ( + np.argmax( + log_gamma[ + :, + np.sum(kwargs["sample_length"][:c]) : np.sum( + kwargs["sample_length"][: (c + 1)] + ), + ], + axis=0, + ) + % n_states + ) + logmu_shift.append( + scipy.special.logsumexp( + log_mu[this_pred_cnv, :] + + np.log(kwargs["lambd"]).reshape(-1, 1), + axis=0, + ) + ) logmu_shift = np.vstack(logmu_shift) - log_emission_rdr, log_emission_baf = hmm_nophasing_v2.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, logmu_shift=logmu_shift, sample_length=kwargs["sample_length"]) + log_emission_rdr, log_emission_baf = ( + hmm_nophasing_v2.compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + logmu_shift=logmu_shift, + sample_length=kwargs["sample_length"], + ) + ) else: - log_emission_rdr, log_emission_baf = hmm_nophasing_v2.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop) + log_emission_rdr, log_emission_baf = ( + hmm_nophasing_v2.compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + ) + ) log_emission = log_emission_rdr + log_emission_baf - log_alpha = hmm_nophasing_v2.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) - log_beta = hmm_nophasing_v2.backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_alpha = hmm_nophasing_v2.forward_lattice( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) + log_beta = hmm_nophasing_v2.backward_lattice( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, log_emission) + log_xi = compute_posterior_transition_nophasing( + log_alpha, log_beta, log_transmat, log_emission + ) # M step if "s" in self.params: new_log_startprob = update_startprob_nophasing(lengths, log_gamma) @@ -294,42 +458,106 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, new_log_transmat = log_transmat if "m" in self.params: if tumor_prop is None: - new_log_mu, new_alphas = 
update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + new_log_mu, new_alphas = ( + update_emission_params_nb_nophasing_uniqvalues( + unique_values_nb, + mapping_matrices_nb, + log_gamma, + alphas, + start_log_mu=log_mu, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + ) + ) else: - new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + new_log_mu, new_alphas = ( + update_emission_params_nb_nophasing_uniqvalues_mix( + unique_values_nb, + mapping_matrices_nb, + log_gamma, + alphas, + tumor_prop, + start_log_mu=log_mu, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + ) + ) else: new_log_mu = log_mu new_alphas = alphas if "p" in self.params: if tumor_prop is None: - new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + new_p_binom, new_taus = ( + update_emission_params_bb_nophasing_uniqvalues( + unique_values_bb, + mapping_matrices_bb, + log_gamma, + taus, + start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + ) + ) else: # compute mu as adjusted RDR - if ("m" in self.params): + if "m" in self.params: mu = [] for c in range(len(kwargs["sample_length"])): - this_pred_cnv = np.argmax(log_gamma[:,np.sum(kwargs["sample_length"][:c]):np.sum(kwargs["sample_length"][:(c+1)])], axis=0)%n_states - mu.append( np.exp(new_log_mu[this_pred_cnv,:]) / np.sum(np.exp(new_log_mu[this_pred_cnv,:]) * kwargs["lambd"].reshape(-1,1), axis=0, keepdims=True) ) + this_pred_cnv = ( + np.argmax( + log_gamma[ + :, + np.sum(kwargs["sample_length"][:c]) : np.sum( + kwargs["sample_length"][: (c + 1)] + ), + ], + axis=0, + ) + % n_states + ) + mu.append( + np.exp(new_log_mu[this_pred_cnv, :]) + / np.sum( + np.exp(new_log_mu[this_pred_cnv, :]) + * kwargs["lambd"].reshape(-1, 1), + axis=0, + keepdims=True, + ) + ) mu = np.vstack(mu) - weighted_tp = (tumor_prop * mu) / (tumor_prop * mu + 1 - tumor_prop) + weighted_tp = (tumor_prop * mu) / ( + tumor_prop * mu + 1 - tumor_prop + ) else: weighted_tp = tumor_prop - new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, weighted_tp, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + new_p_binom, new_taus = ( + update_emission_params_bb_nophasing_uniqvalues_mix( + unique_values_bb, + mapping_matrices_bb, + log_gamma, + taus, + weighted_tp, + start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + ) + ) else: new_p_binom = p_binom new_taus = taus # check convergence - print( np.mean(np.abs( np.exp(new_log_startprob) - np.exp(log_startprob) )), \ - np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )), \ - np.mean(np.abs(new_log_mu - log_mu)),\ - np.mean(np.abs(new_p_binom - p_binom)) ) - print( np.hstack([new_log_mu, new_p_binom]) ) - if np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )) < tol and \ - 
np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol: + print( + np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), + np.mean(np.abs(new_log_mu - log_mu)), + np.mean(np.abs(new_p_binom - p_binom)), + ) + print(np.hstack([new_log_mu, new_p_binom])) + if ( + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol + and np.mean(np.abs(new_log_mu - log_mu)) < tol + and np.mean(np.abs(new_p_binom - p_binom)) < tol + ): break log_startprob = new_log_startprob log_transmat = new_log_transmat @@ -337,6 +565,12 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, alphas = new_alphas p_binom = new_p_binom taus = new_taus - return new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma - - + return ( + new_log_mu, + new_alphas, + new_p_binom, + new_taus, + new_log_startprob, + new_log_transmat, + log_gamma, + ) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 630651f..0d26b70 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -21,8 +21,9 @@ # whole inference ############################################################ + class hmm_sitewise(object): - def __init__(self, params="stmp", t=1-1e-4): + def __init__(self, params="stmp", t=1 - 1e-4): """ Attributes ---------- @@ -34,9 +35,12 @@ def __init__(self, params="stmp", t=1-1e-4): """ self.params = params self.t = t + # @staticmethod - def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus): + def compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ): """ Attributes ---------- @@ -60,7 +64,7 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, t taus : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. 
- + Returns ---------- log_emission : array, shape (2*n_states, n_obs, n_spots) @@ -76,22 +80,51 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, t for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution - idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: - nb_mean = base_nb_mean[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + nb_mean = base_nb_mean[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) - log_emission_rdr[i + n_states, idx_nonzero_rdr, s] = log_emission_rdr[i, idx_nonzero_rdr, s] + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( + X[idx_nonzero_rdr, 0, s], n, p + ) + log_emission_rdr[i + n_states, idx_nonzero_rdr, s] = ( + log_emission_rdr[i, idx_nonzero_rdr, s] + ) # AF from BetaBinom distribution - idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: - log_emission_baf[i, idx_nonzero_baf, s] = scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], p_binom[i, s] * taus[i, s], (1-p_binom[i, s]) * taus[i, s]) - log_emission_baf[i + n_states, idx_nonzero_baf, s] = scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], (1-p_binom[i, s]) * taus[i, s], p_binom[i, s] * taus[i, s]) + log_emission_baf[i, idx_nonzero_baf, s] = ( + scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + p_binom[i, s] * taus[i, s], + (1 - p_binom[i, s]) * taus[i, s], + ) + ) + log_emission_baf[i + n_states, idx_nonzero_baf, s] = ( + scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + (1 - p_binom[i, s]) * taus[i, s], + p_binom[i, s] * taus[i, s], + ) + ) return log_emission_rdr, log_emission_baf + # @staticmethod - def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs): + def compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + **kwargs, + ): """ Attributes ---------- @@ -115,7 +148,7 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alpha taus : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. 
- + Returns ---------- log_emission : array, shape (2*n_states, n_obs, n_spots) @@ -131,26 +164,55 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alpha for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution - idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: - nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[idx_nonzero_rdr,s]) + nb_mean = base_nb_mean[idx_nonzero_rdr, s] * ( + tumor_prop[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) + + 1 + - tumor_prop[idx_nonzero_rdr, s] + ) nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) - log_emission_rdr[i + n_states, idx_nonzero_rdr, s] = log_emission_rdr[i, idx_nonzero_rdr, s] + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( + X[idx_nonzero_rdr, 0, s], n, p + ) + log_emission_rdr[i + n_states, idx_nonzero_rdr, s] = ( + log_emission_rdr[i, idx_nonzero_rdr, s] + ) # AF from BetaBinom distribution - idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: - mix_p_A = p_binom[i, s] * tumor_prop[idx_nonzero_baf,s] + 0.5 * (1 - tumor_prop[idx_nonzero_baf,s]) - mix_p_B = (1 - p_binom[i, s]) * tumor_prop[idx_nonzero_baf,s] + 0.5 * (1 - tumor_prop[idx_nonzero_baf,s]) - log_emission_baf[i, idx_nonzero_baf, s] += scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_A * taus[i, s], mix_p_B * taus[i, s]) - log_emission_baf[i + n_states, idx_nonzero_baf, s] += scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_B * taus[i, s], mix_p_A * taus[i, s]) + mix_p_A = p_binom[i, s] * tumor_prop[idx_nonzero_baf, s] + 0.5 * ( + 1 - tumor_prop[idx_nonzero_baf, s] + ) + mix_p_B = (1 - p_binom[i, s]) * tumor_prop[ + idx_nonzero_baf, s + ] + 0.5 * (1 - tumor_prop[idx_nonzero_baf, s]) + log_emission_baf[ + i, idx_nonzero_baf, s + ] += scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + mix_p_A * taus[i, s], + mix_p_B * taus[i, s], + ) + log_emission_baf[ + i + n_states, idx_nonzero_baf, s + ] += scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + mix_p_B * taus[i, s], + mix_p_A * taus[i, s], + ) return log_emission_rdr, log_emission_baf + # @staticmethod - @njit - def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' + @njit + def forward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ): + """ Note that n_states is the CNV states, and there are 2 * n_states of paired states for (CNV, phasing) pairs. Input lengths: sum of lengths = n_observations. @@ -160,11 +222,15 @@ def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_site log_sitewise_transmat: n_observations, the log transition probability of phase switch. Output log_alpha: size 2n_states * n_observations. log alpha[j, t] = log P(o_1, ... o_t, q_t = j | lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = int(np.ceil(log_emission.shape[0] / 2)) - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" 
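Aside (not part of the patch): in the *_mix variant above, each spot is modeled as a mixture of tumor and normal signal — the NB mean is scaled by tumor_prop * exp(log_mu) + (1 - tumor_prop), and the BAF success probability is pulled toward 0.5 in proportion to the normal fraction. A small worked example of those two mixtures (all values chosen only for illustration):

import numpy as np

base_mean = 100.0      # baseline expected count for a bin/spot (illustrative)
log_mu = np.log(1.5)   # tumor copy-number effect on expression (illustrative)
p_binom = 0.2          # tumor B-allele frequency for this state (illustrative)
tumor_prop = 0.6       # fraction of tumor reads in the spot

# NB mean: tumor fraction scaled by exp(log_mu), normal fraction stays at baseline
nb_mean = base_mean * (tumor_prop * np.exp(log_mu) + 1 - tumor_prop)
# BAF: tumor fraction at p_binom, normal fraction balanced at 0.5
mix_p_A = p_binom * tumor_prop + 0.5 * (1 - tumor_prop)
mix_p_B = (1 - p_binom) * tumor_prop + 0.5 * (1 - tumor_prop)

print(nb_mean)           # 100 * (0.6*1.5 + 0.4) = 130.0
print(mix_p_A, mix_p_B)  # 0.32, 0.68 (still sum to 1)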
- assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" log_sitewise_self_transmat = np.log(1 - np.exp(log_sitewise_transmat)) # initialize log_alpha log_alpha = np.zeros((log_emission.shape[0], n_obs)) @@ -172,25 +238,49 @@ def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_site cumlen = 0 for le in lengths: # start prob - combined_log_startprob = np.log(0.5) + np.append(log_startprob,log_startprob) - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + combined_log_startprob = np.log(0.5) + np.append( + log_startprob, log_startprob + ) + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. - log_alpha[:, cumlen] = combined_log_startprob + np_sum_ax_squeeze(log_emission[:, cumlen, :], axis=1) + log_alpha[:, cumlen] = combined_log_startprob + np_sum_ax_squeeze( + log_emission[:, cumlen, :], axis=1 + ) for t in np.arange(1, le): - phases_switch_mat = np.array([[log_sitewise_self_transmat[cumlen + t-1], log_sitewise_transmat[cumlen + t-1]], [log_sitewise_transmat[cumlen + t-1], log_sitewise_self_transmat[cumlen + t-1] ]]) - combined_transmat = np.kron( np.exp(phases_switch_mat), np.exp(log_transmat) ) + phases_switch_mat = np.array( + [ + [ + log_sitewise_self_transmat[cumlen + t - 1], + log_sitewise_transmat[cumlen + t - 1], + ], + [ + log_sitewise_transmat[cumlen + t - 1], + log_sitewise_self_transmat[cumlen + t - 1], + ], + ] + ) + combined_transmat = np.kron( + np.exp(phases_switch_mat), np.exp(log_transmat) + ) combined_transmat = np.log(combined_transmat) for j in np.arange(log_emission.shape[0]): for i in np.arange(log_emission.shape[0]): - buf[i] = log_alpha[i, (cumlen + t - 1)] + combined_transmat[i, j] - log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum(log_emission[j, (cumlen + t), :]) + buf[i] = ( + log_alpha[i, (cumlen + t - 1)] + combined_transmat[i, j] + ) + log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum( + log_emission[j, (cumlen + t), :] + ) cumlen += le return log_alpha + # @staticmethod @njit - def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' + def backward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ): + """ Note that n_states is the CNV states, and there are 2 * n_states of paired states for (CNV, phasing) pairs. Input X: size n_observations * n_components * n_spots. @@ -201,11 +291,15 @@ def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sit log_sitewise_transmat: n_observations, the log transition probability of phase switch. Output log_beta: size 2*n_states * n_observations. log beta[i, t] = log P(o_{t+1}, ..., o_T | q_t = i, lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = int(np.ceil(log_emission.shape[0] / 2)) - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" 
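Aside (not part of the patch): forward_lattice (and backward_lattice below) runs over 2*n_states paired (phasing, CNV) states and builds the per-site transition matrix as the Kronecker product of a 2x2 phase-switch matrix with the CNV transition matrix. A minimal sketch of that construction, with made-up probabilities:

import numpy as np

n_states = 3
t = 0.99                                        # CNV self-transition (illustrative)
transmat = np.full((n_states, n_states), (1 - t) / (n_states - 1))
np.fill_diagonal(transmat, t)

p_switch = 0.01                                 # phase-switch probability at this site
phase_mat = np.array([[1 - p_switch, p_switch],
                      [p_switch, 1 - p_switch]])

# 2*n_states x 2*n_states matrix over paired states: index i encodes
# phase = i // n_states and CNV state = i % n_states
combined_transmat = np.kron(phase_mat, transmat)

assert combined_transmat.shape == (2 * n_states, 2 * n_states)
assert np.allclose(combined_transmat.sum(axis=1), 1.0)   # rows stay stochastic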
+ assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" log_sitewise_self_transmat = np.log(1 - np.exp(log_sitewise_transmat)) # initialize log_beta log_beta = np.zeros((log_emission.shape[0], n_obs)) @@ -213,24 +307,60 @@ def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sit cumlen = 0 for le in lengths: # start prob - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. log_beta[:, (cumlen + le - 1)] = 0 - for t in np.arange(le-2, -1, -1): - phases_switch_mat = np.array([[log_sitewise_self_transmat[cumlen + t], log_sitewise_transmat[cumlen + t]], [log_sitewise_transmat[cumlen + t], log_sitewise_self_transmat[cumlen + t] ]]) - combined_transmat = np.kron( np.exp(phases_switch_mat), np.exp(log_transmat) ) + for t in np.arange(le - 2, -1, -1): + phases_switch_mat = np.array( + [ + [ + log_sitewise_self_transmat[cumlen + t], + log_sitewise_transmat[cumlen + t], + ], + [ + log_sitewise_transmat[cumlen + t], + log_sitewise_self_transmat[cumlen + t], + ], + ] + ) + combined_transmat = np.kron( + np.exp(phases_switch_mat), np.exp(log_transmat) + ) combined_transmat = np.log(combined_transmat) for i in np.arange(log_emission.shape[0]): for j in np.arange(log_emission.shape[0]): - buf[j] = log_beta[j, (cumlen + t + 1)] + combined_transmat[i, j] + np.sum(log_emission[j, (cumlen + t + 1), :]) + buf[j] = ( + log_beta[j, (cumlen + t + 1)] + + combined_transmat[i, j] + + np.sum(log_emission[j, (cumlen + t + 1), :]) + ) log_beta[i, (cumlen + t)] = mylogsumexp(buf) cumlen += le return log_beta + # - def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, tumor_prop=None, \ - fix_NB_dispersion=False, shared_NB_dispersion=False, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - is_diag=False, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=100, tol=1e-4): - ''' + def run_baum_welch_nb_bb( + self, + X, + lengths, + n_states, + base_nb_mean, + total_bb_RD, + log_sitewise_transmat, + tumor_prop=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + is_diag=False, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + max_iter=100, + tol=1e-4, + ): + """ Input X: size n_observations * n_components * n_spots. lengths: sum of lengths = n_observations. @@ -239,41 +369,84 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, Intermediate log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. alpha: size of n_states. Dispersioon parameter of each HMM state. 
- ''' + """ n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 # initialize NB logmean shift and BetaBinom prob - log_mu = np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu - p_binom = np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom + log_mu = ( + np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T + if init_log_mu is None + else init_log_mu + ) + p_binom = ( + np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T + if init_p_binom is None + else init_p_binom + ) # initialize (inverse of) dispersion param in NB and BetaBinom - alphas = 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + alphas = ( + 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus # initialize start probability and emission probability - log_startprob = np.log( np.ones(n_states) / n_states ) + log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: - transmat = np.ones((n_states, n_states)) * (1-self.t) / (n_states-1) + transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: - log_transmat = np.zeros((1,1)) + log_transmat = np.zeros((1, 1)) # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) - unique_values_nb, mapping_matrices_nb = construct_unique_matrix(X[:,0,:], base_nb_mean) - unique_values_bb, mapping_matrices_bb = construct_unique_matrix(X[:,1,:], total_bb_RD) + unique_values_nb, mapping_matrices_nb = construct_unique_matrix( + X[:, 0, :], base_nb_mean + ) + unique_values_bb, mapping_matrices_bb = construct_unique_matrix( + X[:, 1, :], total_bb_RD + ) # EM algorithm for r in trange(max_iter): # E step if tumor_prop is None: - log_emission_rdr, log_emission_baf = hmm_sitewise.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission_rdr, log_emission_baf = ( + hmm_sitewise.compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ) + ) log_emission = log_emission_rdr + log_emission_baf else: - log_emission_rdr, log_emission_baf = hmm_sitewise.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop) + log_emission_rdr, log_emission_baf = ( + hmm_sitewise.compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + ) + ) log_emission = log_emission_rdr + log_emission_baf - log_alpha = hmm_sitewise.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) - log_beta = hmm_sitewise.backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_alpha = hmm_sitewise.forward_lattice( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) + log_beta = hmm_sitewise.backward_lattice( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log_emission) + log_xi = compute_posterior_transition_sitewise( + log_alpha, 
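Aside (not part of the patch): construct_unique_matrix is a package helper whose exact return format is not shown in this hunk; the idea behind the speed-up noted above is that the weighted Beta-Binomial likelihood depends only on the distinct (B-allele count, total count) pairs, so duplicate observations can be collapsed and their posterior weights summed. A rough illustration of that idea with np.unique (not the actual helper):

import numpy as np

# toy (B-allele count, total count) observations for one clone/spot
b_counts = np.array([3, 5, 3, 0, 5, 3])
totals   = np.array([10, 12, 10, 0, 12, 10])
weights  = np.array([0.2, 0.1, 0.4, 0.3, 0.6, 0.5])   # e.g. posterior gamma of one state

pairs = np.stack([b_counts, totals], axis=1)
uniq, inverse = np.unique(pairs, axis=0, return_inverse=True)

# sum the weights of identical observations; each likelihood term is then
# evaluated once per unique pair instead of once per bin
collapsed_weights = np.bincount(inverse.ravel(), weights=weights, minlength=len(uniq))

print(uniq)               # 3 distinct (b, total) pairs
print(collapsed_weights)  # [0.3, 1.1, 0.7] in the order of `uniq`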
log_beta, log_transmat, log_emission + ) # M step if "s" in self.params: new_log_startprob = update_startprob_sitewise(lengths, log_gamma) @@ -288,32 +461,79 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, # new_log_mu, new_alphas = update_emission_params_nb_sitewise(X[:,0,:], log_gamma, base_nb_mean, alphas, start_log_mu=log_mu, \ # fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) if tumor_prop is None: - new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, alphas, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + new_log_mu, new_alphas = ( + update_emission_params_nb_sitewise_uniqvalues( + unique_values_nb, + mapping_matrices_nb, + log_gamma, + base_nb_mean, + alphas, + start_log_mu=log_mu, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + ) + ) else: - new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, alphas, tumor_prop, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + new_log_mu, new_alphas = ( + update_emission_params_nb_sitewise_uniqvalues_mix( + unique_values_nb, + mapping_matrices_nb, + log_gamma, + base_nb_mean, + alphas, + tumor_prop, + start_log_mu=log_mu, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + ) + ) else: new_log_mu = log_mu new_alphas = alphas if "p" in self.params: if tumor_prop is None: - new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + new_p_binom, new_taus = ( + update_emission_params_bb_sitewise_uniqvalues( + unique_values_bb, + mapping_matrices_bb, + log_gamma, + total_bb_RD, + taus, + start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + ) + ) else: - new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, tumor_prop, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + new_p_binom, new_taus = ( + update_emission_params_bb_sitewise_uniqvalues_mix( + unique_values_bb, + mapping_matrices_bb, + log_gamma, + total_bb_RD, + taus, + tumor_prop, + start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + ) + ) else: new_p_binom = p_binom new_taus = taus # check convergence - print( np.mean(np.abs( np.exp(new_log_startprob) - np.exp(log_startprob) )), \ - np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )), \ - np.mean(np.abs(new_log_mu - log_mu)),\ - np.mean(np.abs(new_p_binom - p_binom)) ) - print( np.hstack([new_log_mu, new_p_binom]) ) - if np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )) < tol and \ - np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol: + print( + np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), + np.mean(np.abs(new_log_mu - log_mu)), + np.mean(np.abs(new_p_binom - p_binom)), + ) + print(np.hstack([new_log_mu, new_p_binom])) + if ( + 
np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol + and np.mean(np.abs(new_log_mu - log_mu)) < tol + and np.mean(np.abs(new_p_binom - p_binom)) < tol + ): break log_startprob = new_log_startprob log_transmat = new_log_transmat @@ -321,10 +541,30 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, alphas = new_alphas p_binom = new_p_binom taus = new_taus - return new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma + return ( + new_log_mu, + new_alphas, + new_p_binom, + new_taus, + new_log_startprob, + new_log_transmat, + log_gamma, + ) -def posterior_nb_bb_sitewise(X, lengths, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, log_startprob, log_transmat, log_sitewise_transmat): +def posterior_nb_bb_sitewise( + X, + lengths, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + log_startprob, + log_transmat, + log_sitewise_transmat, +): """ Attributes ---------- @@ -361,15 +601,35 @@ def posterior_nb_bb_sitewise(X, lengths, base_nb_mean, log_mu, alphas, total_bb_ log_sitewise_transmat : array, shape (n_observations) Log of phase switch probability of each gene (or bin). """ - log_emission_rdr, log_emission_baf = hmm_sitewise.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission_rdr, log_emission_baf = ( + hmm_sitewise.compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ) + ) log_emission = log_emission_rdr + log_emission_baf - log_alpha = hmm_sitewise.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) - log_beta = hmm_sitewise.backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_alpha = hmm_sitewise.forward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ) + log_beta = hmm_sitewise.backward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) return log_gamma -def loglikelihood_nb_bb_sitewise(X, lengths, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, log_startprob, log_transmat, log_sitewise_transmat): +def loglikelihood_nb_bb_sitewise( + X, + lengths, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + log_startprob, + log_transmat, + log_sitewise_transmat, +): """ Attributes ---------- @@ -406,85 +666,150 @@ def loglikelihood_nb_bb_sitewise(X, lengths, base_nb_mean, log_mu, alphas, total log_sitewise_transmat : array, shape (n_observations) Log of phase switch probability of each gene (or bin). """ - log_emission_rdr, log_emission_baf = hmm_sitewise.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission_rdr, log_emission_baf = ( + hmm_sitewise.compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ) + ) log_emission = log_emission_rdr + log_emission_baf - log_alpha = hmm_sitewise.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) - return np.sum(scipy.special.logsumexp(log_alpha[:,np.cumsum(lengths)-1], axis=0)), log_alpha - - -def viterbi_nb_bb_sitewise(X, lengths, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, log_startprob, log_transmat, log_sitewise_transmat): - ''' - Input - X: size n_observations * n_components * n_spots. 
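Aside (not part of the patch): loglikelihood_nb_bb_sitewise reads the total log-likelihood directly off the forward lattice — for each chromosome segment in `lengths`, log P(observations) is the logsumexp over states of log_alpha at that segment's last bin, and the segments add because they are modeled independently. A tiny sketch of that indexing with a stand-in forward lattice:

import numpy as np
from scipy.special import logsumexp

lengths = np.array([4, 3, 5])                 # bins per chromosome (toy)
n_paired_states, n_obs = 6, lengths.sum()
rng = np.random.default_rng(0)
log_alpha = rng.normal(size=(n_paired_states, n_obs))   # stand-in forward lattice

last_bins = np.cumsum(lengths) - 1            # [3, 6, 11]: last bin of each chromosome
llf = np.sum(logsumexp(log_alpha[:, last_bins], axis=0))

# equivalent per-chromosome form
llf_chrom = [logsumexp(log_alpha[:, end]) for end in last_bins]
assert np.isclose(llf, np.sum(llf_chrom))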
- lengths: sum of lengths = n_observations. - exposures: size of n_observations * n_spots. - base_prob: size of n_observations. The expression probability derived from normal spots. - log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. - alpha: size of n_states. Dispersioon parameter of each HMM state. - log_transmat: n_states * n_states. Transition probability after log transformation. - log_startprob: n_states. Start probability after log transformation. - Output -# log_prob: a scalar. - labels: size of n_observations. - Intermediate - log_emission: n_states * n_observations * n_spots. Log probability. - log_v: n_states * n_observations per chromosome. Log of viterbi DP table. v[i,t] = max_{q_1, ..., q_{t-1}} P(o_1, q_1, ..., o_{t-1}, q_{t-1}, o_t, q_t=i | lambda). - ''' + log_alpha = hmm_sitewise.forward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ) + return ( + np.sum(scipy.special.logsumexp(log_alpha[:, np.cumsum(lengths) - 1], axis=0)), + log_alpha, + ) + + +def viterbi_nb_bb_sitewise( + X, + lengths, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + log_startprob, + log_transmat, + log_sitewise_transmat, +): + """ + Input + X: size n_observations * n_components * n_spots. + lengths: sum of lengths = n_observations. + exposures: size of n_observations * n_spots. + base_prob: size of n_observations. The expression probability derived from normal spots. + log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. + alpha: size of n_states. Dispersioon parameter of each HMM state. + log_transmat: n_states * n_states. Transition probability after log transformation. + log_startprob: n_states. Start probability after log transformation. + Output + # log_prob: a scalar. + labels: size of n_observations. + Intermediate + log_emission: n_states * n_observations * n_spots. Log probability. + log_v: n_states * n_observations per chromosome. Log of viterbi DP table. v[i,t] = max_{q_1, ..., q_{t-1}} P(o_1, q_1, ..., o_{t-1}, q_{t-1}, o_t, q_t=i | lambda). 
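Aside (not part of the patch): the Viterbi decoder below works on the same 2*n_states paired states; `labels` keeps the full (phase, CNV) index while `merged_labels` collapses the two phases onto the underlying CNV state. A one-line sketch of that decoding, assuming the indexing convention used throughout this file:

import numpy as np

n_states = 5
labels = np.array([0, 7, 4, 9, 2])      # toy paired-state labels in [0, 2*n_states)

phase = labels // n_states              # 0 = original phasing, 1 = switched
merged_labels = labels % n_states       # underlying CNV state, phases collapsed

print(phase)           # [0 1 0 1 0]
print(merged_labels)   # [0 2 4 4 2]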
+ """ n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] n_states = log_transmat.shape[0] log_sitewise_self_transmat = np.log(1 - np.exp(log_sitewise_transmat)) - log_emission_rdr, log_emission_baf = hmm_sitewise.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission_rdr, log_emission_baf = ( + hmm_sitewise.compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ) + ) log_emission = log_emission_rdr + log_emission_baf # initialize viterbi DP table and backtracking table labels = np.array([]) merged_labels = np.array([]) cumlen = 0 for le in lengths: - log_v = np.zeros((2*n_states, le)) - bt = np.zeros((2*n_states, le)) + log_v = np.zeros((2 * n_states, le)) + bt = np.zeros((2 * n_states, le)) for t in np.arange(le): if cumlen == 0 and t == 0: - log_v[:, 0] = np.mean(log_emission[:,0,:], axis=1) + np.append(log_startprob,log_startprob) + np.log(0.5) + log_v[:, 0] = ( + np.mean(log_emission[:, 0, :], axis=1) + + np.append(log_startprob, log_startprob) + + np.log(0.5) + ) continue - for i in np.arange(2*n_states): + for i in np.arange(2 * n_states): if t > 0: - tmp = log_v[:, (t-1)] + np.append(log_transmat[:,i - n_states * int(i/n_states)], log_transmat[:,i - n_states * int(i/n_states)]) + np.sum(log_emission[i, (cumlen+t), :]) + tmp = ( + log_v[:, (t - 1)] + + np.append( + log_transmat[:, i - n_states * int(i / n_states)], + log_transmat[:, i - n_states * int(i / n_states)], + ) + + np.sum(log_emission[i, (cumlen + t), :]) + ) else: - tmp = np.append(log_startprob[i - n_states * int(i/n_states)], log_startprob[i - n_states * int(i/n_states)]) + np.sum(log_emission[i, (cumlen+t), :]) + tmp = np.append( + log_startprob[i - n_states * int(i / n_states)], + log_startprob[i - n_states * int(i / n_states)], + ) + np.sum(log_emission[i, (cumlen + t), :]) bt[i, t] = np.argmax(tmp) log_v[i, t] = np.max(tmp) # backtracking to get the sequence - chr_labels = [ np.argmax(log_v[:,-1]) ] - + chr_labels = [np.argmax(log_v[:, -1])] + if cumlen == 0: - for t2 in np.arange(le-1, 0, -1): - chr_labels.append( int(bt[chr_labels[-1],t2])) + for t2 in np.arange(le - 1, 0, -1): + chr_labels.append(int(bt[chr_labels[-1], t2])) else: - for t2 in np.arange(le-2, -1, -1): - chr_labels.append( int(bt[chr_labels[-1],t2])) + for t2 in np.arange(le - 2, -1, -1): + chr_labels.append(int(bt[chr_labels[-1], t2])) chr_labels = np.array(chr_labels[::-1]).astype(int) # merge two phases chr_merged_labels = copy.copy(chr_labels) - chr_merged_labels[chr_merged_labels >= n_states] = chr_merged_labels[chr_merged_labels >= n_states] - n_states - + chr_merged_labels[chr_merged_labels >= n_states] = ( + chr_merged_labels[chr_merged_labels >= n_states] - n_states + ) + if cumlen == 0: labels = chr_labels merged_labels = chr_merged_labels else: labels = np.append(labels, chr_labels) merged_labels = np.append(merged_labels, chr_merged_labels) - + cumlen += le return labels, merged_labels -def pipeline_baum_welch(output_prefix, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, tumor_prop=None, \ - hmmclass=hmm_sitewise, params="smp", t=1-1e-6, random_state=0, \ - in_log_space=True, only_minor=False, fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True, \ - init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, is_diag=True, max_iter=100, tol=1e-4, **kwargs): +def pipeline_baum_welch( + output_prefix, + X, + lengths, + n_states, + 
base_nb_mean, + total_bb_RD, + log_sitewise_transmat, + tumor_prop=None, + hmmclass=hmm_sitewise, + params="smp", + t=1 - 1e-6, + random_state=0, + in_log_space=True, + only_minor=False, + fix_NB_dispersion=False, + shared_NB_dispersion=True, + fix_BB_dispersion=False, + shared_BB_dispersion=True, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + is_diag=True, + max_iter=100, + tol=1e-4, + **kwargs, +): """ tumor_prop : array, (n_obs, n_spots) Probability of sequencing a tumor read. (tumor cell proportion weighted by ploidy) @@ -492,15 +817,26 @@ def pipeline_baum_welch(output_prefix, X, lengths, n_states, base_nb_mean, total """ # initialization n_spots = X.shape[2] - if ((init_log_mu is None) and ("m" in params)) or ((init_p_binom is None) and ("p" in params)): - tmp_log_mu, tmp_p_binom = initialization_by_gmm(n_states, X, base_nb_mean, total_bb_RD, params, random_state=random_state, in_log_space=in_log_space, only_minor=only_minor) + if ((init_log_mu is None) and ("m" in params)) or ( + (init_p_binom is None) and ("p" in params) + ): + tmp_log_mu, tmp_p_binom = initialization_by_gmm( + n_states, + X, + base_nb_mean, + total_bb_RD, + params, + random_state=random_state, + in_log_space=in_log_space, + only_minor=only_minor, + ) if (init_log_mu is None) and ("m" in params): init_log_mu = tmp_log_mu if (init_p_binom is None) and ("p" in params): init_p_binom = tmp_p_binom print(f"init_log_mu = {init_log_mu}") print(f"init_p_binom = {init_p_binom}") - + # fit HMM-NB-BetaBinom # new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat = hmmmodel.run_baum_welch_nb_bb(X, lengths, \ # n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, tumor_prop, \ @@ -509,34 +845,122 @@ def pipeline_baum_welch(output_prefix, X, lengths, n_states, base_nb_mean, total # is_diag=is_diag, init_log_mu=init_log_mu, init_p_binom=init_p_binom, init_alphas=init_alphas, init_taus=init_taus, \ # max_iter=max_iter, tol=tol) hmmmodel = hmmclass(params=params, t=t) - remain_kwargs = {k:v for k,v in kwargs.items() if k in ["lambd", "sample_length", "log_gamma"]} - new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma = hmmmodel.run_baum_welch_nb_bb(X, lengths, \ - n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, tumor_prop, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ - is_diag=is_diag, init_log_mu=init_log_mu, init_p_binom=init_p_binom, init_alphas=init_alphas, init_taus=init_taus, \ - max_iter=max_iter, tol=tol, **remain_kwargs) + remain_kwargs = { + k: v for k, v in kwargs.items() if k in ["lambd", "sample_length", "log_gamma"] + } + ( + new_log_mu, + new_alphas, + new_p_binom, + new_taus, + new_log_startprob, + new_log_transmat, + log_gamma, + ) = hmmmodel.run_baum_welch_nb_bb( + X, + lengths, + n_states, + base_nb_mean, + total_bb_RD, + log_sitewise_transmat, + tumor_prop, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=is_diag, + init_log_mu=init_log_mu, + init_p_binom=init_p_binom, + init_alphas=init_alphas, + init_taus=init_taus, + max_iter=max_iter, + tol=tol, + **remain_kwargs, + ) # likelihood if tumor_prop is None: - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom(X, base_nb_mean, new_log_mu, new_alphas, 
total_bb_RD, new_p_binom, new_taus) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + X, + base_nb_mean, + new_log_mu, + new_alphas, + total_bb_RD, + new_p_binom, + new_taus, + ) + ) log_emission = log_emission_rdr + log_emission_baf else: if ("m" in params) and ("sample_length" in kwargs): logmu_shift = [] for c in range(len(kwargs["sample_length"])): - this_pred_cnv = np.argmax(log_gamma[:,np.sum(kwargs["sample_length"][:c]):np.sum(kwargs["sample_length"][:(c+1)])], axis=0)%n_states - logmu_shift.append( scipy.special.logsumexp(new_log_mu[this_pred_cnv,:] + np.log(kwargs["lambd"]).reshape(-1,1), axis=0) ) + this_pred_cnv = ( + np.argmax( + log_gamma[ + :, + np.sum(kwargs["sample_length"][:c]) : np.sum( + kwargs["sample_length"][: (c + 1)] + ), + ], + axis=0, + ) + % n_states + ) + logmu_shift.append( + scipy.special.logsumexp( + new_log_mu[this_pred_cnv, :] + + np.log(kwargs["lambd"]).reshape(-1, 1), + axis=0, + ) + ) logmu_shift = np.vstack(logmu_shift) - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, new_log_mu, new_alphas, total_bb_RD, new_p_binom, new_taus, tumor_prop, logmu_shift=logmu_shift, sample_length=kwargs["sample_length"]) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + new_log_mu, + new_alphas, + total_bb_RD, + new_p_binom, + new_taus, + tumor_prop, + logmu_shift=logmu_shift, + sample_length=kwargs["sample_length"], + ) + ) else: - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, new_log_mu, new_alphas, total_bb_RD, new_p_binom, new_taus, tumor_prop) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + new_log_mu, + new_alphas, + total_bb_RD, + new_p_binom, + new_taus, + tumor_prop, + ) + ) # log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, new_log_mu, new_alphas, total_bb_RD, new_p_binom, new_taus, tumor_prop) log_emission = log_emission_rdr + log_emission_baf - log_alpha = hmmclass.forward_lattice(lengths, new_log_transmat, new_log_startprob, log_emission, log_sitewise_transmat) - llf = np.sum(scipy.special.logsumexp(log_alpha[:,np.cumsum(lengths)-1], axis=0)) + log_alpha = hmmclass.forward_lattice( + lengths, + new_log_transmat, + new_log_startprob, + log_emission, + log_sitewise_transmat, + ) + llf = np.sum(scipy.special.logsumexp(log_alpha[:, np.cumsum(lengths) - 1], axis=0)) - log_beta = hmmclass.backward_lattice(lengths, new_log_transmat, new_log_startprob, log_emission, log_sitewise_transmat) + log_beta = hmmclass.backward_lattice( + lengths, + new_log_transmat, + new_log_startprob, + log_emission, + log_sitewise_transmat, + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) pred = np.argmax(log_gamma, axis=0) pred_cnv = pred % n_states @@ -544,22 +968,48 @@ def pipeline_baum_welch(output_prefix, X, lengths, n_states, base_nb_mean, total # save results if not output_prefix is None: tmp = np.log10(1 - t) - np.savez(f"{output_prefix}_nstates{n_states}_{params}_{tmp:.0f}_seed{random_state}.npz", \ - new_log_mu=new_log_mu, new_alphas=new_alphas, new_p_binom=new_p_binom, new_taus=new_taus, \ - new_log_startprob=new_log_startprob, new_log_transmat=new_log_transmat, log_gamma=log_gamma, pred_cnv=pred_cnv, llf=llf) + np.savez( + 
f"{output_prefix}_nstates{n_states}_{params}_{tmp:.0f}_seed{random_state}.npz", + new_log_mu=new_log_mu, + new_alphas=new_alphas, + new_p_binom=new_p_binom, + new_taus=new_taus, + new_log_startprob=new_log_startprob, + new_log_transmat=new_log_transmat, + log_gamma=log_gamma, + pred_cnv=pred_cnv, + llf=llf, + ) else: - res = {"new_log_mu":new_log_mu, "new_alphas":new_alphas, "new_p_binom":new_p_binom, "new_taus":new_taus, \ - "new_log_startprob":new_log_startprob, "new_log_transmat":new_log_transmat, "log_gamma":log_gamma, "pred_cnv":pred_cnv, "llf":llf} + res = { + "new_log_mu": new_log_mu, + "new_alphas": new_alphas, + "new_p_binom": new_p_binom, + "new_taus": new_taus, + "new_log_startprob": new_log_startprob, + "new_log_transmat": new_log_transmat, + "log_gamma": log_gamma, + "pred_cnv": pred_cnv, + "llf": llf, + } return res -def eval_neymanpearson_bafonly(log_emission_baf_c1, pred_c1, log_emission_baf_c2, pred_c2, bidx, n_states, res, p): - assert log_emission_baf_c1.shape[0] == n_states or log_emission_baf_c1.shape[0] == 2 * n_states +def eval_neymanpearson_bafonly( + log_emission_baf_c1, pred_c1, log_emission_baf_c2, pred_c2, bidx, n_states, res, p +): + assert ( + log_emission_baf_c1.shape[0] == n_states + or log_emission_baf_c1.shape[0] == 2 * n_states + ) # likelihood under the corresponding state - llf_original = np.append(log_emission_baf_c1[pred_c1[bidx], bidx], log_emission_baf_c2[pred_c2[bidx], bidx]).reshape(-1,1) + llf_original = np.append( + log_emission_baf_c1[pred_c1[bidx], bidx], + log_emission_baf_c2[pred_c2[bidx], bidx], + ).reshape(-1, 1) # likelihood under the switched state if log_emission_baf_c1.shape[0] == 2 * n_states: - if (res["new_p_binom"][p[0],0] > 0.5) == (res["new_p_binom"][p[1],0] > 0.5): + if (res["new_p_binom"][p[0], 0] > 0.5) == (res["new_p_binom"][p[1], 0] > 0.5): switch_pred_c1 = n_states * (pred_c1 >= n_states) + (pred_c2 % n_states) switch_pred_c2 = n_states * (pred_c2 >= n_states) + (pred_c1 % n_states) else: @@ -568,19 +1018,40 @@ def eval_neymanpearson_bafonly(log_emission_baf_c1, pred_c1, log_emission_baf_c2 else: switch_pred_c1 = pred_c2 switch_pred_c2 = pred_c1 - llf_switch = np.append(log_emission_baf_c1[switch_pred_c1[bidx], bidx], log_emission_baf_c2[switch_pred_c2[bidx], bidx]).reshape(-1,1) + llf_switch = np.append( + log_emission_baf_c1[switch_pred_c1[bidx], bidx], + log_emission_baf_c2[switch_pred_c2[bidx], bidx], + ).reshape(-1, 1) # log likelihood difference return np.mean(llf_original) - np.mean(llf_switch) -def eval_neymanpearson_rdrbaf(log_emission_rdr_c1, log_emission_baf_c1, pred_c1, log_emission_rdr_c2, log_emission_baf_c2, pred_c2, bidx, n_states, res, p): - assert log_emission_baf_c1.shape[0] == n_states or log_emission_baf_c1.shape[0] == 2 * n_states +def eval_neymanpearson_rdrbaf( + log_emission_rdr_c1, + log_emission_baf_c1, + pred_c1, + log_emission_rdr_c2, + log_emission_baf_c2, + pred_c2, + bidx, + n_states, + res, + p, +): + assert ( + log_emission_baf_c1.shape[0] == n_states + or log_emission_baf_c1.shape[0] == 2 * n_states + ) # likelihood under the corresponding state - llf_original = np.append(log_emission_rdr_c1[pred_c1[bidx], bidx] + log_emission_baf_c1[pred_c1[bidx], bidx], \ - log_emission_rdr_c2[pred_c2[bidx], bidx] + log_emission_baf_c2[pred_c2[bidx], bidx]).reshape(-1,1) + llf_original = np.append( + log_emission_rdr_c1[pred_c1[bidx], bidx] + + log_emission_baf_c1[pred_c1[bidx], bidx], + log_emission_rdr_c2[pred_c2[bidx], bidx] + + log_emission_baf_c2[pred_c2[bidx], bidx], + ).reshape(-1, 1) # likelihood 
under the switched state if log_emission_baf_c1.shape[0] == 2 * n_states: - if (res["new_p_binom"][p[0],0] > 0.5) == (res["new_p_binom"][p[1],0] > 0.5): + if (res["new_p_binom"][p[0], 0] > 0.5) == (res["new_p_binom"][p[1], 0] > 0.5): switch_pred_c1 = n_states * (pred_c1 >= n_states) + (pred_c2 % n_states) switch_pred_c2 = n_states * (pred_c2 >= n_states) + (pred_c1 % n_states) else: @@ -589,162 +1060,378 @@ def eval_neymanpearson_rdrbaf(log_emission_rdr_c1, log_emission_baf_c1, pred_c1, else: switch_pred_c1 = pred_c2 switch_pred_c2 = pred_c1 - llf_switch = np.append(log_emission_rdr_c1[switch_pred_c1[bidx], bidx] + log_emission_baf_c1[switch_pred_c1[bidx], bidx], \ - log_emission_rdr_c2[switch_pred_c2[bidx], bidx] + log_emission_baf_c2[switch_pred_c2[bidx], bidx]).reshape(-1,1) + llf_switch = np.append( + log_emission_rdr_c1[switch_pred_c1[bidx], bidx] + + log_emission_baf_c1[switch_pred_c1[bidx], bidx], + log_emission_rdr_c2[switch_pred_c2[bidx], bidx] + + log_emission_baf_c2[switch_pred_c2[bidx], bidx], + ).reshape(-1, 1) # log likelihood difference return np.mean(llf_original) - np.mean(llf_switch) -def compute_neymanpearson_stats(X, base_nb_mean, total_bb_RD, res, params, tumor_prop, hmmclass): +def compute_neymanpearson_stats( + X, base_nb_mean, total_bb_RD, res, params, tumor_prop, hmmclass +): n_obs = X.shape[0] n_states = res["new_p_binom"].shape[0] n_clones = X.shape[2] lambd = np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean) # if tumor_prop is None: - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ - total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"]) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + base_nb_mean.flatten("F").reshape(-1, 1), + res["new_log_mu"], + res["new_alphas"], + total_bb_RD.flatten("F").reshape(-1, 1), + res["new_p_binom"], + res["new_taus"], + ) + ) else: if "m" in params: logmu_shift = [] for c in range(n_clones): - this_pred_cnv = np.argmax(res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0)%n_states - logmu_shift.append( scipy.special.logsumexp(res["new_log_mu"][this_pred_cnv,:] + np.log(lambd).reshape(-1,1), axis=0) ) + this_pred_cnv = ( + np.argmax( + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], axis=0 + ) + % n_states + ) + logmu_shift.append( + scipy.special.logsumexp( + res["new_log_mu"][this_pred_cnv, :] + + np.log(lambd).reshape(-1, 1), + axis=0, + ) + ) logmu_shift = np.vstack(logmu_shift) - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ - total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"], tumor_prop, logmu_shift=logmu_shift, sample_length=np.ones(n_clones,dtype=int)*n_obs) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.vstack( + [X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")] + ).T.reshape(-1, 2, 1), + base_nb_mean.flatten("F").reshape(-1, 1), + res["new_log_mu"], + res["new_alphas"], + total_bb_RD.flatten("F").reshape(-1, 1), + res["new_p_binom"], + res["new_taus"], + tumor_prop, 
+ logmu_shift=logmu_shift, + sample_length=np.ones(n_clones, dtype=int) * n_obs, + ) + ) else: - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ - total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"], tumor_prop) - log_emission_rdr = log_emission_rdr.reshape((log_emission_rdr.shape[0], n_obs, n_clones), order="F") - log_emission_baf = log_emission_baf.reshape((log_emission_baf.shape[0], n_obs, n_clones), order="F") - reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2],-1)) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.vstack( + [X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")] + ).T.reshape(-1, 2, 1), + base_nb_mean.flatten("F").reshape(-1, 1), + res["new_log_mu"], + res["new_alphas"], + total_bb_RD.flatten("F").reshape(-1, 1), + res["new_p_binom"], + res["new_taus"], + tumor_prop, + ) + ) + log_emission_rdr = log_emission_rdr.reshape( + (log_emission_rdr.shape[0], n_obs, n_clones), order="F" + ) + log_emission_baf = log_emission_baf.reshape( + (log_emission_baf.shape[0], n_obs, n_clones), order="F" + ) + reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states - all_test_statistics = {(c1, c2):[] for c1 in range(n_clones) for c2 in range(c1+1, n_clones)} + all_test_statistics = { + (c1, c2): [] for c1 in range(n_clones) for c2 in range(c1 + 1, n_clones) + } for c1 in range(n_clones): - for c2 in range(c1+1, n_clones): + for c2 in range(c1 + 1, n_clones): # unmergeable_bincount = 0 - unique_pair_states = [x for x in np.unique(reshaped_pred_cnv[np.array([c1,c2]), :], axis=1).T if x[0] != x[1]] + unique_pair_states = [ + x + for x in np.unique(reshaped_pred_cnv[np.array([c1, c2]), :], axis=1).T + if x[0] != x[1] + ] list_t_neymanpearson = [] for p in unique_pair_states: - bidx = np.where( (reshaped_pred_cnv[c1,:]==p[0]) & (reshaped_pred_cnv[c2,:]==p[1]) )[0] + bidx = np.where( + (reshaped_pred_cnv[c1, :] == p[0]) + & (reshaped_pred_cnv[c2, :] == p[1]) + )[0] if "m" in params and "p" in params: - t_neymanpearson = eval_neymanpearson_rdrbaf(log_emission_rdr[:,:,c1], log_emission_baf[:,:,c1], reshaped_pred[c1,:], log_emission_rdr[:,:,c2], log_emission_baf[:,:,c2], reshaped_pred[c2,:], bidx, n_states, res, p) + t_neymanpearson = eval_neymanpearson_rdrbaf( + log_emission_rdr[:, :, c1], + log_emission_baf[:, :, c1], + reshaped_pred[c1, :], + log_emission_rdr[:, :, c2], + log_emission_baf[:, :, c2], + reshaped_pred[c2, :], + bidx, + n_states, + res, + p, + ) elif "p" in params: - t_neymanpearson = eval_neymanpearson_bafonly(log_emission_baf[:,:,c1], reshaped_pred[c1,:], log_emission_baf[:,:,c2], reshaped_pred[c2,:], bidx, n_states, res, p) - all_test_statistics[(c1, c2)].append( (p[0], p[1], t_neymanpearson) ) - + t_neymanpearson = eval_neymanpearson_bafonly( + log_emission_baf[:, :, c1], + reshaped_pred[c1, :], + log_emission_baf[:, :, c2], + reshaped_pred[c2, :], + bidx, + n_states, + res, + p, + ) + all_test_statistics[(c1, c2)].append((p[0], p[1], t_neymanpearson)) + return all_test_statistics -def similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, threshold=2.0, minlength=10, topk=10, params="smp", tumor_prop=None, hmmclass=hmm_sitewise, **kwargs): +def similarity_components_rdrbaf_neymanpearson( + 
X, + base_nb_mean, + total_bb_RD, + res, + threshold=2.0, + minlength=10, + topk=10, + params="smp", + tumor_prop=None, + hmmclass=hmm_sitewise, + **kwargs, +): n_obs = X.shape[0] n_states = res["new_p_binom"].shape[0] n_clones = X.shape[2] G = nx.Graph() - G.add_nodes_from( np.arange(n_clones) ) + G.add_nodes_from(np.arange(n_clones)) # lambd = np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean) # if tumor_prop is None: - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ - total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"]) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + base_nb_mean.flatten("F").reshape(-1, 1), + res["new_log_mu"], + res["new_alphas"], + total_bb_RD.flatten("F").reshape(-1, 1), + res["new_p_binom"], + res["new_taus"], + ) + ) else: if "m" in params: logmu_shift = [] for c in range(n_clones): - this_pred_cnv = np.argmax(res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0)%n_states - logmu_shift.append( scipy.special.logsumexp(res["new_log_mu"][this_pred_cnv,:] + np.log(lambd).reshape(-1,1), axis=0) ) + this_pred_cnv = ( + np.argmax( + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], axis=0 + ) + % n_states + ) + logmu_shift.append( + scipy.special.logsumexp( + res["new_log_mu"][this_pred_cnv, :] + + np.log(lambd).reshape(-1, 1), + axis=0, + ) + ) logmu_shift = np.vstack(logmu_shift) - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ - total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"], tumor_prop, logmu_shift=logmu_shift, sample_length=np.ones(n_clones,dtype=int)*n_obs) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.vstack( + [X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")] + ).T.reshape(-1, 2, 1), + base_nb_mean.flatten("F").reshape(-1, 1), + res["new_log_mu"], + res["new_alphas"], + total_bb_RD.flatten("F").reshape(-1, 1), + res["new_p_binom"], + res["new_taus"], + tumor_prop, + logmu_shift=logmu_shift, + sample_length=np.ones(n_clones, dtype=int) * n_obs, + ) + ) else: - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ - total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"], tumor_prop) - log_emission_rdr = log_emission_rdr.reshape((log_emission_rdr.shape[0], n_obs, n_clones), order="F") - log_emission_baf = log_emission_baf.reshape((log_emission_baf.shape[0], n_obs, n_clones), order="F") - reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2],-1)) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.vstack( + [X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")] + ).T.reshape(-1, 2, 1), + base_nb_mean.flatten("F").reshape(-1, 1), + res["new_log_mu"], + res["new_alphas"], + total_bb_RD.flatten("F").reshape(-1, 1), + 
res["new_p_binom"], + res["new_taus"], + tumor_prop, + ) + ) + log_emission_rdr = log_emission_rdr.reshape( + (log_emission_rdr.shape[0], n_obs, n_clones), order="F" + ) + log_emission_baf = log_emission_baf.reshape( + (log_emission_baf.shape[0], n_obs, n_clones), order="F" + ) + reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states all_test_statistics = [] for c1 in range(n_clones): - for c2 in range(c1+1, n_clones): + for c2 in range(c1 + 1, n_clones): # unmergeable_bincount = 0 - unique_pair_states = [x for x in np.unique(reshaped_pred_cnv[np.array([c1,c2]), :], axis=1).T if x[0] != x[1]] + unique_pair_states = [ + x + for x in np.unique(reshaped_pred_cnv[np.array([c1, c2]), :], axis=1).T + if x[0] != x[1] + ] list_t_neymanpearson = [] for p in unique_pair_states: - bidx = np.where( (reshaped_pred_cnv[c1,:]==p[0]) & (reshaped_pred_cnv[c2,:]==p[1]) )[0] + bidx = np.where( + (reshaped_pred_cnv[c1, :] == p[0]) + & (reshaped_pred_cnv[c2, :] == p[1]) + )[0] if "m" in params and "p" in params: - t_neymanpearson = eval_neymanpearson_rdrbaf(log_emission_rdr[:,:,c1], log_emission_baf[:,:,c1], reshaped_pred[c1,:], log_emission_rdr[:,:,c2], log_emission_baf[:,:,c2], reshaped_pred[c2,:], bidx, n_states, res, p) + t_neymanpearson = eval_neymanpearson_rdrbaf( + log_emission_rdr[:, :, c1], + log_emission_baf[:, :, c1], + reshaped_pred[c1, :], + log_emission_rdr[:, :, c2], + log_emission_baf[:, :, c2], + reshaped_pred[c2, :], + bidx, + n_states, + res, + p, + ) elif "p" in params: - t_neymanpearson = eval_neymanpearson_bafonly(log_emission_baf[:,:,c1], reshaped_pred[c1,:], log_emission_baf[:,:,c2], reshaped_pred[c2,:], bidx, n_states, res, p) + t_neymanpearson = eval_neymanpearson_bafonly( + log_emission_baf[:, :, c1], + reshaped_pred[c1, :], + log_emission_baf[:, :, c2], + reshaped_pred[c2, :], + bidx, + n_states, + res, + p, + ) print(c1, c2, p, len(bidx), t_neymanpearson) - all_test_statistics.append( [c1, c2, p, t_neymanpearson] ) + all_test_statistics.append([c1, c2, p, t_neymanpearson]) if len(bidx) >= minlength: list_t_neymanpearson.append(t_neymanpearson) - if len(list_t_neymanpearson) == 0 or np.max(list_t_neymanpearson) < threshold: - max_v = np.max(list_t_neymanpearson) if len(list_t_neymanpearson) > 0 else 1e-3 - G.add_weighted_edges_from([ (c1, c2, max_v) ]) + if ( + len(list_t_neymanpearson) == 0 + or np.max(list_t_neymanpearson) < threshold + ): + max_v = ( + np.max(list_t_neymanpearson) + if len(list_t_neymanpearson) > 0 + else 1e-3 + ) + G.add_weighted_edges_from([(c1, c2, max_v)]) # maximal cliques cliques = [] for x in nx.find_cliques(G): this_len = len(x) - this_weights = np.sum([G.get_edge_data(a,b)["weight"] for a in x for b in x if a != b]) / 2 - cliques.append( (x, this_len, this_weights) ) - cliques.sort(key = lambda x:(-x[1],x[2]) ) + this_weights = ( + np.sum([G.get_edge_data(a, b)["weight"] for a in x for b in x if a != b]) + / 2 + ) + cliques.append((x, this_len, this_weights)) + cliques.sort(key=lambda x: (-x[1], x[2])) covered_nodes = set() merging_groups = [] for c in cliques: if len(set(c[0]) & covered_nodes) == 0: - merging_groups.append( list(c[0]) ) + merging_groups.append(list(c[0])) covered_nodes = covered_nodes | set(c[0]) for c in range(n_clones): if not (c in covered_nodes): - merging_groups.append( [c] ) + merging_groups.append([c]) covered_nodes.add(c) - merging_groups.sort(key = lambda x:np.min(x)) + merging_groups.sort(key=lambda x: np.min(x)) # clone assignment after merging map_clone_id = 
{} - for i,x in enumerate(merging_groups): + for i, x in enumerate(merging_groups): for z in x: map_clone_id[z] = i new_assignment = np.array([map_clone_id[x] for x in res["new_assignment"]]) merged_res = copy.copy(res) merged_res["new_assignment"] = new_assignment merged_res["total_llf"] = np.NAN - merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) - merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) + merged_res["pred_cnv"] = np.concatenate( + [ + res["pred_cnv"][(c[0] * n_obs) : (c[0] * n_obs + n_obs)] + for c in merging_groups + ] + ) + merged_res["log_gamma"] = np.hstack( + [ + res["log_gamma"][:, (c[0] * n_obs) : (c[0] * n_obs + n_obs)] + for c in merging_groups + ] + ) return merging_groups, merged_res -def combine_similar_states_across_clones(X, base_nb_mean, total_bb_RD, res, params="smp", tumor_prop=None, hmmclass=hmm_sitewise, merge_threshold=0.1, **kwargs): +def combine_similar_states_across_clones( + X, + base_nb_mean, + total_bb_RD, + res, + params="smp", + tumor_prop=None, + hmmclass=hmm_sitewise, + merge_threshold=0.1, + **kwargs, +): n_clones = X.shape[2] n_obs = X.shape[0] n_states = res["new_p_binom"].shape[0] - reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2],-1)) + reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states # - all_test_statistics = compute_neymanpearson_stats(X, base_nb_mean, total_bb_RD, res, params, tumor_prop, hmmclass) + all_test_statistics = compute_neymanpearson_stats( + X, base_nb_mean, total_bb_RD, res, params, tumor_prop, hmmclass + ) # make the pair of states consistent between clone c1 and clone c2 if their t_neymanpearson test statistics is small for c1 in range(n_clones): - for c2 in range(c1+1, n_clones): + for c2 in range(c1 + 1, n_clones): list_t_neymanpearson = all_test_statistics[(c1, c2)] for p1, p2, t_neymanpearson in list_t_neymanpearson: if t_neymanpearson < merge_threshold: - c_keep = c1 if np.sum(total_bb_RD[:,c1]) > np.sum(total_bb_RD[:,c2]) else c2 + c_keep = ( + c1 + if np.sum(total_bb_RD[:, c1]) > np.sum(total_bb_RD[:, c2]) + else c2 + ) c_change = c2 if c_keep == c1 else c1 - bidx = np.where( (reshaped_pred_cnv[c1,:]==p1) & (reshaped_pred_cnv[c2,:]==p2) )[0] - res['pred_cnv'][(c_change*n_obs):(c_change*n_obs+n_obs)][bidx] = res['pred_cnv'][(c_keep*n_obs):(c_keep*n_obs+n_obs)][bidx] - print(f"Merging states {[p1,p2]} in clone {c1} and clone {c2}. NP statistics = {t_neymanpearson}") + bidx = np.where( + (reshaped_pred_cnv[c1, :] == p1) + & (reshaped_pred_cnv[c2, :] == p2) + )[0] + res["pred_cnv"][(c_change * n_obs) : (c_change * n_obs + n_obs)][ + bidx + ] = res["pred_cnv"][(c_keep * n_obs) : (c_keep * n_obs + n_obs)][ + bidx + ] + print( + f"Merging states {[p1,p2]} in clone {c1} and clone {c2}. 
NP statistics = {t_neymanpearson}" + ) return res - # def similarity_components_rdrbaf_neymanpearson_posterior(X, base_nb_mean, total_bb_RD, res, threshold=2.0, minlength=10, topk=10, params="smp", tumor_prop=None, hmmclass=hmm_sitewise): # n_obs = X.shape[0] # n_states = res["new_p_binom"].shape[0] @@ -755,7 +1442,7 @@ def combine_similar_states_across_clones(X, base_nb_mean, total_bb_RD, res, para # def eval_neymanpearson_bafonly(log_emission_baf_c1, log_gamma_c1, log_emission_baf_c2, log_gamma_c2, bidx, n_states, res, p): # assert log_emission_baf_c1.shape[0] == n_states or log_emission_baf_c1.shape[0] == 2 * n_states # # likelihood under the corresponding state -# llf_original = np.append(scipy.special.logsumexp(log_emission_baf_c1[:, bidx] + log_gamma_c1[:, bidx], axis=0), +# llf_original = np.append(scipy.special.logsumexp(log_emission_baf_c1[:, bidx] + log_gamma_c1[:, bidx], axis=0), # scipy.special.logsumexp(log_emission_baf_c2[:, bidx] + log_gamma_c2[:, bidx], axis=0)) # # likelihood under the switched state # if log_emission_baf_c1.shape[0] == 2 * n_states: @@ -773,7 +1460,7 @@ def combine_similar_states_across_clones(X, base_nb_mean, total_bb_RD, res, para # else: # switch_log_gamma_c1 = log_gamma_c2 # switch_log_gamma_c2 = log_gamma_c1 -# llf_switch = np.append(scipy.special.logsumexp(log_emission_baf_c1[:, bidx] + switch_log_gamma_c1[:, bidx], axis=0), +# llf_switch = np.append(scipy.special.logsumexp(log_emission_baf_c1[:, bidx] + switch_log_gamma_c1[:, bidx], axis=0), # scipy.special.logsumexp(log_emission_baf_c2[:, bidx] + switch_log_gamma_c2[:, bidx], axis=0)) # # log likelihood difference # return np.mean(llf_original) - np.mean(llf_switch) diff --git a/src/calicost/hmm_NB_sharedstates.py b/src/calicost/hmm_NB_sharedstates.py index a265810..8722ef3 100644 --- a/src/calicost/hmm_NB_sharedstates.py +++ b/src/calicost/hmm_NB_sharedstates.py @@ -20,8 +20,8 @@ def convert_params(mean, std): See https://mathworld.wolfram.com/NegativeBinomialDistribution.html """ - p = mean/std**2 - n = mean*p/(1.0 - p) + p = mean / std**2 + n = mean * p / (1.0 - p) return n, p @@ -31,6 +31,7 @@ def __init__(self, endog, exog, weights, exposure, seed=0, **kwds): self.weights = weights self.exposure = exposure self.seed = seed + # def nloglikeobs(self, params): nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure @@ -39,19 +40,20 @@ def nloglikeobs(self, params): llf = scipy.stats.nbinom.logpmf(self.endog, n, p) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): - self.exog_names.append('alpha') + self.exog_names.append("alpha") if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - - return super(Weighted_NegativeBinomial, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) + + return super(Weighted_NegativeBinomial, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class ConstrainedNBHMM(BaseHMM): @@ -84,18 +86,34 @@ class ConstrainedNBHMM(BaseHMM): hmmmodel.fit( X ) hmmmodel.predict( X ) """ - def __init__(self, n_components=1, shared_dispersion=False, - startprob_prior=1.0, transmat_prior=1.0, - algorithm="viterbi", random_state=None, - n_iter=10, tol=1e-2, verbose=False, - params="stma", - init_params=""): - BaseHMM.__init__(self, n_components, - startprob_prior=startprob_prior, - 
transmat_prior=transmat_prior, algorithm=algorithm, - random_state=random_state, n_iter=n_iter, - tol=tol, params=params, verbose=verbose, - init_params=init_params) + + def __init__( + self, + n_components=1, + shared_dispersion=False, + startprob_prior=1.0, + transmat_prior=1.0, + algorithm="viterbi", + random_state=None, + n_iter=10, + tol=1e-2, + verbose=False, + params="stma", + init_params="", + ): + BaseHMM.__init__( + self, + n_components, + startprob_prior=startprob_prior, + transmat_prior=transmat_prior, + algorithm=algorithm, + random_state=random_state, + n_iter=n_iter, + tol=tol, + params=params, + verbose=verbose, + init_params=init_params, + ) self.shared_dispersion = shared_dispersion # initialize CNV's effect self.log_mu = np.linspace(-0.1, 0.1, self.n_components) @@ -105,8 +123,13 @@ def __init__(self, n_components=1, shared_dispersion=False, # initialize start probability and transition probability self.startprob_ = np.ones(self.n_components) / self.n_components t = 0.9 - self.transmat_ = np.ones((self.n_components, self.n_components)) * (1-t) / (self.n_components-1) + self.transmat_ = ( + np.ones((self.n_components, self.n_components)) + * (1 - t) + / (self.n_components - 1) + ) np.fill_diagonal(self.transmat_, t) + # def _compute_log_likelihood(self, X): """ @@ -131,16 +154,21 @@ def _compute_log_likelihood(self, X): nb_std = np.sqrt(nb_mean + self.alphas[i] * nb_mean**2) # nb_std = np.sqrt(nb_mean + self.alphas[i,:].reshape(-1,1) * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_prob[:,:,i] = scipy.stats.nbinom.logpmf(X[:, :n_cells], n, p) + log_prob[:, :, i] = scipy.stats.nbinom.logpmf(X[:, :n_cells], n, p) return log_prob.mean(axis=1) + # def _initialize_sufficient_statistics(self): stats = super()._initialize_sufficient_statistics() return stats + # - def _accumulate_sufficient_statistics(self, stats, X, lattice, posteriors, fwdlattice, bwdlattice): + def _accumulate_sufficient_statistics( + self, stats, X, lattice, posteriors, fwdlattice, bwdlattice + ): super()._accumulate_sufficient_statistics( - stats, X, lattice, posteriors, fwdlattice, bwdlattice) + stats, X, lattice, posteriors, fwdlattice, bwdlattice + ) """ Update sufficient statistics from a given sample. Parameters @@ -160,62 +188,85 @@ def _accumulate_sufficient_statistics(self, stats, X, lattice, posteriors, fwdla fwdlattice, bwdlattice : array, shape (n_genes, n_components) forward and backward probabilities. 
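        Notes
        -----
        When 't' is in self.params, the per-position state occupancy is recovered
        directly from the lattices: gamma is the softmax of fwdlattice + bwdlattice
        over states at each position, and stats['denoms'][i] = sum_t gamma[t, i] is
        the expected number of positions occupied by state i. _do_mstep later uses
        these totals when re-estimating the shared diagonal of the transition matrix
        through form_transition_matrix.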
""" - if 'm' in self.params or 'a' in self.params: - stats['post'] = posteriors - stats['obs'] = X - if 't' in self.params: + if "m" in self.params or "a" in self.params: + stats["post"] = posteriors + stats["obs"] = X + if "t" in self.params: # for each ij, recover sum_t xi_ij from the inferred transition matrix bothlattice = fwdlattice + bwdlattice - loggamma = (bothlattice.T - logsumexp(bothlattice, axis = 1)).T + loggamma = (bothlattice.T - logsumexp(bothlattice, axis=1)).T # denominator for each ij is the sum of gammas over i - denoms = np.sum(np.exp(loggamma), axis = 0) + denoms = np.sum(np.exp(loggamma), axis=0) # transpose to perform row-wise multiplication - stats['denoms'] = denoms + stats["denoms"] = denoms + # def _do_mstep(self, stats): - n_genes = stats['obs'].shape[0] - n_cells = int(stats['obs'].shape[1] / 2) - base_nb_mean = stats['obs'][:, n_cells:] + n_genes = stats["obs"].shape[0] + n_cells = int(stats["obs"].shape[1] / 2) + base_nb_mean = stats["obs"][:, n_cells:] super()._do_mstep(stats) - if 'm' in self.params and 'a' in self.params: + if "m" in self.params and "a" in self.params: # NB regression fit dispersion and CNV's effect simultaneously if not self.shared_dispersion: for i in range(self.n_components): - model = Weighted_NegativeBinomial(stats['obs'][:, :n_cells].flatten(), \ - np.ones(n_genes*n_cells).reshape(-1,1), \ - weights=np.repeat(stats['post'][:,i], n_cells), exposure=base_nb_mean.flatten()) + model = Weighted_NegativeBinomial( + stats["obs"][:, :n_cells].flatten(), + np.ones(n_genes * n_cells).reshape(-1, 1), + weights=np.repeat(stats["post"][:, i], n_cells), + exposure=base_nb_mean.flatten(), + ) res = model.fit(disp=0, maxiter=500) self.log_mu[i] = res.params[0] self.alphas[i] = res.params[-1] # self.alphas[i,:] = res.params[-1] else: all_states_nb_mean = np.tile(base_nb_mean.flatten(), self.n_components) - all_states_y = np.tile(stats['obs'][:, :n_cells].flatten(), self.n_components) - all_states_weights = np.concatenate([np.repeat(stats['post'][:,i], n_cells) for i in range(self.n_components)]) - all_states_features = np.zeros((self.n_components*n_genes*n_cells, self.n_components)) + all_states_y = np.tile( + stats["obs"][:, :n_cells].flatten(), self.n_components + ) + all_states_weights = np.concatenate( + [ + np.repeat(stats["post"][:, i], n_cells) + for i in range(self.n_components) + ] + ) + all_states_features = np.zeros( + (self.n_components * n_genes * n_cells, self.n_components) + ) for i in np.arange(self.n_components): - all_states_features[(i*n_genes*n_cells):((i+1)*n_genes*n_cells), i] = 1 - model = Weighted_NegativeBinomial(all_states_y, all_states_features, weights=all_states_weights, exposure=all_states_nb_mean) + all_states_features[ + (i * n_genes * n_cells) : ((i + 1) * n_genes * n_cells), i + ] = 1 + model = Weighted_NegativeBinomial( + all_states_y, + all_states_features, + weights=all_states_weights, + exposure=all_states_nb_mean, + ) res = model.fit(disp=0, maxiter=500) self.log_mu = res.params[:-1] self.alphas[:] = res.params[-1] # self.alphas[:,:] = res.params[-1] # print(res.params) - elif 'm' in self.params: + elif "m" in self.params: # NB regression fit CNV's effect only for i in range(self.n_components): - model = sm.GLM(stats['obs'].flatten(), np.ones(self.n_genes*self.n_cells).reshape(-1,1), \ - family=sm.families.NegativeBinomial(alpha=self.alphas[i]), \ - exposure=base_nb_mean.flatten()) + model = sm.GLM( + stats["obs"].flatten(), + np.ones(self.n_genes * self.n_cells).reshape(-1, 1), + 
family=sm.families.NegativeBinomial(alpha=self.alphas[i]), + exposure=base_nb_mean.flatten(), + ) # model = sm.GLM(stats['obs'][:, :n_cells].flatten(), np.ones(n_genes*n_cells).reshape(-1,1), \ # family=sm.families.NegativeBinomial(alpha=np.repeat(self.alphas[i], n_cells)), \ # exposure=base_nb_mean.flatten(), var_weights=np.repeat(stats['post'][:,i], n_cells)) res = model.fit(disp=0, maxiter=500) self.log_mu[i] = res.params[0] - if 't' in self.params: + if "t" in self.params: # following copied from Matt's code - denoms = stats['denoms'] + denoms = stats["denoms"] x = (self.transmat_.T * denoms).T # numerator is the sum of ii elements @@ -224,17 +275,18 @@ def _do_mstep(self, stats): denom = np.sum(x) # (this is the same as sum_i gamma_i) - #assert np.isclose(denom, np.sum(denoms)) + # assert np.isclose(denom, np.sum(denoms)) + + stats["diag"] = num / denom + self.transmat_ = self.form_transition_matrix(stats["diag"]) - stats['diag'] = num / denom - self.transmat_ = self.form_transition_matrix(stats['diag']) # def form_transition_matrix(self, diag): tol = 1e-10 diag = np.clip(diag, tol, 1 - tol) - + offdiag = (1 - diag) / (self.n_components - 1) - transmat_ = np.diag([diag - offdiag] * self.n_components) + transmat_ = np.diag([diag - offdiag] * self.n_components) transmat_ += offdiag - #assert np.all(transmat_ > 0), (diag, offdiag, transmat_) - return transmat_ \ No newline at end of file + # assert np.all(transmat_ > 0), (diag, offdiag, transmat_) + return transmat_ diff --git a/src/calicost/hmm_gaussian.py b/src/calicost/hmm_gaussian.py index b053610..0570a8f 100644 --- a/src/calicost/hmm_gaussian.py +++ b/src/calicost/hmm_gaussian.py @@ -14,6 +14,7 @@ # E step related ############################################################ + @njit def np_max_ax_squeeze(arr, axis=0): assert arr.ndim == 2 @@ -34,11 +35,11 @@ def np_max_ax_keep(arr, axis=0): assert arr.ndim == 2 assert axis in [0, 1] if axis == 0: - result = np.zeros( (1, arr.shape[1]) ) + result = np.zeros((1, arr.shape[1])) for i in range(result.shape[1]): result[:, i] = np.max(arr[:, i]) else: - result = np.zeros( (arr.shape[0], 1) ) + result = np.zeros((arr.shape[0], 1)) for i in range(result.shape[0]): result[i, :] = np.max(arr[i, :]) return result @@ -64,11 +65,11 @@ def np_sum_ax_keep(arr, axis=0): assert arr.ndim == 2 assert axis in [0, 1] if axis == 0: - result = np.zeros( (1, arr.shape[1]) ) + result = np.zeros((1, arr.shape[1])) for i in range(result.shape[1]): result[:, i] = np.sum(arr[:, i]) else: - result = np.zeros( (arr.shape[0], 1) ) + result = np.zeros((arr.shape[0], 1)) for i in range(result.shape[0]): result[i, :] = np.sum(arr[i, :]) return result @@ -88,26 +89,27 @@ def np_mean_ax_squeeze(arr, axis=0): result[i] = np.mean(arr[i, :]) return result + @njit def np_mean_ax_keep(arr, axis=0): assert arr.ndim == 2 assert axis in [0, 1] if axis == 0: - result = np.zeros( (1, arr.shape[1]) ) + result = np.zeros((1, arr.shape[1])) for i in range(result.shape[1]): result[:, i] = np.mean(arr[:, i]) else: - result = np.zeros( (arr.shape[0], 1) ) + result = np.zeros((arr.shape[0], 1)) for i in range(result.shape[0]): result[i, :] = np.mean(arr[i, :]) return result -@njit +@njit def mylogsumexp(a): # get max a_max = np.max(a) - if (np.isinf(a_max)): + if np.isinf(a_max): return a_max # exponential tmp = np.exp(a - a_max) @@ -117,7 +119,7 @@ def mylogsumexp(a): return s + a_max -@njit +@njit def mylogsumexp_ax_keep(a, axis): # get max a_max = np_max_ax_keep(a, axis=axis) @@ -133,7 +135,6 @@ def mylogsumexp_ax_keep(a, axis): 
return s + a_max - def compute_emission_probability_gaussian(X, rdr_mean, rdr_std, p_mean, p_std): """ Attributes @@ -158,7 +159,7 @@ def compute_emission_probability_gaussian(X, rdr_mean, rdr_std, p_mean, p_std): p_std : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. - + Returns ---------- log_emission : array, shape (2*n_states, n_obs, n_spots) @@ -173,19 +174,27 @@ def compute_emission_probability_gaussian(X, rdr_mean, rdr_std, p_mean, p_std): for i in np.arange(n_states): for s in np.arange(n_spots): # expression from Gaussian distribution - if np.any(X[:,0,s] > 0): - log_emission[i, :, s] = scipy.stats.norm.logpdf(X[:, 0, s], loc=rdr_mean[i,s], scale=rdr_std[i,s]) + if np.any(X[:, 0, s] > 0): + log_emission[i, :, s] = scipy.stats.norm.logpdf( + X[:, 0, s], loc=rdr_mean[i, s], scale=rdr_std[i, s] + ) log_emission[i + n_states, :, s] = log_emission[i, :, s] # BAF from Gaussian distribution - if np.any(X[:,1,s] > 0): - log_emission[i, :, s] += scipy.stats.norm.logpdf(X[:,1,s], loc=p_mean[i, s], scale=p_std[i,s]) - log_emission[i + n_states, :, s] += scipy.stats.norm.logpdf(X[:,1,s], loc=1-p_mean[i, s], scale=p_std[i,s]) + if np.any(X[:, 1, s] > 0): + log_emission[i, :, s] += scipy.stats.norm.logpdf( + X[:, 1, s], loc=p_mean[i, s], scale=p_std[i, s] + ) + log_emission[i + n_states, :, s] += scipy.stats.norm.logpdf( + X[:, 1, s], loc=1 - p_mean[i, s], scale=p_std[i, s] + ) return log_emission -@njit -def forward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' +@njit +def forward_lattice_sitewise( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat +): + """ Note that n_states is the CNV states, and there are 2 * n_states of paired states for (CNV, phasing) pairs. Input lengths: sum of lengths = n_observations. @@ -195,11 +204,15 @@ def forward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat: n_observations, the log transition probability of phase switch. Output log_alpha: size 2n_states * n_observations. log alpha[j, t] = log P(o_1, ... o_t, q_t = j | lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = int(np.ceil(log_emission.shape[0] / 2)) - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" log_sitewise_self_transmat = np.log(1 - np.exp(log_sitewise_transmat)) # initialize log_alpha log_alpha = np.zeros((log_emission.shape[0], n_obs)) @@ -207,25 +220,42 @@ def forward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission, cumlen = 0 for le in lengths: # start prob - combined_log_startprob = np.log(0.5) + np.append(log_startprob,log_startprob) - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + combined_log_startprob = np.log(0.5) + np.append(log_startprob, log_startprob) + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. 
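# --- Illustrative aside (not part of the original patch) -----------------------
# Summing iid per-spot log emissions, as the recursion below does, grows linearly
# with the number of spots, so with many spots the emission term can swamp the
# phase-switch and transition terms. Averaging over spots instead (the route taken
# by ConstrainedNBHMM._compute_log_likelihood in hmm_NB_sharedstates.py via
# log_prob.mean(axis=1)) keeps the term on a per-spot scale. All arrays here are
# random stand-ins.
import numpy as np

rng = np.random.default_rng(0)
log_emission_spots = rng.normal(loc=-2.0, scale=0.5, size=(6, 100, 30))  # (2*n_states, n_obs, n_spots)
summed = log_emission_spots.sum(axis=2)     # scale grows with n_spots
averaged = log_emission_spots.mean(axis=2)  # stays on a per-spot scale
print(np.abs(summed).mean() / np.abs(averaged).mean())  # roughly n_spots (= 30) times larger
# --------------------------------------------------------------------------------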
- log_alpha[:, cumlen] = combined_log_startprob + np_sum_ax_squeeze(log_emission[:, cumlen, :], axis=1) + log_alpha[:, cumlen] = combined_log_startprob + np_sum_ax_squeeze( + log_emission[:, cumlen, :], axis=1 + ) for t in np.arange(1, le): - phases_switch_mat = np.array([[log_sitewise_self_transmat[cumlen + t-1], log_sitewise_transmat[cumlen + t-1]], [log_sitewise_transmat[cumlen + t-1], log_sitewise_self_transmat[cumlen + t-1] ]]) - combined_transmat = np.kron( np.exp(phases_switch_mat), np.exp(log_transmat) ) + phases_switch_mat = np.array( + [ + [ + log_sitewise_self_transmat[cumlen + t - 1], + log_sitewise_transmat[cumlen + t - 1], + ], + [ + log_sitewise_transmat[cumlen + t - 1], + log_sitewise_self_transmat[cumlen + t - 1], + ], + ] + ) + combined_transmat = np.kron(np.exp(phases_switch_mat), np.exp(log_transmat)) combined_transmat = np.log(combined_transmat) for j in np.arange(log_emission.shape[0]): for i in np.arange(log_emission.shape[0]): buf[i] = log_alpha[i, (cumlen + t - 1)] + combined_transmat[i, j] - log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum(log_emission[j, (cumlen + t), :]) + log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum( + log_emission[j, (cumlen + t), :] + ) cumlen += le return log_alpha -@njit -def backward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' +@njit +def backward_lattice_sitewise( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat +): + """ Note that n_states is the CNV states, and there are 2 * n_states of paired states for (CNV, phasing) pairs. Input X: size n_observations * n_components * n_spots. @@ -236,11 +266,15 @@ def backward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission log_sitewise_transmat: n_observations, the log transition probability of phase switch. Output log_beta: size 2*n_states * n_observations. log beta[i, t] = log P(o_{t+1}, ..., o_T | q_t = i, lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = int(np.ceil(log_emission.shape[0] / 2)) - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" log_sitewise_self_transmat = np.log(1 - np.exp(log_sitewise_transmat)) # initialize log_beta log_beta = np.zeros((log_emission.shape[0], n_obs)) @@ -248,29 +282,44 @@ def backward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission cumlen = 0 for le in lengths: # start prob - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. 
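# --- Illustrative aside (not part of the original patch) -----------------------
# The recursion below combines phasing and CNV state into one transition matrix
# via a Kronecker product: a 2x2 phase-switch matrix with the n_states x n_states
# CNV transition matrix, i.e. phase and CNV state are assumed to switch
# independently between adjacent bins. A quick check that the combined matrix has
# the expected shape and stays row-stochastic (all numbers here are made up):
import numpy as np

n_states = 3
t = 0.9
cnv_transmat = np.full((n_states, n_states), (1 - t) / (n_states - 1))
np.fill_diagonal(cnv_transmat, t)
p_switch = 0.05  # hypothetical phase-switch probability at one bin boundary
phase_switch_mat = np.array([[1 - p_switch, p_switch],
                             [p_switch, 1 - p_switch]])
combined = np.kron(phase_switch_mat, cnv_transmat)
assert combined.shape == (2 * n_states, 2 * n_states)
assert np.allclose(combined.sum(axis=1), 1.0)
# --------------------------------------------------------------------------------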
log_beta[:, (cumlen + le - 1)] = 0 - for t in np.arange(le-2, -1, -1): - phases_switch_mat = np.array([[log_sitewise_self_transmat[cumlen + t], log_sitewise_transmat[cumlen + t]], [log_sitewise_transmat[cumlen + t], log_sitewise_self_transmat[cumlen + t] ]]) - combined_transmat = np.kron( np.exp(phases_switch_mat), np.exp(log_transmat) ) + for t in np.arange(le - 2, -1, -1): + phases_switch_mat = np.array( + [ + [ + log_sitewise_self_transmat[cumlen + t], + log_sitewise_transmat[cumlen + t], + ], + [ + log_sitewise_transmat[cumlen + t], + log_sitewise_self_transmat[cumlen + t], + ], + ] + ) + combined_transmat = np.kron(np.exp(phases_switch_mat), np.exp(log_transmat)) combined_transmat = np.log(combined_transmat) for i in np.arange(log_emission.shape[0]): for j in np.arange(log_emission.shape[0]): - buf[j] = log_beta[j, (cumlen + t + 1)] + combined_transmat[i, j] + np.sum(log_emission[j, (cumlen + t + 1), :]) + buf[j] = ( + log_beta[j, (cumlen + t + 1)] + + combined_transmat[i, j] + + np.sum(log_emission[j, (cumlen + t + 1), :]) + ) log_beta[i, (cumlen + t)] = mylogsumexp(buf) cumlen += le return log_beta def compute_posterior_obs(log_alpha, log_beta): - ''' + """ Input log_alpha: output from forward_lattice_gaussian. size n_states * n_observations. alpha[j, t] = P(o_1, ... o_t, q_t = j | lambda). log_beta: output from backward_lattice_gaussian. size n_states * n_observations. beta[i, t] = P(o_{t+1}, ..., o_T | q_t = i, lambda). Output: log_gamma: size n_states * n_observations. gamma[i,t] = P(q_t = i | O, lambda). gamma[i, t] propto alpha[i,t] * beta[i,t] - ''' + """ n_states = log_alpha.shape[0] n_obs = log_alpha.shape[1] # initial log_gamma @@ -280,15 +329,17 @@ def compute_posterior_obs(log_alpha, log_beta): # for t in np.arange(n_obs): # log_gamma[j, t] = log_alpha[j, t] + log_beta[j, t] log_gamma = log_alpha + log_beta - if np.any( np.sum(log_gamma, axis=0) == 0 ): + if np.any(np.sum(log_gamma, axis=0) == 0): raise Exception("Sum of posterior probability is zero for some observations!") log_gamma -= scipy.special.logsumexp(log_gamma, axis=0) return log_gamma @njit -def compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log_emission): - ''' +def compute_posterior_transition_sitewise( + log_alpha, log_beta, log_transmat, log_emission +): + """ Input log_alpha: output from forward_lattice_gaussian. size n_states * n_observations. alpha[j, t] = P(o_1, ... o_t, q_t = j | lambda). log_beta: output from backward_lattice_gaussian. size n_states * n_observations. beta[i, t] = P(o_{t+1}, ..., o_T | q_t = i, lambda). @@ -296,20 +347,28 @@ def compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log log_emission: n_states * n_observations * n_spots. Log probability. Output: log_xi: size n_states * n_states * (n_observations-1). xi[i,j,t] = P(q_t=i, q_{t+1}=j | O, lambda) - ''' + """ n_states = int(log_alpha.shape[0] / 2) n_obs = log_alpha.shape[1] # initialize log_xi - log_xi = np.zeros((2*n_states, 2*n_states, n_obs-1)) + log_xi = np.zeros((2 * n_states, 2 * n_states, n_obs - 1)) # compute log_xi - for i in np.arange(2*n_states): - for j in np.arange(2*n_states): - for t in np.arange(n_obs-1): - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + for i in np.arange(2 * n_states): + for j in np.arange(2 * n_states): + for t in np.arange(n_obs - 1): + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. 
# But adding too many spots may lead to a higher weight of the emission rather then transition prob. - log_xi[i, j, t] = log_alpha[i, t] + log_transmat[i - n_states * int(i/n_states), j - n_states * int(j/n_states)] + np.sum(log_emission[j, t+1, :]) + log_beta[j, t+1] + log_xi[i, j, t] = ( + log_alpha[i, t] + + log_transmat[ + i - n_states * int(i / n_states), + j - n_states * int(j / n_states), + ] + + np.sum(log_emission[j, t + 1, :]) + + log_beta[j, t + 1] + ) # normalize - for t in np.arange(n_obs-1): + for t in np.arange(n_obs - 1): log_xi[:, :, t] -= mylogsumexp(log_xi[:, :, t]) return log_xi @@ -319,16 +378,18 @@ def compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log ############################################################ @njit def update_startprob_sitewise(lengths, log_gamma): - ''' + """ Input lengths: sum of lengths = n_observations. log_gamma: size 2 * n_states * n_observations. gamma[i,t] = P(q_t = i | O, lambda). Output log_startprob: n_states. Start probability after loog transformation. - ''' + """ n_states = int(log_gamma.shape[0] / 2) n_obs = log_gamma.shape[1] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the second dimension of log_gamma!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the second dimension of log_gamma!" # indices of the start of sequences, given that the length of each sequence is in lengths cumlen = 0 indices_start = [] @@ -341,7 +402,7 @@ def update_startprob_sitewise(lengths, log_gamma): # compute log_startprob of 2 * n_states log_startprob = mylogsumexp_ax_keep(log_gamma[:, indices_start], axis=1) # merge (CNV state, phase A) and (CNV state, phase B) - log_startprob = log_startprob.flatten().reshape(2,-1) + log_startprob = log_startprob.flatten().reshape(2, -1) log_startprob = mylogsumexp_ax_keep(log_startprob, axis=0) # normalize such that startprob sums to 1 log_startprob -= mylogsumexp(log_startprob) @@ -349,20 +410,28 @@ def update_startprob_sitewise(lengths, log_gamma): def update_transition_sitewise(log_xi, is_diag=False): - ''' + """ Input log_xi: size (2*n_states) * (2*n_states) * n_observations. xi[i,j,t] = P(q_t=i, q_{t+1}=j | O, lambda) Output log_transmat: n_states * n_states. Transition probability after log transformation. 
- ''' + """ n_states = int(log_xi.shape[0] / 2) n_obs = log_xi.shape[2] # initialize log_transmat log_transmat = np.zeros((n_states, n_states)) for i in np.arange(n_states): for j in np.arange(n_states): - log_transmat[i, j] = scipy.special.logsumexp( np.concatenate([log_xi[i, j, :], log_xi[i+n_states, j, :], \ - log_xi[i, j+n_states, :], log_xi[i + n_states, j + n_states, :]]) ) + log_transmat[i, j] = scipy.special.logsumexp( + np.concatenate( + [ + log_xi[i, j, :], + log_xi[i + n_states, j, :], + log_xi[i, j + n_states, :], + log_xi[i + n_states, j + n_states, :], + ] + ) + ) # row normalize log_transmat if not is_diag: for i in np.arange(n_states): @@ -372,7 +441,7 @@ def update_transition_sitewise(log_xi, is_diag=False): diagsum = scipy.special.logsumexp(np.diag(log_transmat)) totalsum = scipy.special.logsumexp(log_transmat) t = diagsum - totalsum - rest = np.log( (1 - np.exp(t)) / (n_states-1) ) + rest = np.log((1 - np.exp(t)) / (n_states - 1)) log_transmat = np.ones(log_transmat.shape) * rest np.fill_diagonal(log_transmat, t) return log_transmat @@ -384,7 +453,7 @@ def weighted_gaussian_fitting(x, weights): weights : 1d array """ mu = weights.dot(x) / np.sum(weights) - v = weights.dot( np.square(x - mu) ) / np.sum(weights) + v = weights.dot(np.square(x - mu)) / np.sum(weights) std = np.sqrt(v) return mu, std @@ -399,15 +468,16 @@ def weighted_gaussian_fitting_sharestd(X, Weights): mus = np.zeros(n_clusters) ssr = np.zeros(X.shape) for i in range(n_clusters): - mus[i] = Weights[:,i].dot(X[:,i]) / np.sum(Weights[:,i]) - ssr[:,i] = np.square(X[:,i] - mus[i]) + mus[i] = Weights[:, i].dot(X[:, i]) / np.sum(Weights[:, i]) + ssr[:, i] = np.square(X[:, i] - mus[i]) v = Weights.flatten().dot(ssr.flatten()) / np.sum(Weights) stds = np.ones(n_clusters) * np.sqrt(v) return mus, stds -def update_emission_params_rdr_sitewise(X_rdr, log_gamma, rdr_std, \ - start_rdr_mean=None, shared_rdr_std=False): +def update_emission_params_rdr_sitewise( + X_rdr, log_gamma, rdr_std, start_rdr_mean=None, shared_rdr_std=False +): """ Attributes ---------- @@ -423,26 +493,41 @@ def update_emission_params_rdr_sitewise(X_rdr, log_gamma, rdr_std, \ n_spots = X_rdr.shape[1] n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) - new_rdr_mean = copy.copy(start_rdr_mean) if not start_rdr_mean is None else np.ones((n_states, n_spots)) + new_rdr_mean = ( + copy.copy(start_rdr_mean) + if not start_rdr_mean is None + else np.ones((n_states, n_spots)) + ) new_rdr_std = copy.copy(rdr_std) # expression signal by NB distribution if not shared_rdr_std: for s in range(n_spots): for i in range(n_states): - mu, std = weighted_gaussian_fitting( X_rdr[:,s], gamma[i,:]+gamma[i+n_states,:] ) + mu, std = weighted_gaussian_fitting( + X_rdr[:, s], gamma[i, :] + gamma[i + n_states, :] + ) new_rdr_mean[i, s] = mu - new_rdr_std[i,s] = std + new_rdr_std[i, s] = std else: for s in range(n_spots): - mus, stds = weighted_gaussian_fitting_sharestd( np.vstack([ X_rdr[:,s] for i in range(n_states) ]).T, \ - (gamma[:n_states, :] + gamma[n_states:, :]).T ) - new_rdr_mean[:,s] = mus - new_rdr_std[:,s] = stds + mus, stds = weighted_gaussian_fitting_sharestd( + np.vstack([X_rdr[:, s] for i in range(n_states)]).T, + (gamma[:n_states, :] + gamma[n_states:, :]).T, + ) + new_rdr_mean[:, s] = mus + new_rdr_std[:, s] = stds return new_rdr_mean, new_rdr_std -def update_emission_params_baf_sitewise(X_baf, log_gamma, p_std, \ - start_p_mean=None, shared_p_std=False, min_binom_prob=0.01, max_binom_prob=0.99): +def update_emission_params_baf_sitewise( + 
X_baf, + log_gamma, + p_std, + start_p_mean=None, + shared_p_std=False, + min_binom_prob=0.01, + max_binom_prob=0.99, +): """ Attributes ---------- @@ -459,23 +544,32 @@ def update_emission_params_baf_sitewise(X_baf, log_gamma, p_std, \ n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) # initialization - new_p_mean = copy.copy(start_p_mean) if not start_p_mean is None else np.ones((n_states, n_spots)) * 0.5 + new_p_mean = ( + copy.copy(start_p_mean) + if not start_p_mean is None + else np.ones((n_states, n_spots)) * 0.5 + ) new_p_std = copy.copy(p_std) if not shared_p_std: for s in np.arange(X_baf.shape[1]): for i in range(n_states): - mu, std = weighted_gaussian_fitting( np.append(X_baf[:,s], 1-X_baf[:,s]), np.append(gamma[i,:], gamma[i+n_states,:]) ) + mu, std = weighted_gaussian_fitting( + np.append(X_baf[:, s], 1 - X_baf[:, s]), + np.append(gamma[i, :], gamma[i + n_states, :]), + ) new_p_mean[i, s] = mu new_p_std[i, s] = std else: for s in np.arange(X_baf.shape[1]): - concat_X_baf = np.append(X_baf[:,s], 1-X_baf[:,s]) - concat_gamma = np.hstack([gamma[:n_states,:], gamma[n_states:, :]]) - mus, stds = weighted_gaussian_fitting_sharestd( np.vstack([ concat_X_baf for i in range(n_states) ]).T, concat_gamma.T) - new_p_mean[:,s] = mus - new_p_std[:,s] = stds - new_p_mean[new_p_mean[:,s] < min_binom_prob, s] = min_binom_prob - new_p_mean[new_p_mean[:,s] > max_binom_prob, s] = max_binom_prob + concat_X_baf = np.append(X_baf[:, s], 1 - X_baf[:, s]) + concat_gamma = np.hstack([gamma[:n_states, :], gamma[n_states:, :]]) + mus, stds = weighted_gaussian_fitting_sharestd( + np.vstack([concat_X_baf for i in range(n_states)]).T, concat_gamma.T + ) + new_p_mean[:, s] = mus + new_p_std[:, s] = stds + new_p_mean[new_p_mean[:, s] < min_binom_prob, s] = min_binom_prob + new_p_mean[new_p_mean[:, s] > max_binom_prob, s] = max_binom_prob return new_p_mean, new_p_std @@ -483,8 +577,9 @@ def update_emission_params_baf_sitewise(X_baf, log_gamma, p_std, \ # whole inference ############################################################ + class hmm_gaussian_sitewise(object): - def __init__(self, params="stmp", t=1-1e-4): + def __init__(self, params="stmp", t=1 - 1e-4): """ Attributes ---------- @@ -496,11 +591,25 @@ def __init__(self, params="stmp", t=1-1e-4): """ self.params = params self.t = t + # - def run_baum_welch_nb_bb_sitewise(self, X, lengths, n_states, log_sitewise_transmat, \ - shared_rdr_std=False, shared_p_std=False, \ - is_diag=False, init_rdr_mean=None, init_p_mean=None, init_rdr_std=None, init_p_std=None, max_iter=100, tol=1e-4): - ''' + def run_baum_welch_nb_bb_sitewise( + self, + X, + lengths, + n_states, + log_sitewise_transmat, + shared_rdr_std=False, + shared_p_std=False, + is_diag=False, + init_rdr_mean=None, + init_p_mean=None, + init_rdr_std=None, + init_p_std=None, + max_iter=100, + tol=1e-4, + ): + """ Input X: size n_observations * n_components * n_spots. lengths: sum of lengths = n_observations. @@ -509,33 +618,59 @@ def run_baum_welch_nb_bb_sitewise(self, X, lengths, n_states, log_sitewise_trans Intermediate log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. alpha: size of n_states. Dispersioon parameter of each HMM state. 
- ''' + """ n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 # initialize NB logmean shift and BetaBinom prob - rdr_mean = np.vstack([np.linspace(0.5, 3, n_states) for r in range(n_spots)]).T if init_rdr_mean is None else init_rdr_mean - p_mean = np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_mean is None else init_p_mean + rdr_mean = ( + np.vstack([np.linspace(0.5, 3, n_states) for r in range(n_spots)]).T + if init_rdr_mean is None + else init_rdr_mean + ) + p_mean = ( + np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T + if init_p_mean is None + else init_p_mean + ) # initialize (inverse of) dispersion param in NB and BetaBinom - rdr_std = 0.5 * np.ones((n_states, n_spots)) if init_rdr_std is None else init_rdr_std + rdr_std = ( + 0.5 * np.ones((n_states, n_spots)) if init_rdr_std is None else init_rdr_std + ) p_std = 0.1 * np.ones((n_states, n_spots)) if init_p_std is None else init_p_std # initialize start probability and emission probability - log_startprob = np.log( np.ones(n_states) / n_states ) + log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: - transmat = np.ones((n_states, n_states)) * (1-self.t) / (n_states-1) + transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: - log_transmat = np.zeros((1,1)) + log_transmat = np.zeros((1, 1)) # EM algorithm for r in trange(max_iter): # E step - log_emission = compute_emission_probability_gaussian(X, rdr_mean, rdr_std, p_mean, p_std) - log_alpha = forward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) - log_beta = backward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_emission = compute_emission_probability_gaussian( + X, rdr_mean, rdr_std, p_mean, p_std + ) + log_alpha = forward_lattice_sitewise( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) + log_beta = backward_lattice_sitewise( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log_emission) + log_xi = compute_posterior_transition_sitewise( + log_alpha, log_beta, log_transmat, log_emission + ) # M step if "s" in self.params: new_log_startprob = update_startprob_sitewise(lengths, log_gamma) @@ -547,24 +682,40 @@ def run_baum_welch_nb_bb_sitewise(self, X, lengths, n_states, log_sitewise_trans else: new_log_transmat = log_transmat if "m" in self.params: - new_rdr_mean, new_rdr_std = update_emission_params_rdr_sitewise(X[:,0,:], log_gamma, rdr_std, start_rdr_mean=rdr_mean, shared_rdr_std=shared_rdr_std) + new_rdr_mean, new_rdr_std = update_emission_params_rdr_sitewise( + X[:, 0, :], + log_gamma, + rdr_std, + start_rdr_mean=rdr_mean, + shared_rdr_std=shared_rdr_std, + ) else: new_rdr_mean = rdr_mean new_rdr_std = rdr_std if "p" in self.params: - new_p_mean, new_p_std = update_emission_params_baf_sitewise(X[:,1,:], log_gamma, p_std, start_p_mean=p_mean, \ - shared_p_std=shared_p_std) + new_p_mean, new_p_std = update_emission_params_baf_sitewise( + X[:, 1, :], + log_gamma, + p_std, + start_p_mean=p_mean, + shared_p_std=shared_p_std, + ) else: new_p_mean = p_mean new_p_std = p_std # check convergence - print( np.mean(np.abs( np.exp(new_log_startprob) - np.exp(log_startprob) )), \ - 
np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )), \ - np.mean(np.abs(new_rdr_mean - rdr_mean)),\ - np.mean(np.abs(new_p_mean - p_mean)) ) - print( np.hstack([new_rdr_mean, new_p_mean]) ) - if np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )) < tol and \ - np.mean(np.abs(new_rdr_mean - rdr_mean)) < tol and np.mean(np.abs(new_p_mean - p_mean)) < tol: + print( + np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), + np.mean(np.abs(new_rdr_mean - rdr_mean)), + np.mean(np.abs(new_p_mean - p_mean)), + ) + print(np.hstack([new_rdr_mean, new_p_mean])) + if ( + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol + and np.mean(np.abs(new_rdr_mean - rdr_mean)) < tol + and np.mean(np.abs(new_p_mean - p_mean)) < tol + ): break log_startprob = new_log_startprob log_transmat = new_log_transmat @@ -572,7 +723,14 @@ def run_baum_welch_nb_bb_sitewise(self, X, lengths, n_states, log_sitewise_trans rdr_std = new_rdr_std p_mean = new_p_mean p_std = new_p_std - return new_rdr_mean, new_rdr_std, new_p_mean, new_p_std, new_log_startprob, new_log_transmat + return ( + new_rdr_mean, + new_rdr_std, + new_p_mean, + new_p_std, + new_log_startprob, + new_log_transmat, + ) # def posterior_nb_bb_sitewise(X, lengths, rdr_mean, rdr_std, p_mean, p_std, log_startprob, log_transmat, log_sitewise_transmat): @@ -705,7 +863,7 @@ def run_baum_welch_nb_bb_sitewise(self, X, lengths, n_states, log_sitewise_trans # log_v[i, t] = np.max(tmp) # # backtracking to get the sequence # chr_labels = [ np.argmax(log_v[:,-1]) ] - + # if cumlen == 0: # for t2 in np.arange(le-1, 0, -1): # chr_labels.append( int(bt[chr_labels[-1],t2])) @@ -717,51 +875,57 @@ def run_baum_welch_nb_bb_sitewise(self, X, lengths, n_states, log_sitewise_trans # # merge two phases # chr_merged_labels = copy.copy(chr_labels) # chr_merged_labels[chr_merged_labels >= n_states] = chr_merged_labels[chr_merged_labels >= n_states] - n_states - + # if cumlen == 0: # labels = chr_labels # merged_labels = chr_merged_labels # else: # labels = np.append(labels, chr_labels) # merged_labels = np.append(merged_labels, chr_merged_labels) - + # cumlen += le # return labels, merged_labels from sklearn.mixture import GaussianMixture -def initialization_gaussianhmm_by_gmm(n_states, X, params, random_state=None, min_binom_prob=0.1, max_binom_prob=0.9): + + +def initialization_gaussianhmm_by_gmm( + n_states, X, params, random_state=None, min_binom_prob=0.1, max_binom_prob=0.9 +): # prepare gmm input of RDR and BAF separately X_gmm_rdr = None X_gmm_baf = None if "m" in params: - X_gmm_rdr = np.vstack([ X[:,0,s] for s in range(X.shape[2]) ]).T + X_gmm_rdr = np.vstack([X[:, 0, s] for s in range(X.shape[2])]).T if "p" in params: - X_gmm_baf = np.vstack([ X[:,1,s] for s in range(X.shape[2]) ]).T + X_gmm_baf = np.vstack([X[:, 1, s] for s in range(X.shape[2])]).T X_gmm_baf[X_gmm_baf < min_binom_prob] = min_binom_prob X_gmm_baf[X_gmm_baf > max_binom_prob] = max_binom_prob # combine RDR and BAF if ("m" in params) and ("p" in params): - indexes = np.where(X_gmm_baf[:,0] > 0.5)[0] - X_gmm_baf[indexes,:] = 1 - X_gmm_baf[indexes,:] + indexes = np.where(X_gmm_baf[:, 0] > 0.5)[0] + X_gmm_baf[indexes, :] = 1 - X_gmm_baf[indexes, :] X_gmm = np.hstack([X_gmm_rdr, X_gmm_baf]) elif "m" in params: X_gmm = X_gmm_rdr elif "p" in params: - indexes = np.where(X_gmm_baf[:,0] > 0.5)[0] - X_gmm_baf[indexes,:] = 1 - X_gmm_baf[indexes,:] + indexes = np.where(X_gmm_baf[:, 0] > 0.5)[0] + 
X_gmm_baf[indexes, :] = 1 - X_gmm_baf[indexes, :] X_gmm = X_gmm_baf assert not np.any(np.isnan(X_gmm)) # run GMM if random_state is None: gmm = GaussianMixture(n_components=n_states, max_iter=1).fit(X_gmm) else: - gmm = GaussianMixture(n_components=n_states, max_iter=1, random_state=random_state).fit(X_gmm) + gmm = GaussianMixture( + n_components=n_states, max_iter=1, random_state=random_state + ).fit(X_gmm) # turn gmm fitted parameters to HMM rdr_mean and p_mean parameters if ("m" in params) and ("p" in params): - gmm_rdr_mean = gmm.means_[:,:X.shape[2]] - gmm_p_mean = gmm.means_[:, X.shape[2]:] + gmm_rdr_mean = gmm.means_[:, : X.shape[2]] + gmm_p_mean = gmm.means_[:, X.shape[2] :] elif "m" in params: gmm_rdr_mean = gmm.means_ gmm_p_mean = None @@ -771,37 +935,97 @@ def initialization_gaussianhmm_by_gmm(n_states, X, params, random_state=None, mi return gmm_rdr_mean, gmm_p_mean -def pipeline_gaussian_baum_welch(X, lengths, n_states, log_sitewise_transmat, params="smp", t=1-1e-6, random_state=0, \ - shared_rdr_std=True, shared_p_std=True, init_rdr_mean=None, init_p_mean=None, init_rdr_std=None, init_p_std=None, \ - is_diag=True, max_iter=100, tol=1e-4): +def pipeline_gaussian_baum_welch( + X, + lengths, + n_states, + log_sitewise_transmat, + params="smp", + t=1 - 1e-6, + random_state=0, + shared_rdr_std=True, + shared_p_std=True, + init_rdr_mean=None, + init_p_mean=None, + init_rdr_std=None, + init_p_std=None, + is_diag=True, + max_iter=100, + tol=1e-4, +): # initialization n_spots = X.shape[2] - if ((init_rdr_mean is None) and ("m" in params)) or ((init_p_mean is None) and ("p" in params)): - tmp_rdr_mean, tmp_p_mean = initialization_gaussianhmm_by_gmm(n_states, X, params, random_state=random_state) + if ((init_rdr_mean is None) and ("m" in params)) or ( + (init_p_mean is None) and ("p" in params) + ): + tmp_rdr_mean, tmp_p_mean = initialization_gaussianhmm_by_gmm( + n_states, X, params, random_state=random_state + ) if (init_rdr_mean is None) and ("m" in params): init_rdr_mean = tmp_rdr_mean if (init_p_mean is None) and ("p" in params): init_p_mean = tmp_p_mean print(f"init_log_mu = {init_rdr_mean}") print(f"init_p_mean = {init_p_mean}") - + # fit HMM-NB-BetaBinom hmmmodel = hmm_gaussian_sitewise(params=params, t=t) - new_rdr_mean, new_rdr_std, new_p_mean, new_p_std, new_log_startprob, new_log_transmat = hmmmodel.run_baum_welch_nb_bb_sitewise(X, lengths, \ - n_states, log_sitewise_transmat, shared_rdr_std=shared_rdr_std, shared_p_std=shared_p_std, is_diag=is_diag, \ - init_rdr_mean=init_rdr_mean, init_p_mean=init_p_mean, init_rdr_std=init_rdr_std, init_p_std=init_p_std, max_iter=max_iter, tol=tol) - + ( + new_rdr_mean, + new_rdr_std, + new_p_mean, + new_p_std, + new_log_startprob, + new_log_transmat, + ) = hmmmodel.run_baum_welch_nb_bb_sitewise( + X, + lengths, + n_states, + log_sitewise_transmat, + shared_rdr_std=shared_rdr_std, + shared_p_std=shared_p_std, + is_diag=is_diag, + init_rdr_mean=init_rdr_mean, + init_p_mean=init_p_mean, + init_rdr_std=init_rdr_std, + init_p_std=init_p_std, + max_iter=max_iter, + tol=tol, + ) + # likelihood, posterior and prediction - log_emission = compute_emission_probability_gaussian(X, new_rdr_mean, new_rdr_std, new_p_mean, new_p_std) - log_alpha = forward_lattice_sitewise(lengths, new_log_transmat, new_log_startprob, log_emission, log_sitewise_transmat) - log_beta = backward_lattice_sitewise(lengths, new_log_transmat, new_log_startprob, log_emission, log_sitewise_transmat) + log_emission = compute_emission_probability_gaussian( + X, new_rdr_mean, 
new_rdr_std, new_p_mean, new_p_std + ) + log_alpha = forward_lattice_sitewise( + lengths, + new_log_transmat, + new_log_startprob, + log_emission, + log_sitewise_transmat, + ) + log_beta = backward_lattice_sitewise( + lengths, + new_log_transmat, + new_log_startprob, + log_emission, + log_sitewise_transmat, + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) pred = np.argmax(log_gamma, axis=0) pred_cnv = pred % n_states - llf = np.sum(scipy.special.logsumexp(log_alpha[:,np.cumsum(lengths)-1], axis=0)) + llf = np.sum(scipy.special.logsumexp(log_alpha[:, np.cumsum(lengths) - 1], axis=0)) # save results - res = {"new_rdr_mean":new_rdr_mean, "new_rdr_std":new_rdr_std, "new_p_mean":new_p_mean, "new_p_std":new_p_std, \ - "new_log_startprob":new_log_startprob, "new_log_transmat":new_log_transmat, "log_gamma":log_gamma, "pred_cnv":pred_cnv, "llf":llf} + res = { + "new_rdr_mean": new_rdr_mean, + "new_rdr_std": new_rdr_std, + "new_p_mean": new_p_mean, + "new_p_std": new_p_std, + "new_log_startprob": new_log_startprob, + "new_log_transmat": new_log_transmat, + "log_gamma": log_gamma, + "pred_cnv": pred_cnv, + "llf": llf, + } return res - diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index e8e862f..ccc8f0c 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -26,7 +26,21 @@ # Pure clone ############################################################ -def hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, res, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): + +def hmrf_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): """ Choosing clones by Iterated Conditional Modes (Forward-backward version): for which the emission probability is given by the posterior probability of all HMM states at each bin. 
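# --- Illustrative aside (not part of the original patch) -----------------------
# A minimal sketch of one Iterated Conditional Modes update for a single spot i,
# mirroring the w_node / w_edge logic in the reassignment functions of this file.
# node_llf stands in for single_llf[i, :], and neighbors / edge_weights for the
# nonzero entries of adjacency_mat[i, :]; these names are hypothetical.
import numpy as np

def icm_update(node_llf, log_persample_weight, neighbors, edge_weights, assignment, spatial_weight):
    n_clones = node_llf.shape[0]
    w_node = node_llf + log_persample_weight   # emission term + per-sample clone weight
    w_edge = np.zeros(n_clones)
    for j, a_ij in zip(neighbors, edge_weights):
        if assignment[j] >= 0:                 # neighbors vote for their current clone
            w_edge[assignment[j]] += a_ij
    return int(np.argmax(w_node + spatial_weight * w_edge))

# Two neighbors currently in clone 2 pull spot i to clone 2 even though clone 1
# has the best node potential:
print(icm_update(np.array([-10.0, -9.0, -9.5]), np.zeros(3),
                 neighbors=[0, 1], edge_weights=[1.0, 1.0],
                 assignment=np.array([2, 2, -1]), spatial_weight=2.0))  # -> 2
# --------------------------------------------------------------------------------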
@@ -40,47 +54,103 @@ def hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_R n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] n_states = res["new_p_binom"].shape[0] - single_llf = np.zeros((N, n_clones)) # node potential + single_llf = np.zeros((N, n_clones)) # node potential new_assignment = copy.copy(prev_assignment) # posterior = np.zeros((N, n_clones)) for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] for c in range(n_clones): - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"][:,c:(c+1)], res["new_alphas"][:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"][:,c:(c+1)], res["new_taus"][:,c:(c+1)]) - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,i:(i+1)] > 0) / np.sum(single_base_nb_mean[:,i:(i+1)] > 0) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"][:, c : (c + 1)], + res["new_alphas"][:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"][:, c : (c + 1)], + res["new_taus"][:, c : (c + 1)], + ) + ) + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) + / np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + single_llf[i, c] = ratio_nonzeros * np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) else: - single_llf[i,c] = np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) - - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: if new_assignment[j] >= 0: - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + 
spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, res, pred, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): +def aggr_hmrf_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + pred, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): """ Choosing clones by Iterated Conditional Modes (Viterbi version): for which the emission probability of each spot is a single of HMM state sequence. @@ -97,40 +167,81 @@ def aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, re posterior = np.zeros((N, n_clones)) for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] # idx = np.append(idx, np.array([i])) for c in range(n_clones): - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"][:,c:(c+1)], res["new_alphas"][:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"][:,c:(c+1)], res["new_taus"][:,c:(c+1)]) - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,idx] > 0) / np.sum(single_base_nb_mean[:,idx] > 0) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"][:, c : (c + 1)], + res["new_alphas"][:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"][:, c : (c + 1)], + res["new_taus"][:, c : (c + 1)], + ) + ) + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, idx] > 0) + / np.sum(single_base_nb_mean[:, idx] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) + single_llf[i, c] = ratio_nonzeros * np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0]) else: - single_llf[i,c] = np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) - - w_node = single_llf[i,:] - w_node += 
log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0]) + + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: if new_assignment[j] >= 0: - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def hmrf_reassignment_posterior_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, res, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): +def hmrf_reassignment_posterior_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): """ Input format assumption: the RDR/BAF vector is shared across all clones <- using only BAF signals, or running for each initial clone """ @@ -144,39 +255,99 @@ def hmrf_reassignment_posterior_concatenate(single_X, single_base_nb_mean, singl posterior = np.zeros((N, n_clones)) for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"], res["new_alphas"], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"], res["new_taus"]) + idx = smooth_mat[i, :].nonzero()[1] + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"], + res["new_alphas"], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"], + res["new_taus"], + ) + ) for c in range(n_clones): - if np.sum(single_base_nb_mean[:,i:(i+1)] > 0) > 0 and np.sum(single_total_bb_RD[:,i:(i+1)] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,i:(i+1)] > 0) / np.sum(single_base_nb_mean[:,i:(i+1)] > 0) + if ( + np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) > 0 + and np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) + / np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) + ) # ratio_nonzeros = 1.0 
* np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + single_llf[i, c] = ratio_nonzeros * np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) else: - single_llf[i,c] = np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + for j in adjacency_mat[i, :].nonzero()[1]: + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def aggr_hmrf_reassignment_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, res, pred, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): +def aggr_hmrf_reassignment_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + pred, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): """ HMRF assign spots to tumor clones. 
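# --- Illustrative aside (not part of the original patch) -----------------------
# The "total log likelihood log P(X | Z) + log P(Z)" these functions report is,
# up to constants, the sum of each spot's node potential under its assigned clone
# plus spatial_weight times the number of agreeing neighbor pairs (with a
# symmetric adjacency matrix each pair is counted once per direction). A compact
# re-statement with toy inputs:
import numpy as np
import scipy.sparse

def hmrf_objective(single_llf, adjacency_mat, assignment, spatial_weight):
    N = single_llf.shape[0]
    total = np.sum(single_llf[np.arange(N), assignment])
    for i in range(N):
        nbrs = adjacency_mat[i, :].nonzero()[1]
        total += spatial_weight * np.sum(assignment[nbrs] == assignment[i])
    return total

adj = scipy.sparse.csr_matrix(np.array([[0, 1], [1, 0]]))
# two spots in the same clone: node terms (-1.0 - 1.5) plus 2 * spatial_weight
print(hmrf_objective(np.array([[-1.0, -2.0], [-1.5, -0.5]]), adj,
                     np.array([0, 0]), spatial_weight=2.0))  # -> 1.5
# --------------------------------------------------------------------------------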
@@ -230,43 +401,79 @@ def aggr_hmrf_reassignment_concatenate(single_X, single_base_nb_mean, single_tot posterior = np.zeros((N, n_clones)) for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] # idx = np.append(idx, np.array([i])) - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"], res["new_alphas"], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"], res["new_taus"]) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"], + res["new_alphas"], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"], + res["new_taus"], + ) + ) for c in range(n_clones): - this_pred = pred[(c*n_obs):(c*n_obs+n_obs)] - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,idx] > 0) / np.sum(single_base_nb_mean[:,idx] > 0) + this_pred = pred[(c * n_obs) : (c * n_obs + n_obs)] + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, idx] > 0) + / np.sum(single_base_nb_mean[:, idx] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum(tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) + single_llf[i, c] = ratio_nonzeros * np.sum( + tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) else: - single_llf[i,c] = np.sum(tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] # new_assignment[i] = np.argmax( w_node ) w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + for j in adjacency_mat[i, :].nonzero()[1]: + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, 
single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def merge_by_minspots(assignment, res, single_total_bb_RD, min_spots_thresholds=50, min_umicount_thresholds=0, single_tumor_prop=None, threshold=0.5): +def merge_by_minspots( + assignment, + res, + single_total_bb_RD, + min_spots_thresholds=50, + min_umicount_thresholds=0, + single_tumor_prop=None, + threshold=0.5, +): n_clones = len(np.unique(assignment)) if n_clones == 1: - merged_groups = [ [assignment[0]] ] + merged_groups = [[assignment[0]]] return merged_groups, res n_obs = int(len(res["pred_cnv"]) / n_clones) @@ -277,19 +484,44 @@ def merge_by_minspots(assignment, res, single_total_bb_RD, min_spots_thresholds= tmp_single_tumor_prop = single_tumor_prop unique_assignment = np.unique(new_assignment) # find entries in unique_assignment such that either min_spots_thresholds or min_umicount_thresholds is not satisfied - failed_clones = [ c for c in unique_assignment if (np.sum(new_assignment[tmp_single_tumor_prop > threshold] == c) < min_spots_thresholds) or \ - (np.sum(single_total_bb_RD[:, (new_assignment == c)&(tmp_single_tumor_prop > threshold)]) < min_umicount_thresholds) ] + failed_clones = [ + c + for c in unique_assignment + if ( + np.sum(new_assignment[tmp_single_tumor_prop > threshold] == c) + < min_spots_thresholds + ) + or ( + np.sum( + single_total_bb_RD[ + :, (new_assignment == c) & (tmp_single_tumor_prop > threshold) + ] + ) + < min_umicount_thresholds + ) + ] # find the remaining unique_assigment that satisfies both thresholds - successful_clones = [ c for c in unique_assignment if not c in failed_clones ] + successful_clones = [c for c in unique_assignment if not c in failed_clones] # initial merging groups: each successful clone is its own group merging_groups = [[i] for i in successful_clones] # for each failed clone, assign them to the closest successful clone if len(failed_clones) > 0: for c in failed_clones: - idx_max = np.argmax([np.sum(single_total_bb_RD[:, (new_assignment == c_prime)&(tmp_single_tumor_prop > threshold)]) for c_prime in successful_clones]) + idx_max = np.argmax( + [ + np.sum( + single_total_bb_RD[ + :, + (new_assignment == c_prime) + & (tmp_single_tumor_prop > threshold), + ] + ) + for c_prime in successful_clones + ] + ) merging_groups[idx_max].append(c) map_clone_id = {} - for i,x in enumerate(merging_groups): + for i, x in enumerate(merging_groups): for z in x: map_clone_id[z] = i new_assignment = np.array([map_clone_id[x] for x in new_assignment]) @@ -309,16 +541,55 @@ def merge_by_minspots(assignment, res, single_total_bb_RD, min_spots_thresholds= merged_res = copy.copy(res) merged_res["new_assignment"] = new_assignment merged_res["total_llf"] = np.NAN - merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) - merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) + merged_res["pred_cnv"] = np.concatenate( + [ + res["pred_cnv"][(c[0] * n_obs) : (c[0] * n_obs + n_obs)] + for c in merging_groups + ] + ) + merged_res["log_gamma"] = np.hstack( + [ + res["log_gamma"][:, (c[0] * n_obs) : (c[0] * n_obs + n_obs)] + for c in merging_groups + ] + ) return merging_groups, merged_res -def hmrf_pipeline(outdir, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states, \ - log_sitewise_transmat, coords=None, smooth_mat=None, adjacency_mat=None, sample_ids=None, max_iter_outer=5, nodepotential="max", \ - 
hmmclass=hmm_sitewise, params="stmp", t=1-1e-6, random_state=0, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None,\ - fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True, \ - is_diag=True, max_iter=100, tol=1e-4, unit_xsquared=9, unit_ysquared=3, spatial_weight=1.0): +def hmrf_pipeline( + outdir, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + n_states, + log_sitewise_transmat, + coords=None, + smooth_mat=None, + adjacency_mat=None, + sample_ids=None, + max_iter_outer=5, + nodepotential="max", + hmmclass=hmm_sitewise, + params="stmp", + t=1 - 1e-6, + random_state=0, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + fix_NB_dispersion=False, + shared_NB_dispersion=True, + fix_BB_dispersion=False, + shared_BB_dispersion=True, + is_diag=True, + max_iter=100, + tol=1e-4, + unit_xsquared=9, + unit_ysquared=3, + spatial_weight=1.0, +): n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) # spot adjacency matric @@ -331,14 +602,25 @@ def hmrf_pipeline(outdir, single_X, lengths, single_base_nb_mean, single_total_b else: unique_sample_ids = np.unique(sample_ids) n_samples = len(unique_sample_ids) - tmp_map_index = {unique_sample_ids[i]:i for i in range(len(unique_sample_ids))} - sample_ids = np.array([ tmp_map_index[x] for x in sample_ids]) + tmp_map_index = {unique_sample_ids[i]: i for i in range(len(unique_sample_ids))} + sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) # pseudobulk - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index + ) # initialize HMM parameters by GMM if (init_log_mu is None) or (init_p_binom is None): - init_log_mu, init_p_binom = initialization_by_gmm(n_states, X, base_nb_mean, total_bb_RD, params, random_state=random_state, in_log_space=False, only_minor=False) + init_log_mu, init_p_binom = initialization_by_gmm( + n_states, + X, + base_nb_mean, + total_bb_RD, + params, + random_state=random_state, + in_log_space=False, + only_minor=False, + ) # initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu @@ -352,32 +634,74 @@ def hmrf_pipeline(outdir, single_X, lengths, single_base_nb_mean, single_total_b last_alphas = init_alphas last_taus = init_taus last_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c # HMM for r in range(max_iter_outer): if not Path(f"{outdir}/round{r}_nstates{n_states}_{params}.npz").exists(): ##### initialize with the parameters of last iteration ##### - res = pipeline_baum_welch(None, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, \ - hmmclass=hmmclass, params=params, t=t, random_state=random_state, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ - is_diag=is_diag, init_log_mu=last_log_mu, init_p_binom=last_p_binom, init_alphas=last_alphas, init_taus=last_taus, max_iter=max_iter, tol=tol) + res = pipeline_baum_welch( + None, + X, + lengths, + n_states, + base_nb_mean, + total_bb_RD, + 
log_sitewise_transmat, + hmmclass=hmmclass, + params=params, + t=t, + random_state=random_state, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=is_diag, + init_log_mu=last_log_mu, + init_p_binom=last_p_binom, + init_alphas=last_alphas, + init_taus=last_taus, + max_iter=max_iter, + tol=tol, + ) pred = np.argmax(res["log_gamma"], axis=0) # clone assignmment if nodepotential == "max": - new_assignment, single_llf, total_llf = aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, res, pred, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = aggr_hmrf_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + pred, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) elif nodepotential == "weighted_sum": - new_assignment, single_llf, total_llf = hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, res, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = hmrf_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) else: raise Exception("Unknown mode for nodepotential!") # handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) - re_indexing = {c:i for i,c in enumerate(remaining_clones)} + re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) # res["prev_assignment"] = last_assignment @@ -391,18 +715,49 @@ def hmrf_pipeline(outdir, single_X, lengths, single_base_nb_mean, single_total_b res = np.load(f"{outdir}/round{r}_nstates{n_states}_{params}.npz") # regroup to pseudobulk - clone_index = [np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"]))] - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index) + clone_index = [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ] + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index + ) # update last parameter if "mp" in params: - print("outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu-res["new_log_mu"])), np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) + print( + "outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( + r, + res["total_llf"], + np.mean(np.abs(last_log_mu - res["new_log_mu"])), + np.mean(np.abs(last_p_binom - res["new_p_binom"])), + ) + ) elif "m" in params: - print("outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu-res["new_log_mu"])) )) + print( + "outer iteration {}: total_llf = {}, difference between NB 
parameters = {}".format( + r, + res["total_llf"], + np.mean(np.abs(last_log_mu - res["new_log_mu"])), + ) + ) elif "p" in params: - print("outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) - print("outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) )) - if adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1: + print( + "outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( + r, + res["total_llf"], + np.mean(np.abs(last_p_binom - res["new_p_binom"])), + ) + ) + print( + "outer iteration {}: ARI between assignment = {}".format( + r, adjusted_rand_score(last_assignment, res["new_assignment"]) + ) + ) + if ( + adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 + or len(np.unique(res["new_assignment"])) == 1 + ): break last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] @@ -412,16 +767,52 @@ def hmrf_pipeline(outdir, single_X, lengths, single_base_nb_mean, single_total_b log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] - this_persample_weight = np.bincount(res["new_assignment"][index], minlength=X.shape[2]) / len(index) - log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) - log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) - - -def hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states, \ - log_sitewise_transmat, coords=None, smooth_mat=None, adjacency_mat=None, sample_ids=None, max_iter_outer=5, nodepotential="max", hmmclass=hmm_sitewise, \ - params="stmp", t=1-1e-6, random_state=0, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None,\ - fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True, \ - is_diag=True, max_iter=100, tol=1e-4, unit_xsquared=9, unit_ysquared=3, spatial_weight=1.0): + this_persample_weight = np.bincount( + res["new_assignment"][index], minlength=X.shape[2] + ) / len(index) + log_persample_weights[:, sidx] = np.where( + this_persample_weight > 0, np.log(this_persample_weight), -50 + ) + log_persample_weights[:, sidx] = log_persample_weights[ + :, sidx + ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) + + +def hmrf_concatenate_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + n_states, + log_sitewise_transmat, + coords=None, + smooth_mat=None, + adjacency_mat=None, + sample_ids=None, + max_iter_outer=5, + nodepotential="max", + hmmclass=hmm_sitewise, + params="stmp", + t=1 - 1e-6, + random_state=0, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + fix_NB_dispersion=False, + shared_NB_dispersion=True, + fix_BB_dispersion=False, + shared_BB_dispersion=True, + is_diag=True, + max_iter=100, + tol=1e-4, + unit_xsquared=9, + unit_ysquared=3, + spatial_weight=1.0, +): n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) # checking input @@ -434,15 +825,27 @@ def hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_ else: unique_sample_ids = 
np.unique(sample_ids) n_samples = len(unique_sample_ids) - tmp_map_index = {unique_sample_ids[i]:i for i in range(len(unique_sample_ids))} - sample_ids = np.array([ tmp_map_index[x] for x in sample_ids]) + tmp_map_index = {unique_sample_ids[i]: i for i in range(len(unique_sample_ids))} + sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) # pseudobulk - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index + ) # initialize HMM parameters by GMM if (init_log_mu is None) or (init_p_binom is None): - init_log_mu, init_p_binom = initialization_by_gmm(n_states, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), params, random_state=random_state, in_log_space=False, only_minor=False) + init_log_mu, init_p_binom = initialization_by_gmm( + n_states, + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + params, + random_state=random_state, + in_log_space=False, + only_minor=False, + ) # initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu @@ -456,51 +859,112 @@ def hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_ last_alphas = init_alphas last_taus = init_taus last_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c # HMM for r in range(max_iter_outer): # assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. 
When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should contain two keys: "num_iterations" and f"round_-1_assignment" for clone initialization - allres = np.load(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True) + allres = np.load( + f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True + ) allres = dict(allres) if allres["num_iterations"] > r: - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} - else: - res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), n_states, \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), \ - hmmclass=hmmclass, params=params, t=t, random_state=random_state, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ - is_diag=is_diag, init_log_mu=last_log_mu, init_p_binom=last_p_binom, init_alphas=last_alphas, init_taus=last_taus, max_iter=max_iter, tol=tol) + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } + else: + res = pipeline_baum_welch( + None, + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + np.tile(lengths, X.shape[2]), + n_states, + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + np.tile(log_sitewise_transmat, X.shape[2]), + hmmclass=hmmclass, + params=params, + t=t, + random_state=random_state, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=is_diag, + init_log_mu=last_log_mu, + init_p_binom=last_p_binom, + init_alphas=last_alphas, + init_taus=last_taus, + max_iter=max_iter, + tol=tol, + ) pred = np.argmax(res["log_gamma"], axis=0) # HMRF clone assignmment if nodepotential == "max": - new_assignment, single_llf, total_llf = aggr_hmrf_reassignment_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, res, pred, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = ( + aggr_hmrf_reassignment_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + pred, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + 
log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) + ) elif nodepotential == "weighted_sum": - new_assignment, single_llf, total_llf = hmrf_reassignment_posterior_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, res, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = ( + hmrf_reassignment_posterior_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) + ) else: raise Exception("Unknown mode for nodepotential!") # handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) - re_indexing = {c:i for i,c in enumerate(remaining_clones)} + re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) - concat_idx = np.concatenate([ np.arange(c*n_obs, c*n_obs+n_obs) for c in remaining_clones ]) - res["log_gamma"] = res["log_gamma"][:,concat_idx] + concat_idx = np.concatenate( + [np.arange(c * n_obs, c * n_obs + n_obs) for c in remaining_clones] + ) + res["log_gamma"] = res["log_gamma"][:, concat_idx] res["pred_cnv"] = res["pred_cnv"][concat_idx] # res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf # append to allres - for k,v in res.items(): + for k, v in res.items(): if k == "prev_assignment": allres[f"round{r-1}_assignment"] = v elif k == "new_assignment": @@ -511,18 +975,44 @@ def hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_ np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) # # regroup to pseudobulk - clone_index = [np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"]))] - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index) + clone_index = [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ] + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index + ) # if "mp" in params: - print("outer iteration {}: difference between parameters = {}, {}".format( r, np.mean(np.abs(last_log_mu-res["new_log_mu"])), np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) + print( + "outer iteration {}: difference between parameters = {}, {}".format( + r, + np.mean(np.abs(last_log_mu - res["new_log_mu"])), + np.mean(np.abs(last_p_binom - res["new_p_binom"])), + ) + ) elif "m" in params: - print("outer iteration {}: difference between NB parameters = {}".format( r, np.mean(np.abs(last_log_mu-res["new_log_mu"])) )) + print( + "outer iteration {}: difference between NB parameters = {}".format( + r, np.mean(np.abs(last_log_mu - res["new_log_mu"])) + ) + ) elif "p" in params: - print("outer iteration {}: difference between BetaBinom parameters = {}".format( r, np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) - print("outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) )) + print( + "outer iteration {}: difference between BetaBinom parameters = {}".format( + r, np.mean(np.abs(last_p_binom - 
res["new_p_binom"])) + ) + ) + print( + "outer iteration {}: ARI between assignment = {}".format( + r, adjusted_rand_score(last_assignment, res["new_assignment"]) + ) + ) # if np.all( last_assignment == res["new_assignment"] ): - if adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1: + if ( + adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 + or len(np.unique(res["new_assignment"])) == 1 + ): break last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] @@ -532,17 +1022,38 @@ def hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_ log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] - this_persample_weight = np.bincount(res["new_assignment"][index], minlength=X.shape[2]) / len(index) - log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) - log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) - + this_persample_weight = np.bincount( + res["new_assignment"][index], minlength=X.shape[2] + ) / len(index) + log_persample_weights[:, sidx] = np.where( + this_persample_weight > 0, np.log(this_persample_weight), -50 + ) + log_persample_weights[:, sidx] = log_persample_weights[ + :, sidx + ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) ############################################################ # Normal-tumor clone mixture ############################################################ -def aggr_hmrfmix_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, pred, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): + +def aggr_hmrfmix_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + pred, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] @@ -555,46 +1066,98 @@ def aggr_hmrfmix_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, posterior = np.zeros((N, n_clones)) # for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] for c in range(n_clones): - if np.sum(single_base_nb_mean[:,idx] > 0) > 0: - mu = np.exp(res["new_log_mu"][(pred%n_states),:]) / np.sum(np.exp(res["new_log_mu"][(pred%n_states),:]) * lambd) - weighted_tp = (np.mean(single_tumor_prop[idx]) * mu) / (np.mean(single_tumor_prop[idx]) * mu + 1 - np.mean(single_tumor_prop[idx])) + if np.sum(single_base_nb_mean[:, idx] > 0) > 0: + mu = np.exp(res["new_log_mu"][(pred % n_states), :]) / np.sum( + np.exp(res["new_log_mu"][(pred % n_states), :]) * lambd + ) + weighted_tp = (np.mean(single_tumor_prop[idx]) * mu) / ( + np.mean(single_tumor_prop[idx]) * mu + + 1 + - np.mean(single_tumor_prop[idx]) + ) else: - weighted_tp = np.repeat(np.mean(single_tumor_prop[idx]), single_X.shape[0]) - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"][:,c:(c+1)], 
res["new_alphas"][:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"][:,c:(c+1)], res["new_taus"][:,c:(c+1)], np.ones((n_obs,1)) * np.mean(single_tumor_prop[idx]), weighted_tp.reshape(-1,1) ) - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,idx] > 0) / np.sum(single_base_nb_mean[:,idx] > 0) + weighted_tp = np.repeat( + np.mean(single_tumor_prop[idx]), single_X.shape[0] + ) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"][:, c : (c + 1)], + res["new_alphas"][:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"][:, c : (c + 1)], + res["new_taus"][:, c : (c + 1)], + np.ones((n_obs, 1)) * np.mean(single_tumor_prop[idx]), + weighted_tp.reshape(-1, 1), + ) + ) + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, idx] > 0) + / np.sum(single_base_nb_mean[:, idx] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) + single_llf[i, c] = ratio_nonzeros * np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0]) else: - single_llf[i,c] = np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) + single_llf[i, c] = np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0]) # - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: if new_assignment[j] >= 0: # w_edge[new_assignment[j]] += 1 - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def hmrfmix_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, smooth_mat, adjacency_mat, prev_assignment, sample_ids, 
log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): +def hmrfmix_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] @@ -607,53 +1170,136 @@ def hmrfmix_reassignment_posterior(single_X, single_base_nb_mean, single_total_b posterior = np.zeros((N, n_clones)) for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] for c in range(n_clones): if np.sum(single_base_nb_mean) > 0: - this_pred_cnv = res["pred_cnv"][:,c] - logmu_shift = np.array( scipy.special.logsumexp(res["new_log_mu"][this_pred_cnv,c] + np.log(lambd), axis=0) ) - kwargs = {"logmu_shift":logmu_shift.reshape(1,1), "sample_length":np.array([n_obs])} + this_pred_cnv = res["pred_cnv"][:, c] + logmu_shift = np.array( + scipy.special.logsumexp( + res["new_log_mu"][this_pred_cnv, c] + np.log(lambd), axis=0 + ) + ) + kwargs = { + "logmu_shift": logmu_shift.reshape(1, 1), + "sample_length": np.array([n_obs]), + } else: kwargs = {} - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"][:,c:(c+1)], res["new_alphas"][:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"][:,c:(c+1)], res["new_taus"][:,c:(c+1)], np.ones((n_obs,1)) * np.mean(single_tumor_prop[idx]), **kwargs ) - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,i:(i+1)] > 0) / np.sum(single_base_nb_mean[:,i:(i+1)] > 0) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"][:, c : (c + 1)], + res["new_alphas"][:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"][:, c : (c + 1)], + res["new_taus"][:, c : (c + 1)], + np.ones((n_obs, 1)) * np.mean(single_tumor_prop[idx]), + **kwargs, + ) + ) + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) + / np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + single_llf[i, c] = ratio_nonzeros * np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) else: - single_llf[i,c] = np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + \ - np.sum( 
scipy.special.logsumexp(tmp_log_emission_baf[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) - - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: if new_assignment[j] >= 0: # w_edge[new_assignment[j]] += 1 - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \ - coords=None, smooth_mat=None, adjacency_mat=None, sample_ids=None, max_iter_outer=5, nodepotential="max", hmmclass=hmm_sitewise, params="stmp", t=1-1e-6, random_state=0, \ - init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None,\ - fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True, \ - is_diag=True, max_iter=100, tol=1e-4, unit_xsquared=9, unit_ysquared=3, spatial_weight=1.0/6, tumorprop_threshold=0.5): +def hmrfmix_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + initial_clone_index, + n_states, + log_sitewise_transmat, + coords=None, + smooth_mat=None, + adjacency_mat=None, + sample_ids=None, + max_iter_outer=5, + nodepotential="max", + hmmclass=hmm_sitewise, + params="stmp", + t=1 - 1e-6, + random_state=0, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + fix_NB_dispersion=False, + shared_NB_dispersion=True, + fix_BB_dispersion=False, + shared_BB_dispersion=True, + is_diag=True, + max_iter=100, + tol=1e-4, + unit_xsquared=9, + unit_ysquared=3, + spatial_weight=1.0 / 6, + tumorprop_threshold=0.5, +): n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) # spot adjacency matric @@ -666,15 +1312,32 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin else: unique_sample_ids = np.unique(sample_ids) n_samples = len(unique_sample_ids) - tmp_map_index = {unique_sample_ids[i]:i for i in range(len(unique_sample_ids))} - sample_ids = np.array([ tmp_map_index[x] for x in sample_ids]) + tmp_map_index = {unique_sample_ids[i]: i for i in 
range(len(unique_sample_ids))} + sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) # pseudobulk - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=tumorprop_threshold) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + single_tumor_prop, + threshold=tumorprop_threshold, + ) # initialize HMM parameters by GMM if (init_log_mu is None) or (init_p_binom is None): - init_log_mu, init_p_binom = initialization_by_gmm(n_states, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), params, random_state=random_state, in_log_space=False, only_minor=False) + init_log_mu, init_p_binom = initialization_by_gmm( + n_states, + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + params, + random_state=random_state, + in_log_space=False, + only_minor=False, + ) # initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu @@ -688,27 +1351,69 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin last_alphas = init_alphas last_taus = init_taus last_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c n_clones = len(initial_clone_index) # HMM for r in range(max_iter_outer): - allres = np.load(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True) + allres = np.load( + f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True + ) allres = dict(allres) if allres["num_iterations"] > r: - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } else: - res = {"new_log_mu":[], "new_alphas":[], "new_p_binom":[], "new_taus":[], "new_log_startprob":[], "new_log_transmat":[], "log_gamma":[], "pred_cnv":[], "llf":[]} + res = { + "new_log_mu": [], + "new_alphas": [], + "new_p_binom": [], + "new_taus": [], + "new_log_startprob": [], + "new_log_transmat": [], + 
"log_gamma": [], + "pred_cnv": [], + "llf": [], + } for c in range(n_clones): - tmpres = pipeline_baum_welch(None, X[:,:,c:(c+1)], lengths, n_states, base_nb_mean[:,c:(c+1)], total_bb_RD[:,c:(c+1)], log_sitewise_transmat, np.repeat(tumor_prop[c], X.shape[0]).reshape(-1,1), \ - hmmclass=hmmclass, params=params, t=t, \ - random_state=random_state, fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ - is_diag=is_diag, init_log_mu=last_log_mu[:,c:(c+1)], init_p_binom=last_p_binom[:,c:(c+1)], init_alphas=last_alphas[:,c:(c+1)], init_taus=last_taus[:,c:(c+1)], max_iter=max_iter, tol=tol) + tmpres = pipeline_baum_welch( + None, + X[:, :, c : (c + 1)], + lengths, + n_states, + base_nb_mean[:, c : (c + 1)], + total_bb_RD[:, c : (c + 1)], + log_sitewise_transmat, + np.repeat(tumor_prop[c], X.shape[0]).reshape(-1, 1), + hmmclass=hmmclass, + params=params, + t=t, + random_state=random_state, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=is_diag, + init_log_mu=last_log_mu[:, c : (c + 1)], + init_p_binom=last_p_binom[:, c : (c + 1)], + init_alphas=last_alphas[:, c : (c + 1)], + init_taus=last_taus[:, c : (c + 1)], + max_iter=max_iter, + tol=tol, + ) pred = np.argmax(tmpres["log_gamma"], axis=0) for k in res.keys(): res[k] = [res[k], tmpres[k]] @@ -723,18 +1428,43 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin # clone assignmment if nodepotential == "max": - new_assignment, single_llf, total_llf = aggr_hmrfmix_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, pred, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = aggr_hmrfmix_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + pred, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) elif nodepotential == "weighted_sum": - new_assignment, single_llf, total_llf = hmrfmix_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = hmrfmix_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) else: raise Exception("Unknown mode for nodepotential!") # handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) - re_indexing = {c:i for i,c in enumerate(remaining_clones)} + re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) # res["prev_assignment"] = last_assignment @@ -742,7 +1472,7 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin res["total_llf"] = total_llf # append to allres - for k,v in res.items(): + for k, v in res.items(): 
if k == "prev_assignment": allres[f"round{r-1}_assignment"] = v elif k == "new_assignment": @@ -753,19 +1483,55 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) # regroup to pseudobulk - clone_index = [np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"]))] - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, clone_index, single_tumor_prop, threshold=tumorprop_threshold) + clone_index = [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ] + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + threshold=tumorprop_threshold, + ) # update last parameter if "mp" in params: - print("outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu-res["new_log_mu"])), np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) + print( + "outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( + r, + res["total_llf"], + np.mean(np.abs(last_log_mu - res["new_log_mu"])), + np.mean(np.abs(last_p_binom - res["new_p_binom"])), + ) + ) elif "m" in params: - print("outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu-res["new_log_mu"])) )) + print( + "outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( + r, + res["total_llf"], + np.mean(np.abs(last_log_mu - res["new_log_mu"])), + ) + ) elif "p" in params: - print("outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) - print("outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) )) + print( + "outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( + r, + res["total_llf"], + np.mean(np.abs(last_p_binom - res["new_p_binom"])), + ) + ) + print( + "outer iteration {}: ARI between assignment = {}".format( + r, adjusted_rand_score(last_assignment, res["new_assignment"]) + ) + ) # if np.all( last_assignment == res["new_assignment"] ): - if adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1: + if ( + adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 + or len(np.unique(res["new_assignment"])) == 1 + ): break last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] @@ -775,12 +1541,32 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] - this_persample_weight = np.bincount(res["new_assignment"][index], minlength=X.shape[2]) / len(index) - log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) - log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) - - -def hmrfmix_reassignment_posterior_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, smooth_mat, adjacency_mat, 
prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): + this_persample_weight = np.bincount( + res["new_assignment"][index], minlength=X.shape[2] + ) / len(index) + log_persample_weights[:, sidx] = np.where( + this_persample_weight > 0, np.log(this_persample_weight), -50 + ) + log_persample_weights[:, sidx] = log_persample_weights[ + :, sidx + ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) + + +def hmrfmix_reassignment_posterior_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = np.max(prev_assignment) + 1 @@ -792,52 +1578,128 @@ def hmrfmix_reassignment_posterior_concatenate(single_X, single_base_nb_mean, si if np.sum(single_base_nb_mean) > 0: logmu_shift = [] for c in range(n_clones): - this_pred_cnv = np.argmax(res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0)%n_states - logmu_shift.append( scipy.special.logsumexp(res["new_log_mu"][this_pred_cnv,:] + np.log(lambd).reshape(-1,1), axis=0) ) + this_pred_cnv = ( + np.argmax( + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], axis=0 + ) + % n_states + ) + logmu_shift.append( + scipy.special.logsumexp( + res["new_log_mu"][this_pred_cnv, :] + np.log(lambd).reshape(-1, 1), + axis=0, + ) + ) logmu_shift = np.vstack(logmu_shift) - kwargs = {"logmu_shift":logmu_shift, "sample_length":np.ones(n_clones,dtype=int) * n_obs} + kwargs = { + "logmu_shift": logmu_shift, + "sample_length": np.ones(n_clones, dtype=int) * n_obs, + } else: kwargs = {} # posterior = np.zeros((N, n_clones)) for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] for c in range(n_clones): - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"], res["new_alphas"], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"], res["new_taus"], np.ones((n_obs,1)) * np.mean(single_tumor_prop[idx]), **kwargs ) - - if np.sum(single_base_nb_mean[:,i:(i+1)] > 0) > 0 and np.sum(single_total_bb_RD[:,i:(i+1)] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,i:(i+1)] > 0) / np.sum(single_base_nb_mean[:,i:(i+1)] > 0) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"], + res["new_alphas"], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"], + res["new_taus"], + np.ones((n_obs, 1)) * np.mean(single_tumor_prop[idx]), + **kwargs, + ) + ) + + if ( + np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) > 0 + and np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) + / np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:, :, 0] + 
res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + single_llf[i, c] = ratio_nonzeros * np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) else: - single_llf[i,c] = np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: # w_edge[new_assignment[j]] += 1 - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def aggr_hmrfmix_reassignment_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, pred, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): +def aggr_hmrfmix_reassignment_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + pred, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = int(len(pred) / n_obs) @@ -850,50 +1712,120 @@ def aggr_hmrfmix_reassignment_concatenate(single_X, single_base_nb_mean, single_ posterior = np.zeros((N, n_clones)) # for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] for c in range(n_clones): - this_pred = pred[(c*n_obs):(c*n_obs+n_obs)] - if np.sum(single_base_nb_mean[:,idx] > 0) > 0: - mu = np.exp(res["new_log_mu"][(this_pred%n_states),:]) / 
np.sum(np.exp(res["new_log_mu"][(this_pred%n_states),:]) * lambd) - weighted_tp = (np.mean(single_tumor_prop[idx]) * mu) / (np.mean(single_tumor_prop[idx]) * mu + 1 - np.mean(single_tumor_prop[idx])) + this_pred = pred[(c * n_obs) : (c * n_obs + n_obs)] + if np.sum(single_base_nb_mean[:, idx] > 0) > 0: + mu = np.exp(res["new_log_mu"][(this_pred % n_states), :]) / np.sum( + np.exp(res["new_log_mu"][(this_pred % n_states), :]) * lambd + ) + weighted_tp = (np.mean(single_tumor_prop[idx]) * mu) / ( + np.mean(single_tumor_prop[idx]) * mu + + 1 + - np.mean(single_tumor_prop[idx]) + ) else: - weighted_tp = np.repeat(np.mean(single_tumor_prop[idx]), single_X.shape[0]) - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"], res["new_alphas"], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"], res["new_taus"], np.ones((n_obs,1)) * np.mean(single_tumor_prop[idx]), weighted_tp.reshape(-1,1) ) - - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,idx] > 0) / np.sum(single_base_nb_mean[:,idx] > 0) + weighted_tp = np.repeat( + np.mean(single_tumor_prop[idx]), single_X.shape[0] + ) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"], + res["new_alphas"], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"], + res["new_taus"], + np.ones((n_obs, 1)) * np.mean(single_tumor_prop[idx]), + weighted_tp.reshape(-1, 1), + ) + ) + + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, idx] > 0) + / np.sum(single_base_nb_mean[:, idx] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum(tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) + single_llf[i, c] = ratio_nonzeros * np.sum( + tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) else: - single_llf[i,c] = np.sum(tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: # w_edge[new_assignment[j]] += 1 - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + 
spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \ - coords=None, smooth_mat=None, adjacency_mat=None, sample_ids=None, max_iter_outer=5, nodepotential="max", hmmclass=hmm_sitewise, params="stmp", t=1-1e-6, random_state=0, \ - init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None,\ - fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True, \ - is_diag=True, max_iter=100, tol=1e-4, unit_xsquared=9, unit_ysquared=3, spatial_weight=1.0/6, tumorprop_threshold=0.5): +def hmrfmix_concatenate_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + initial_clone_index, + n_states, + log_sitewise_transmat, + coords=None, + smooth_mat=None, + adjacency_mat=None, + sample_ids=None, + max_iter_outer=5, + nodepotential="max", + hmmclass=hmm_sitewise, + params="stmp", + t=1 - 1e-6, + random_state=0, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + fix_NB_dispersion=False, + shared_NB_dispersion=True, + fix_BB_dispersion=False, + shared_BB_dispersion=True, + is_diag=True, + max_iter=100, + tol=1e-4, + unit_xsquared=9, + unit_ysquared=3, + spatial_weight=1.0 / 6, + tumorprop_threshold=0.5, +): n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) # spot adjacency matric @@ -906,17 +1838,34 @@ def hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_ else: unique_sample_ids = np.unique(sample_ids) n_samples = len(unique_sample_ids) - tmp_map_index = {unique_sample_ids[i]:i for i in range(len(unique_sample_ids))} - sample_ids = np.array([ tmp_map_index[x] for x in sample_ids]) + tmp_map_index = {unique_sample_ids[i]: i for i in range(len(unique_sample_ids))} + sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * (-np.log(n_clones)) # pseudobulk - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=tumorprop_threshold) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + single_tumor_prop, + threshold=tumorprop_threshold, + ) # baseline proportion of UMI counts lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) # initialize HMM parameters by GMM if (init_log_mu is None) or (init_p_binom is None): - init_log_mu, init_p_binom = initialization_by_gmm(n_states, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), params, random_state=random_state, in_log_space=False, 
only_minor=False) + init_log_mu, init_p_binom = initialization_by_gmm( + n_states, + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + params, + random_state=random_state, + in_log_space=False, + only_minor=False, + ) # initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu @@ -930,56 +1879,120 @@ def hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_ last_alphas = init_alphas last_taus = init_taus last_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c # HMM for r in range(max_iter_outer): # assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should contain two keys: "num_iterations" and f"round_-1_assignment" for clone initialization - allres = np.load(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True) + allres = np.load( + f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True + ) allres = dict(allres) if allres["num_iterations"] > r: - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } else: - sample_length = np.ones(X.shape[2],dtype=int) * X.shape[0] - remain_kwargs = {"sample_length":sample_length, "lambd":lambd} + sample_length = np.ones(X.shape[2], dtype=int) * X.shape[0] + remain_kwargs = {"sample_length": sample_length, "lambd": lambd} if f"round{r-1}_log_gamma" in allres: remain_kwargs["log_gamma"] = allres[f"round{r-1}_log_gamma"] - res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), n_states, \ - # base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), tumor_prop, \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), np.repeat(tumor_prop, X.shape[0]).reshape(-1,1), \ - hmmclass=hmmclass, params=params, t=t, random_state=random_state, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ - 
is_diag=is_diag, init_log_mu=last_log_mu, init_p_binom=last_p_binom, init_alphas=last_alphas, init_taus=last_taus, max_iter=max_iter, tol=tol, **remain_kwargs) + res = pipeline_baum_welch( + None, + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + np.tile(lengths, X.shape[2]), + n_states, # base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), tumor_prop, \ + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + np.tile(log_sitewise_transmat, X.shape[2]), + np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1), + hmmclass=hmmclass, + params=params, + t=t, + random_state=random_state, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=is_diag, + init_log_mu=last_log_mu, + init_p_binom=last_p_binom, + init_alphas=last_alphas, + init_taus=last_taus, + max_iter=max_iter, + tol=tol, + **remain_kwargs, + ) pred = np.argmax(res["log_gamma"], axis=0) # clone assignmment if nodepotential == "max": - new_assignment, single_llf, total_llf = aggr_hmrfmix_reassignment_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, pred, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = ( + aggr_hmrfmix_reassignment_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + pred, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) + ) elif nodepotential == "weighted_sum": - new_assignment, single_llf, total_llf = hmrfmix_reassignment_posterior_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = ( + hmrfmix_reassignment_posterior_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) + ) else: raise Exception("Unknown mode for nodepotential!") # handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) - re_indexing = {c:i for i,c in enumerate(remaining_clones)} + re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) - concat_idx = np.concatenate([ np.arange(c*n_obs, c*n_obs+n_obs) for c in remaining_clones ]) - res["log_gamma"] = res["log_gamma"][:,concat_idx] + concat_idx = np.concatenate( + [np.arange(c * n_obs, c * n_obs + n_obs) for c in remaining_clones] + ) + res["log_gamma"] = res["log_gamma"][:, concat_idx] res["pred_cnv"] = res["pred_cnv"][concat_idx] # add to results res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf # append to allres - for k,v in res.items(): + for k, v in res.items(): if k == "prev_assignment": allres[f"round{r-1}_assignment"] = v elif k == "new_assignment": @@ -990,18 
+2003,49 @@ def hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_ np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) # # regroup to pseudobulk - clone_index = [np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"]))] - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, clone_index, single_tumor_prop, threshold=tumorprop_threshold) + clone_index = [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ] + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + threshold=tumorprop_threshold, + ) # if "mp" in params: - print("outer iteration {}: difference between parameters = {}, {}".format( r, np.mean(np.abs(last_log_mu-res["new_log_mu"])), np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) + print( + "outer iteration {}: difference between parameters = {}, {}".format( + r, + np.mean(np.abs(last_log_mu - res["new_log_mu"])), + np.mean(np.abs(last_p_binom - res["new_p_binom"])), + ) + ) elif "m" in params: - print("outer iteration {}: difference between NB parameters = {}".format( r, np.mean(np.abs(last_log_mu-res["new_log_mu"])) )) + print( + "outer iteration {}: difference between NB parameters = {}".format( + r, np.mean(np.abs(last_log_mu - res["new_log_mu"])) + ) + ) elif "p" in params: - print("outer iteration {}: difference between BetaBinom parameters = {}".format( r, np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) - print("outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) )) + print( + "outer iteration {}: difference between BetaBinom parameters = {}".format( + r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) + ) + ) + print( + "outer iteration {}: ARI between assignment = {}".format( + r, adjusted_rand_score(last_assignment, res["new_assignment"]) + ) + ) # if np.all( last_assignment == res["new_assignment"] ): - if adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1: + if ( + adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 + or len(np.unique(res["new_assignment"])) == 1 + ): break last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] @@ -1011,15 +2055,37 @@ def hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_ log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] - this_persample_weight = np.bincount(res["new_assignment"][index], minlength=X.shape[2]) / len(index) - log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) - log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) + this_persample_weight = np.bincount( + res["new_assignment"][index], minlength=X.shape[2] + ) / len(index) + log_persample_weights[:, sidx] = np.where( + this_persample_weight > 0, np.log(this_persample_weight), -50 + ) + log_persample_weights[:, sidx] = log_persample_weights[ + :, sidx + ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) ############################################################ # Final posterior using integer copy numbers 
############################################################ -def clonelabel_posterior_withinteger(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, state_cnv, res, pred, smooth_mat, adjacency_mat, prev_assignment, sample_ids, base_nb_mean, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise): +def clonelabel_posterior_withinteger( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + state_cnv, + res, + pred, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + base_nb_mean, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, +): """ single_X : array, (n_obs, 2, n_spots) @@ -1046,74 +2112,156 @@ def clonelabel_posterior_withinteger(single_X, single_base_nb_mean, single_total N = single_X.shape[2] n_obs = single_X.shape[0] # clone IDs - tmp_clone_ids = np.array([x[5:].split(" ")[0] for x in state_cnv.columns if x[:5] == "clone"]) - clone_ids = np.array([x for i,x in enumerate(tmp_clone_ids) if i == 0 or x != tmp_clone_ids[i-1]]) + tmp_clone_ids = np.array( + [x[5:].split(" ")[0] for x in state_cnv.columns if x[:5] == "clone"] + ) + clone_ids = np.array( + [x for i, x in enumerate(tmp_clone_ids) if i == 0 or x != tmp_clone_ids[i - 1]] + ) n_clones = len(clone_ids) n_states = state_cnv.shape[0] # parameter based on integer copy numbers - lambd = base_nb_mean / np.sum(base_nb_mean, axis=0, keepdims=True) if n_clones == base_nb_mean.shape[1] else base_nb_mean[:,1:] / np.sum(base_nb_mean[:,1:], axis=0, keepdims=True) + lambd = ( + base_nb_mean / np.sum(base_nb_mean, axis=0, keepdims=True) + if n_clones == base_nb_mean.shape[1] + else base_nb_mean[:, 1:] / np.sum(base_nb_mean[:, 1:], axis=0, keepdims=True) + ) log_mu_icn = np.zeros((n_states, n_clones)) - for c,cid in enumerate(clone_ids): - log_mu_icn[:,c] = np.log( (state_cnv[f"clone{cid} A"] + state_cnv[f"clone{cid} B"]) / lambd[:,c].dot( (state_cnv[f"clone{cid} A"] + state_cnv[f"clone{cid} B"])[pred[:,c]] ) ) - p_binom_icn = np.array([ state_cnv[f"clone{cid} A"] / (state_cnv[f"clone{cid} A"] + state_cnv[f"clone{cid} B"]) for cid in clone_ids ]).T + for c, cid in enumerate(clone_ids): + log_mu_icn[:, c] = np.log( + (state_cnv[f"clone{cid} A"] + state_cnv[f"clone{cid} B"]) + / lambd[:, c].dot( + (state_cnv[f"clone{cid} A"] + state_cnv[f"clone{cid} B"])[pred[:, c]] + ) + ) + p_binom_icn = np.array( + [ + state_cnv[f"clone{cid} A"] + / (state_cnv[f"clone{cid} A"] + state_cnv[f"clone{cid} B"]) + for cid in clone_ids + ] + ).T # handle 0 in p_binom_icn if n_clones == res["new_p_binom"].shape[1]: - p_binom_icn[((p_binom_icn == 0) | (p_binom_icn == 1))] = res["new_p_binom"][((p_binom_icn == 0) | (p_binom_icn == 1))] + p_binom_icn[((p_binom_icn == 0) | (p_binom_icn == 1))] = res["new_p_binom"][ + ((p_binom_icn == 0) | (p_binom_icn == 1)) + ] elif n_clones + 1 == res["new_p_binom"].shape[1]: - p_binom_icn[((p_binom_icn == 0) | (p_binom_icn == 1))] = res["new_p_binom"][:,1:][((p_binom_icn == 0) | (p_binom_icn == 1))] + p_binom_icn[((p_binom_icn == 0) | (p_binom_icn == 1))] = res["new_p_binom"][ + :, 1: + ][((p_binom_icn == 0) | (p_binom_icn == 1))] # over-dispersion - new_alphas = copy.copy(res["new_alphas"]) if n_clones == res["new_p_binom"].shape[1] else copy.copy(res["new_alphas"][:,1:]) - new_alphas[:,:] = np.max(new_alphas) - new_taus = copy.copy(res["new_taus"]) if n_clones == res["new_p_binom"].shape[1] else copy.copy(res["new_taus"][:,1:]) - new_taus[:,:] = np.min(new_taus) + new_alphas = ( + copy.copy(res["new_alphas"]) + if n_clones == 
res["new_p_binom"].shape[1] + else copy.copy(res["new_alphas"][:, 1:]) + ) + new_alphas[:, :] = np.max(new_alphas) + new_taus = ( + copy.copy(res["new_taus"]) + if n_clones == res["new_p_binom"].shape[1] + else copy.copy(res["new_taus"][:, 1:]) + ) + new_taus[:, :] = np.min(new_taus) # result variables single_llf_rdr = np.zeros((N, n_clones)) single_llf_baf = np.zeros((N, n_clones)) single_llf = np.zeros((N, n_clones)) - df_posterior = pd.DataFrame({k:np.zeros(N) for k in [f"post_BAF_clone_{cid}" for cid in clone_ids] + [f"post_RDR_clone_{cid}" for cid in clone_ids] + \ - [f"post_nodellf_clone_{cid}" for cid in clone_ids] + [f"post_combine_clone_{cid}" for cid in clone_ids] }) + df_posterior = pd.DataFrame( + { + k: np.zeros(N) + for k in [f"post_BAF_clone_{cid}" for cid in clone_ids] + + [f"post_RDR_clone_{cid}" for cid in clone_ids] + + [f"post_nodellf_clone_{cid}" for cid in clone_ids] + + [f"post_combine_clone_{cid}" for cid in clone_ids] + } + ) # for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] if not (single_tumor_prop is None): idx = idx[~np.isnan(single_tumor_prop[idx])] for c in range(n_clones): if single_tumor_prop is None: - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), log_mu_icn[:,c:(c+1)], new_alphas[:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), p_binom_icn[:,c:(c+1)], new_taus[:,c:(c+1)] ) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + log_mu_icn[:, c : (c + 1)], + new_alphas[:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + p_binom_icn[:, c : (c + 1)], + new_taus[:, c : (c + 1)], + ) + ) else: - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), log_mu_icn[:,c:(c+1)], new_alphas[:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), p_binom_icn[:,c:(c+1)], new_taus[:,c:(c+1)], np.repeat(np.mean(single_tumor_prop[idx]), single_X.shape[0]).reshape(-1,1) ) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + log_mu_icn[:, c : (c + 1)], + new_alphas[:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + p_binom_icn[:, c : (c + 1)], + new_taus[:, c : (c + 1)], + np.repeat( + np.mean(single_tumor_prop[idx]), single_X.shape[0] + ).reshape(-1, 1), + ) + ) assert not np.any(np.isnan(tmp_log_emission_rdr)) assert not np.any(np.isnan(tmp_log_emission_baf)) # !!! tmp_log_emission_baf may be NAN # Because LoH leads to Beta-binomial p = 0 or 1, but both A and B alleles are observed in the data, which leads to Nan. # We don't directly model the erroneous measurements associated with LoH. 
# - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,idx] > 0) / np.sum(single_base_nb_mean[:,idx] > 0) - single_llf_rdr[i,c] = ratio_nonzeros * np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) - single_llf_baf[i,c] = np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) - single_llf[i,c] = single_llf_rdr[i,c] + single_llf_baf[i,c] + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, idx] > 0) + / np.sum(single_base_nb_mean[:, idx] > 0) + ) + single_llf_rdr[i, c] = ratio_nonzeros * np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + single_llf_baf[i, c] = np.sum( + tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0] + ) + single_llf[i, c] = single_llf_rdr[i, c] + single_llf_baf[i, c] else: - single_llf_rdr[i,c] = np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) - single_llf_baf[i,c] = np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) - single_llf[i,c] = single_llf_rdr[i,c] + single_llf_baf[i,c] - - w_node = copy.copy(single_llf[i,:]) - w_node += log_persample_weights[:,sample_ids[i]] + single_llf_rdr[i, c] = np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + single_llf_baf[i, c] = np.sum( + tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0] + ) + single_llf[i, c] = single_llf_rdr[i, c] + single_llf_baf[i, c] + + w_node = copy.copy(single_llf[i, :]) + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: if n_clones == base_nb_mean.shape[1]: - w_edge[prev_assignment[j]] += adjacency_mat[i,j] + w_edge[prev_assignment[j]] += adjacency_mat[i, j] else: - w_edge[prev_assignment[j] - 1] += adjacency_mat[i,j] + w_edge[prev_assignment[j] - 1] += adjacency_mat[i, j] # - df_posterior.iloc[i,:n_clones] = np.exp( single_llf_baf[i,:] - scipy.special.logsumexp(single_llf_baf[i,:]) ) - df_posterior.iloc[i,n_clones:(2*n_clones)] = np.exp( single_llf_rdr[i,:] - scipy.special.logsumexp(single_llf_rdr[i,:]) ) - df_posterior.iloc[i,(2*n_clones):(3*n_clones)] = np.exp( single_llf[i,:] - scipy.special.logsumexp(single_llf[i,:]) ) - df_posterior.iloc[i,(3*n_clones):(4*n_clones)] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + df_posterior.iloc[i, :n_clones] = np.exp( + single_llf_baf[i, :] - scipy.special.logsumexp(single_llf_baf[i, :]) + ) + df_posterior.iloc[i, n_clones : (2 * n_clones)] = np.exp( + single_llf_rdr[i, :] - scipy.special.logsumexp(single_llf_rdr[i, :]) + ) + df_posterior.iloc[i, (2 * n_clones) : (3 * n_clones)] = np.exp( + single_llf[i, :] - scipy.special.logsumexp(single_llf[i, :]) + ) + df_posterior.iloc[i, (3 * n_clones) : (4 * n_clones)] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) return df_posterior diff --git a/src/calicost/hmrf_normalmixture.py b/src/calicost/hmrf_normalmixture.py index af68580..5c05f0e 100644 --- a/src/calicost/hmrf_normalmixture.py +++ b/src/calicost/hmrf_normalmixture.py @@ -15,4 +15,3 @@ import warnings from statsmodels.tools.sm_exceptions import ValueWarning - diff --git a/src/calicost/joint_allele_generateconfig.py b/src/calicost/joint_allele_generateconfig.py index 7a16294..9898135 100644 --- a/src/calicost/joint_allele_generateconfig.py 
+++ b/src/calicost/joint_allele_generateconfig.py @@ -19,118 +19,120 @@ def read_joint_configuration_file(filename): ##### [Default settings] ##### config = { - "input_filelist" : None, - "snp_dir" : None, - "output_dir" : None, + "input_filelist": None, + "snp_dir": None, + "output_dir": None, # supporting files and preprocessing arguments - "hgtable_file" : None, - "normalidx_file" : None, - "tumorprop_file" : None, - "supervision_clone_file" : None, - "alignment_files" : [], - "filtergenelist_file" : None, - "filterregion_file" : None, - "binsize" : 1, - "rdrbinsize" : 1, + "hgtable_file": None, + "normalidx_file": None, + "tumorprop_file": None, + "supervision_clone_file": None, + "alignment_files": [], + "filtergenelist_file": None, + "filterregion_file": None, + "binsize": 1, + "rdrbinsize": 1, # "secondbinning_min_umi" : 500, - "max_nbins" : 1200, - "avg_umi_perbinspot" : 1.5, - "bafonly" : True, + "max_nbins": 1200, + "avg_umi_perbinspot": 1.5, + "bafonly": True, # phase switch probability - "nu" : 1, - "logphase_shift" : 1, - "npart_phasing" : 2, + "nu": 1, + "logphase_shift": 1, + "npart_phasing": 2, # HMRF configurations - "n_clones" : None, - "n_clones_rdr" : 2, - "min_spots_per_clone" : 100, - "min_avgumi_per_clone" : 10, - "maxspots_pooling" : 7, - "tumorprop_threshold" : 0.5, - "max_iter_outer" : 20, - "nodepotential" : "max", # max or weighted_sum - "initialization_method" : "rectangle", # rectangle or datadrive - "num_hmrf_initialization_start" : 0, - "num_hmrf_initialization_end" : 10, - "spatial_weight" : 2.0, - "construct_adjacency_method" : "hexagon", - "construct_adjacency_w" : 1.0, + "n_clones": None, + "n_clones_rdr": 2, + "min_spots_per_clone": 100, + "min_avgumi_per_clone": 10, + "maxspots_pooling": 7, + "tumorprop_threshold": 0.5, + "max_iter_outer": 20, + "nodepotential": "max", # max or weighted_sum + "initialization_method": "rectangle", # rectangle or datadrive + "num_hmrf_initialization_start": 0, + "num_hmrf_initialization_end": 10, + "spatial_weight": 2.0, + "construct_adjacency_method": "hexagon", + "construct_adjacency_w": 1.0, # HMM configurations - "n_states" : None, - "params" : None, - "t" : None, - "t_phaseing" : 1-1e-4, - "fix_NB_dispersion" : False, - "shared_NB_dispersion" : True, - "fix_BB_dispersion" : False, - "shared_BB_dispersion" : True, - "max_iter" : 30, - "tol" : 1e-3, - "gmm_random_state" : 0, - "np_threshold" : 2.0, - "np_eventminlen" : 10 + "n_states": None, + "params": None, + "t": None, + "t_phaseing": 1 - 1e-4, + "fix_NB_dispersion": False, + "shared_NB_dispersion": True, + "fix_BB_dispersion": False, + "shared_BB_dispersion": True, + "max_iter": 30, + "tol": 1e-3, + "gmm_random_state": 0, + "np_threshold": 2.0, + "np_eventminlen": 10, } argument_type = { - "input_filelist" : "str", - "snp_dir" : "str", - "output_dir" : "str", + "input_filelist": "str", + "snp_dir": "str", + "output_dir": "str", # supporting files and preprocessing arguments - "hgtable_file" : "str", - "normalidx_file" : "str", - "tumorprop_file" : "str", - "supervision_clone_file" : "str", - "alignment_files" : "list_str", - "filtergenelist_file" : "str", - "filterregion_file" : "str", - "binsize" : "int", - "rdrbinsize" : "int", + "hgtable_file": "str", + "normalidx_file": "str", + "tumorprop_file": "str", + "supervision_clone_file": "str", + "alignment_files": "list_str", + "filtergenelist_file": "str", + "filterregion_file": "str", + "binsize": "int", + "rdrbinsize": "int", # "secondbinning_min_umi" : "int", - "max_nbins" : "int", - "avg_umi_perbinspot" : 
"float", - "bafonly" : "bool", + "max_nbins": "int", + "avg_umi_perbinspot": "float", + "bafonly": "bool", # phase switch probability - "nu" : "float", - "logphase_shift" : "float", - "npart_phasing" : "int", + "nu": "float", + "logphase_shift": "float", + "npart_phasing": "int", # HMRF configurations - "n_clones" : "int", - "n_clones_rdr" : "int", - "min_spots_per_clone" : "int", - "min_avgumi_per_clone" : "int", - "maxspots_pooling" : "int", - "tumorprop_threshold" : "float", - "max_iter_outer" : "int", - "nodepotential" : "str", - "initialization_method" : "str", - "num_hmrf_initialization_start" : "int", - "num_hmrf_initialization_end" : "int", - "spatial_weight" : "float", - "construct_adjacency_method" : "str", - "construct_adjacency_w" : "float", + "n_clones": "int", + "n_clones_rdr": "int", + "min_spots_per_clone": "int", + "min_avgumi_per_clone": "int", + "maxspots_pooling": "int", + "tumorprop_threshold": "float", + "max_iter_outer": "int", + "nodepotential": "str", + "initialization_method": "str", + "num_hmrf_initialization_start": "int", + "num_hmrf_initialization_end": "int", + "spatial_weight": "float", + "construct_adjacency_method": "str", + "construct_adjacency_w": "float", # HMM configurations - "n_states" : "int", - "params" : "str", - "t" : "eval", - "t_phaseing" : "eval", - "fix_NB_dispersion" : "bool", - "shared_NB_dispersion" : "bool", - "fix_BB_dispersion" : "bool", - "shared_BB_dispersion" : "bool", - "max_iter" : "int", - "tol" : "float", - "gmm_random_state" : "int", - "np_threshold" : "float", - "np_eventminlen" : "int" + "n_states": "int", + "params": "str", + "t": "eval", + "t_phaseing": "eval", + "fix_NB_dispersion": "bool", + "shared_NB_dispersion": "bool", + "fix_BB_dispersion": "bool", + "shared_BB_dispersion": "bool", + "max_iter": "int", + "tol": "float", + "gmm_random_state": "int", + "np_threshold": "float", + "np_eventminlen": "int", } ##### [ read configuration file to update settings ] ##### - with open(filename, 'r') as fp: + with open(filename, "r") as fp: for line in fp: if line.strip() == "" or line[0] == "#": continue strs = [x.strip() for x in line.strip().split(":") if x != ""] - assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" + assert ( + strs[0] in config.keys() + ), f"{strs[0]} is not a valid configuration parameter! 
Configuration parameters are: {list(config.keys())}" if len(strs) == 1: config[strs[0]] = [] elif strs[1].upper() == "NONE": @@ -144,7 +146,7 @@ def read_joint_configuration_file(filename): elif argument_type[strs[0]] == "eval": config[strs[0]] = eval(strs[1]) elif argument_type[strs[0]] == "bool": - config[strs[0]] = (strs[1].upper() == "TRUE") + config[strs[0]] = strs[1].upper() == "TRUE" elif argument_type[strs[0]] == "list_str": config[strs[0]] = strs[1].split(" ") # assertions @@ -155,12 +157,10 @@ def read_joint_configuration_file(filename): return config - def write_joint_config_file(outputfilename, config): - list_argument_io = ["input_filelist", - "snp_dir", - "output_dir"] - list_argument_sup = ["hgtable_file", + list_argument_io = ["input_filelist", "snp_dir", "output_dir"] + list_argument_sup = [ + "hgtable_file", "normalidx_file", "tumorprop_file", "supervision_clone_file", @@ -172,11 +172,11 @@ def write_joint_config_file(outputfilename, config): # "secondbinning_min_umi", "max_nbins", "avg_umi_perbinspot", - "bafonly"] - list_argument_phase = ["nu", - "logphase_shift", - "npart_phasing"] - list_argument_hmrf = ["n_clones", + "bafonly", + ] + list_argument_phase = ["nu", "logphase_shift", "npart_phasing"] + list_argument_hmrf = [ + "n_clones", "n_clones_rdr", "min_spots_per_clone", "min_avgumi_per_clone", @@ -185,12 +185,14 @@ def write_joint_config_file(outputfilename, config): "max_iter_outer", "nodepotential", "initialization_method", - "num_hmrf_initialization_start", + "num_hmrf_initialization_start", "num_hmrf_initialization_end", "spatial_weight", "construct_adjacency_method", - "construct_adjacency_w"] - list_argument_hmm = ["n_states", + "construct_adjacency_w", + ] + list_argument_hmm = [ + "n_states", "params", "t", "t_phaseing", @@ -202,8 +204,9 @@ def write_joint_config_file(outputfilename, config): "tol", "gmm_random_state", "np_threshold", - "np_eventminlen"] - with open(outputfilename, 'w') as fp: + "np_eventminlen", + ] + with open(outputfilename, "w") as fp: # for k in list_argument_io: fp.write(f"{k} : {config[k]}\n") @@ -240,12 +243,14 @@ def main(argv): config = read_joint_configuration_file(template_configuration_file) for r in range(hmrf_seed_s, hmrf_seed_t): config["num_hmrf_initialization_start"] = r - config["num_hmrf_initialization_end"] = r+1 + config["num_hmrf_initialization_end"] = r + 1 write_joint_config_file(f"{outputdir}/configfile{r}", config) - + if __name__ == "__main__": if len(sys.argv) == 1: - print("python joint_allele_generateconfig.py ") + print( + "python joint_allele_generateconfig.py " + ) if len(sys.argv) > 1: - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/src/calicost/oldcode.py b/src/calicost/oldcode.py index 217dc49..88ec5fa 100644 --- a/src/calicost/oldcode.py +++ b/src/calicost/oldcode.py @@ -10,8 +10,18 @@ # M step related ############################################################ -def update_emission_params_nb_sitewise(X_nb, log_gamma, base_nb_mean, alphas, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): + +def update_emission_params_nb_sitewise( + X_nb, + log_gamma, + base_nb_mean, + alphas, + start_log_mu=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + min_log_rdr=-2, + max_log_rdr=2, +): """ Attributes ---------- @@ -32,59 +42,133 @@ def update_emission_params_nb_sitewise(X_nb, log_gamma, base_nb_mean, alphas, \ new_log_mu = np.zeros((n_states, n_spots)) new_alphas = alphas for s in range(n_spots): - idx_nonzero = 
np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero = np.where(base_nb_mean[:, s] > 0)[0] for i in range(n_states): - model = sm.GLM(X_nb[idx_nonzero,s], np.ones(len(idx_nonzero)).reshape(-1,1), \ - family=sm.families.NegativeBinomial(alpha=alphas[i,s]), \ - exposure=base_nb_mean[idx_nonzero,s], var_weights=gamma[i,idx_nonzero]+gamma[i+n_states,idx_nonzero]) + model = sm.GLM( + X_nb[idx_nonzero, s], + np.ones(len(idx_nonzero)).reshape(-1, 1), + family=sm.families.NegativeBinomial(alpha=alphas[i, s]), + exposure=base_nb_mean[idx_nonzero, s], + var_weights=gamma[i, idx_nonzero] + + gamma[i + n_states, idx_nonzero], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] # print(s, i, res.params) if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array([start_log_mu[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if -model.loglike(res.params) < -model.loglike(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array([start_log_mu[i, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if -model.loglike(res.params) < -model.loglike(res2.params) + else res2.params[0] + ) else: new_log_mu = np.zeros((n_states, n_spots)) new_alphas = np.zeros((n_states, n_spots)) if not shared_NB_dispersion: for s in range(n_spots): - idx_nonzero = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero = np.where(base_nb_mean[:, s] > 0)[0] for i in range(n_states): - model = Weighted_NegativeBinomial(X_nb[idx_nonzero,s], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=gamma[i,idx_nonzero]+gamma[i+n_states,idx_nonzero], exposure=base_nb_mean[idx_nonzero,s]) + model = Weighted_NegativeBinomial( + X_nb[idx_nonzero, s], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=gamma[i, idx_nonzero] + + gamma[i + n_states, idx_nonzero], + exposure=base_nb_mean[idx_nonzero, s], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] new_alphas[i, s] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_log_mu[i, s]], [alphas[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_alphas[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: for s in range(n_spots): - idx_nonzero = np.where(base_nb_mean[:,s] > 0)[0] - all_states_nb_mean = np.tile(base_nb_mean[idx_nonzero,s], n_states) - all_states_y = np.tile(X_nb[idx_nonzero,s], n_states) - all_states_weights = np.concatenate([gamma[i,idx_nonzero]+gamma[i+n_states,idx_nonzero] for i in range(n_states)]) - all_states_features = np.zeros((n_states*len(idx_nonzero), n_states)) + idx_nonzero = np.where(base_nb_mean[:, s] > 0)[0] + all_states_nb_mean = np.tile(base_nb_mean[idx_nonzero, s], n_states) + all_states_y = np.tile(X_nb[idx_nonzero, s], n_states) + all_states_weights = np.concatenate( + [ + gamma[i, idx_nonzero] + gamma[i + n_states, 
idx_nonzero] + for i in range(n_states) + ] + ) + all_states_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - all_states_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 - model = Weighted_NegativeBinomial(all_states_y, all_states_features, weights=all_states_weights, exposure=all_states_nb_mean) + all_states_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 + model = Weighted_NegativeBinomial( + all_states_y, + all_states_features, + weights=all_states_weights, + exposure=all_states_nb_mean, + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_log_mu[:,s] = res.params[:-1] - new_alphas[:,s] = res.params[-1] + new_log_mu[:, s] = res.params[:-1] + new_alphas[:, s] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append(start_log_mu[:,s], [alphas[0,s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[:,s] = res.params[:-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[:-1] - new_alphas[:,s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append(start_log_mu[:, s], [alphas[0, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[:, s] = ( + res.params[:-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[:-1] + ) + new_alphas[:, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas -def update_emission_params_bb_sitewise(X_bb, log_gamma, total_bb_RD, taus, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): +def update_emission_params_bb_sitewise( + X_bb, + log_gamma, + total_bb_RD, + taus, + start_p_binom=None, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + percent_threshold=0.99, + min_binom_prob=0.01, + max_binom_prob=0.99, +): """ Attributes ---------- @@ -103,64 +187,160 @@ def update_emission_params_bb_sitewise(X_bb, log_gamma, total_bb_RD, taus, \ # initialization new_p_binom = np.ones((n_states, n_spots)) * 0.5 new_taus = copy.copy(taus) - if fix_BB_dispersion: + if fix_BB_dispersion: for s in np.arange(X_bb.shape[1]): - idx_nonzero = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero = np.where(total_bb_RD[:, s] > 0)[0] for i in range(n_states): - model = Weighted_BetaBinom_fixdispersion(np.append(X_bb[idx_nonzero,s], total_bb_RD[idx_nonzero,s]-X_bb[idx_nonzero,s]), \ - np.ones(2*len(idx_nonzero)).reshape(-1,1), \ - taus[i,s], \ - weights=np.append(gamma[i,idx_nonzero], gamma[i+n_states,idx_nonzero]), \ - exposure=np.append(total_bb_RD[idx_nonzero,s], total_bb_RD[idx_nonzero,s]) ) + model = Weighted_BetaBinom_fixdispersion( + np.append( + X_bb[idx_nonzero, s], + total_bb_RD[idx_nonzero, s] - X_bb[idx_nonzero, s], + ), + np.ones(2 * len(idx_nonzero)).reshape(-1, 1), + taus[i, s], + weights=np.append( + gamma[i, idx_nonzero], gamma[i + n_states, idx_nonzero] + ), + exposure=np.append( + total_bb_RD[idx_nonzero, s], total_bb_RD[idx_nonzero, s] + ), + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array(start_p_binom[i, 
s]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array(start_p_binom[i, s]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) else: if not shared_BB_dispersion: for s in np.arange(X_bb.shape[1]): - idx_nonzero = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero = np.where(total_bb_RD[:, s] > 0)[0] for i in range(n_states): - model = Weighted_BetaBinom(np.append(X_bb[idx_nonzero,s], total_bb_RD[idx_nonzero,s]-X_bb[idx_nonzero,s]), \ - np.ones(2*len(idx_nonzero)).reshape(-1,1), \ - weights=np.append(gamma[i,idx_nonzero], gamma[i+n_states,idx_nonzero]), \ - exposure=np.append(total_bb_RD[idx_nonzero,s], total_bb_RD[idx_nonzero,s]) ) + model = Weighted_BetaBinom( + np.append( + X_bb[idx_nonzero, s], + total_bb_RD[idx_nonzero, s] - X_bb[idx_nonzero, s], + ), + np.ones(2 * len(idx_nonzero)).reshape(-1, 1), + weights=np.append( + gamma[i, idx_nonzero], gamma[i + n_states, idx_nonzero] + ), + exposure=np.append( + total_bb_RD[idx_nonzero, s], total_bb_RD[idx_nonzero, s] + ), + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] new_taus[i, s] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_taus[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: for s in np.arange(X_bb.shape[1]): - idx_nonzero = np.where(total_bb_RD[:,s] > 0)[0] - all_states_exposure = np.tile( np.append(total_bb_RD[idx_nonzero,s], total_bb_RD[idx_nonzero,s]), n_states) - all_states_y = np.tile( np.append(X_bb[idx_nonzero,s], total_bb_RD[idx_nonzero,s]-X_bb[idx_nonzero,s]), n_states) - all_states_weights = np.concatenate([ np.append(gamma[i,idx_nonzero], gamma[i+n_states,idx_nonzero]) for i in range(n_states) ]) - all_states_features = np.zeros((2*n_states*len(idx_nonzero), n_states)) + idx_nonzero = np.where(total_bb_RD[:, s] > 0)[0] + all_states_exposure = np.tile( + np.append(total_bb_RD[idx_nonzero, s], total_bb_RD[idx_nonzero, s]), + n_states, + ) + all_states_y = np.tile( + np.append( + X_bb[idx_nonzero, s], + total_bb_RD[idx_nonzero, s] - X_bb[idx_nonzero, s], + ), + n_states, + ) + all_states_weights = np.concatenate( + [ + np.append( + gamma[i, idx_nonzero], gamma[i + n_states, idx_nonzero] + ) + for i in range(n_states) + ] + ) + all_states_features = np.zeros( + (2 * n_states * len(idx_nonzero), n_states) + ) for i in np.arange(n_states): - all_states_features[(i*2*len(idx_nonzero)):((i+1)*2*len(idx_nonzero)), i] = 1 - model = Weighted_BetaBinom(all_states_y, all_states_features, weights=all_states_weights, exposure=all_states_exposure) + all_states_features[ + (i * 2 * len(idx_nonzero)) : 
((i + 1) * 2 * len(idx_nonzero)), i + ] = 1 + model = Weighted_BetaBinom( + all_states_y, + all_states_features, + weights=all_states_weights, + exposure=all_states_exposure, + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_p_binom[:,s] = res.params[:-1] - new_p_binom[new_p_binom[:,s] < min_binom_prob, s] = min_binom_prob - new_p_binom[new_p_binom[:,s] > max_binom_prob, s] = max_binom_prob + new_p_binom[:, s] = res.params[:-1] + new_p_binom[new_p_binom[:, s] < min_binom_prob, s] = min_binom_prob + new_p_binom[new_p_binom[:, s] > max_binom_prob, s] = max_binom_prob if res.params[-1] > 0: new_taus[:, s] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append(start_p_binom[:,s], [taus[0, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[:,s] = res.params[:-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[:-1] - new_p_binom[new_p_binom[:,s] < min_binom_prob, s] = min_binom_prob - new_p_binom[new_p_binom[:,s] > max_binom_prob, s] = max_binom_prob + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append(start_p_binom[:, s], [taus[0, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[:, s] = ( + res.params[:-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[:-1] + ) + new_p_binom[new_p_binom[:, s] < min_binom_prob, s] = min_binom_prob + new_p_binom[new_p_binom[:, s] > max_binom_prob, s] = max_binom_prob if res2.params[-1] > 0: - new_taus[:,s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + new_taus[:, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) return new_p_binom, new_taus - -def hmrf_log_likelihood(nodepotential, single_X, single_base_nb_mean, single_total_bb_RD, res, pred, smooth_mat, adjacency_mat, assignment, spatial_weight): +def hmrf_log_likelihood( + nodepotential, + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + pred, + smooth_mat, + adjacency_mat, + assignment, + spatial_weight, +): N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_p_binom"].shape[1] @@ -168,40 +348,98 @@ def hmrf_log_likelihood(nodepotential, single_X, single_base_nb_mean, single_tot single_llf = np.zeros((N, n_clones)) # for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] # smooth_mat can be identity matrix + idx = smooth_mat[i, :].nonzero()[1] # smooth_mat can be identity matrix for c in range(n_clones): - tmp_log_emission_rdr, tmp_log_emission_baf = compute_emission_probability_nb_betabinom_phaseswitch( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"][:,c:(c+1)], res["new_alphas"][:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"][:,c:(c+1)], res["new_taus"][:,c:(c+1)]) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + compute_emission_probability_nb_betabinom_phaseswitch( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"][:, c : (c + 1)], + res["new_alphas"][:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"][:, c : (c + 1)], + res["new_taus"][:, c : (c + 1)], + ) + ) # if nodepotential == "weighted_sum": - if np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) > 0 and np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) > 0: - 
ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:,:, 0] + res["log_gamma"][:,:,c], axis=0) ) + np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:,:, 0] + res["log_gamma"][:,:,c], axis=0) ) + if ( + np.sum(np.sum(single_base_nb_mean[:, idx], axis=1) > 0) > 0 + and np.sum(np.sum(single_total_bb_RD[:, idx], axis=1) > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(np.sum(single_total_bb_RD[:, idx], axis=1) > 0) + / np.sum(np.sum(single_base_nb_mean[:, idx], axis=1) > 0) + ) + single_llf[i, c] = ratio_nonzeros * np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) else: - single_llf[i,c] = np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + single_llf[i, c] = np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) else: - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(np.sum(single_total_bb_RD[:, idx], axis=1) > 0) + / np.sum(np.sum(single_base_nb_mean[:, idx], axis=1) > 0) + ) + single_llf[i, c] = ratio_nonzeros * np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0]) else: - single_llf[i,c] = np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) + single_llf[i, c] = np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0]) # # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(assignment[adjacency_mat[i,:].nonzero()[1]] == assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum(assignment[adjacency_mat[i, :].nonzero()[1]] == assignment[i]) + ) return total_llf -def hmrf_reassignment_compositehmm(single_X, single_base_nb_mean, single_total_bb_RD, res, pred, adjacency_mat, prev_assignment, spatial_weight): +def hmrf_reassignment_compositehmm( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + pred, + adjacency_mat, + prev_assignment, + spatial_weight, +): # basic dimension info N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = np.max(prev_assignment) + 1 n_individual_states = int(len(res["new_p_binom"]) / 2.0) n_composite_states = int(len(res["state_tuples"]) / 2.0) - + # initialize result vector single_llf = np.zeros((N, n_clones)) new_assignment = 
copy.copy(prev_assignment) @@ -209,153 +447,350 @@ def hmrf_reassignment_compositehmm(single_X, single_base_nb_mean, single_total_b # re-assign by HMRF for i in trange(N): # log emission probability of each composite state, matrix size (2*n_composite_states, n_obs) - tmp_log_emission = compute_emission_probability_nb_betabinom_composite(single_X[:,:,i:(i+1)], res["state_tuples"], \ - single_base_nb_mean[:,i:(i+1)], res["new_log_mu"], res["new_alphas"], single_total_bb_RD[:,i:(i+1)], \ - res["new_p_binom"], res["new_taus"], res["new_scalefactors"]) + tmp_log_emission = compute_emission_probability_nb_betabinom_composite( + single_X[:, :, i : (i + 1)], + res["state_tuples"], + single_base_nb_mean[:, i : (i + 1)], + res["new_log_mu"], + res["new_alphas"], + single_total_bb_RD[:, i : (i + 1)], + res["new_p_binom"], + res["new_taus"], + res["new_scalefactors"], + ) for c in range(n_clones): - single_llf[i,c] = np.sum(tmp_log_emission[pred[(c*n_obs):(c*n_obs+n_obs)], np.arange(n_obs)]) + single_llf[i, c] = np.sum( + tmp_log_emission[ + pred[(c * n_obs) : (c * n_obs + n_obs)], np.arange(n_obs) + ] + ) # node potential - w_node = single_llf[i,:] + w_node = single_llf[i, :] # edge potential w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: # w_edge[new_assignment[j]] += 1 - w_edge[new_assignment[j]] += adjacency_mat[i,j] + w_edge[new_assignment[j]] += adjacency_mat[i, j] # combine both potential for the new assignment - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) - + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) + # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) return new_assignment, single_llf, total_llf - def allele_starch_combine_clones(): - res_combine = {"new_assignment":np.zeros(single_X.shape[2], dtype=int)} + res_combine = {"new_assignment": np.zeros(single_X.shape[2], dtype=int)} offset_clone = 0 for bafc in range(n_baf_clones): prefix = f"clone{bafc}" - allres = dict( np.load(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", allow_pickle=True) ) + allres = dict( + np.load( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) r = allres["num_iterations"] - 1 - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} - idx_spots = np.where(adata.obs.index.isin( allres["barcodes"] ))[0] + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], 
+ "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } + idx_spots = np.where(adata.obs.index.isin(allres["barcodes"]))[0] n_obs = single_X.shape[0] if len(np.unique(res["new_assignment"])) == 1: n_merged_clones = 1 c = res["new_assignment"][0] merged_res = copy.copy(res) merged_res["new_assignment"] = np.zeros(len(idx_spots), dtype=int) - log_gamma = res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)].reshape((2*config["n_states"], n_obs, 1)) - pred_cnv = res["pred_cnv"][ (c*n_obs):(c*n_obs+n_obs) ].reshape((-1,1)) + log_gamma = res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)].reshape( + (2 * config["n_states"], n_obs, 1) + ) + pred_cnv = res["pred_cnv"][(c * n_obs) : (c * n_obs + n_obs)].reshape( + (-1, 1) + ) else: if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(res["new_assignment"]==c)[0] for c in range(n_clones_rdr)]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(res["new_assignment"] == c)[0] + for c in range(n_clones_rdr) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(res["new_assignment"]==c)[0] for c in range(n_clones_rdr)], single_tumor_prop[idx_spots]) - merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, params="smp", tumor_prop=tumor_prop) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(res["new_assignment"] == c)[0] + for c in range(n_clones_rdr) + ], + single_tumor_prop[idx_spots], + ) + ) + merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson( + X, base_nb_mean, total_bb_RD, res, params="smp", tumor_prop=tumor_prop + ) print(f"part {bafc} merging_groups: {merging_groups}") # if config["tumorprop_file"] is None: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], res, min_spots_thresholds=50) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], res, min_spots_thresholds=50 + ) else: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], res, min_spots_thresholds=50, single_tumor_prop=single_tumor_prop[idx_spots]) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + res, + min_spots_thresholds=50, + single_tumor_prop=single_tumor_prop[idx_spots], + ) # compute posterior using the newly merged pseudobulk n_merged_clones = len(merging_groups) tmp = copy.copy(merged_res["new_assignment"]) if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(merged_res["new_assignment"]==c)[0] for c in range(n_merged_clones)]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + 
single_total_bb_RD[:, idx_spots], + [ + np.where(merged_res["new_assignment"] == c)[0] + for c in range(n_merged_clones) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(merged_res["new_assignment"]==c)[0] for c in range(n_merged_clones)], single_tumor_prop[idx_spots]) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(merged_res["new_assignment"] == c)[0] + for c in range(n_merged_clones) + ], + single_tumor_prop[idx_spots], + ) + ) # - merged_res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), config["n_states"], \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), tumor_prop, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, init_log_mu=res["new_log_mu"], init_p_binom=res["new_p_binom"], init_alphas=res["new_alphas"], init_taus=res["new_taus"], max_iter=config["max_iter"], tol=config["tol"]) + merged_res = pipeline_baum_welch( + None, + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + np.tile(lengths, X.shape[2]), + config["n_states"], + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + np.tile(log_sitewise_transmat, X.shape[2]), + tumor_prop, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + init_log_mu=res["new_log_mu"], + init_p_binom=res["new_p_binom"], + init_alphas=res["new_alphas"], + init_taus=res["new_taus"], + max_iter=config["max_iter"], + tol=config["tol"], + ) merged_res["new_assignment"] = copy.copy(tmp) - log_gamma = np.stack([ merged_res["log_gamma"][:,(c*n_obs):(c*n_obs+n_obs)] for c in range(n_merged_clones) ], axis=-1) - pred_cnv = np.vstack([ merged_res["pred_cnv"][(c*n_obs):(c*n_obs+n_obs)] for c in range(n_merged_clones) ]).T + log_gamma = np.stack( + [ + merged_res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)] + for c in range(n_merged_clones) + ], + axis=-1, + ) + pred_cnv = np.vstack( + [ + merged_res["pred_cnv"][(c * n_obs) : (c * n_obs + n_obs)] + for c in range(n_merged_clones) + ] + ).T # add to res_combine if len(res_combine) == 1: - res_combine.update({"new_log_mu":np.hstack([ merged_res["new_log_mu"] ] * n_merged_clones), "new_alphas":np.hstack([ merged_res["new_alphas"] ] * n_merged_clones), \ - "new_p_binom":np.hstack([ merged_res["new_p_binom"] ] * n_merged_clones), "new_taus":np.hstack([ merged_res["new_taus"] ] * n_merged_clones), \ - "log_gamma":log_gamma, "pred_cnv":pred_cnv}) + res_combine.update( + { + "new_log_mu": np.hstack( + [merged_res["new_log_mu"]] * n_merged_clones + ), + "new_alphas": np.hstack( + [merged_res["new_alphas"]] * n_merged_clones + ), + "new_p_binom": np.hstack( + [merged_res["new_p_binom"]] * 
n_merged_clones + ), + "new_taus": np.hstack([merged_res["new_taus"]] * n_merged_clones), + "log_gamma": log_gamma, + "pred_cnv": pred_cnv, + } + ) else: - res_combine.update({"new_log_mu":np.hstack([res_combine["new_log_mu"]] + [ merged_res["new_log_mu"] ] * n_merged_clones), "new_alphas":np.hstack([res_combine["new_alphas"]] + [ merged_res["new_alphas"] ] * n_merged_clones), \ - "new_p_binom":np.hstack([res_combine["new_p_binom"]] + [ merged_res["new_p_binom"] ] * n_merged_clones), "new_taus":np.hstack([res_combine["new_taus"]] + [ merged_res["new_taus"] ] * n_merged_clones), \ - "log_gamma":np.dstack([res_combine["log_gamma"], log_gamma ]), "pred_cnv":np.hstack([res_combine["pred_cnv"], pred_cnv])}) - res_combine["new_assignment"][idx_spots] = merged_res["new_assignment"] + offset_clone + res_combine.update( + { + "new_log_mu": np.hstack( + [res_combine["new_log_mu"]] + + [merged_res["new_log_mu"]] * n_merged_clones + ), + "new_alphas": np.hstack( + [res_combine["new_alphas"]] + + [merged_res["new_alphas"]] * n_merged_clones + ), + "new_p_binom": np.hstack( + [res_combine["new_p_binom"]] + + [merged_res["new_p_binom"]] * n_merged_clones + ), + "new_taus": np.hstack( + [res_combine["new_taus"]] + + [merged_res["new_taus"]] * n_merged_clones + ), + "log_gamma": np.dstack([res_combine["log_gamma"], log_gamma]), + "pred_cnv": np.hstack([res_combine["pred_cnv"], pred_cnv]), + } + ) + res_combine["new_assignment"][idx_spots] = ( + merged_res["new_assignment"] + offset_clone + ) offset_clone += n_merged_clones # compute HMRF log likelihood - total_llf = hmrf_log_likelihood(config["nodepotential"], single_X, single_base_nb_mean, single_total_bb_RD, res_combine, np.argmax(res_combine["log_gamma"],axis=0), smooth_mat, adjacency_mat, res_combine["new_assignment"], config["spatial_weight"]) + total_llf = hmrf_log_likelihood( + config["nodepotential"], + single_X, + single_base_nb_mean, + single_total_bb_RD, + res_combine, + np.argmax(res_combine["log_gamma"], axis=0), + smooth_mat, + adjacency_mat, + res_combine["new_assignment"], + config["spatial_weight"], + ) res_combine["total_llf"] = total_llf # save results - np.savez(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", **res_combine) - + np.savez( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", **res_combine + ) def simplify_parameters(res, params="smp", bafthreshold=0.05, rdrthreshold=0.1): n_states = res["new_p_binom"].shape[0] G = nx.Graph() - G.add_nodes_from( np.arange(n_states) ) - mAF = np.where(res["new_p_binom"].flatten() < 0.5, res["new_p_binom"].flatten(), 1-res["new_p_binom"].flatten()) + G.add_nodes_from(np.arange(n_states)) + mAF = np.where( + res["new_p_binom"].flatten() < 0.5, + res["new_p_binom"].flatten(), + 1 - res["new_p_binom"].flatten(), + ) if "m" in params and "p" in params: - tmp_edge_graph = (np.abs( res["new_log_mu"].flatten().reshape(-1,1) - res["new_log_mu"].flatten().reshape(1,-1) ) < rdrthreshold) & (np.abs( mAF.reshape(-1,1) - mAF.reshape(1,-1) ) < bafthreshold) + tmp_edge_graph = ( + np.abs( + res["new_log_mu"].flatten().reshape(-1, 1) + - res["new_log_mu"].flatten().reshape(1, -1) + ) + < rdrthreshold + ) & (np.abs(mAF.reshape(-1, 1) - mAF.reshape(1, -1)) < bafthreshold) elif "m" in params: - tmp_edge_graph = (np.abs( res["new_log_mu"].flatten().reshape(-1,1) - res["new_log_mu"].flatten().reshape(1,-1) ) < rdrthreshold) + tmp_edge_graph = ( + np.abs( + res["new_log_mu"].flatten().reshape(-1, 1) + - res["new_log_mu"].flatten().reshape(1, -1) + ) + < rdrthreshold + ) else: - 
tmp_edge_graph = (np.abs( mAF.reshape(-1,1) - mAF.reshape(1,-1) ) < bafthreshold) - G.add_edges_from([ (i,j) for i in range(tmp_edge_graph.shape[0]) for j in range(tmp_edge_graph.shape[1]) if tmp_edge_graph[i,j] ]) + tmp_edge_graph = np.abs(mAF.reshape(-1, 1) - mAF.reshape(1, -1)) < bafthreshold + G.add_edges_from( + [ + (i, j) + for i in range(tmp_edge_graph.shape[0]) + for j in range(tmp_edge_graph.shape[1]) + if tmp_edge_graph[i, j] + ] + ) # maximal cliques cliques = [] for x in nx.find_cliques(G): this_len = len(x) - cliques.append( (x, this_len) ) - cliques.sort(key = lambda x:(-x[1]) ) + cliques.append((x, this_len)) + cliques.sort(key=lambda x: (-x[1])) covered_states = set() merging_state_groups = [] for c in cliques: if len(set(c[0]) & covered_states) == 0: - merging_state_groups.append( list(c[0]) ) + merging_state_groups.append(list(c[0])) covered_states = covered_states | set(c[0]) for c in range(n_states): if not (c in covered_states): - merging_state_groups.append( [c] ) + merging_state_groups.append([c]) covered_states.add(c) - merging_state_groups.sort(key = lambda x:np.min(x)) + merging_state_groups.sort(key=lambda x: np.min(x)) # merged parameters - simplied_res = {"new_log_mu":np.array([ np.mean(res["new_log_mu"].flatten()[idx]) for idx in merging_state_groups]).reshape(-1,1), \ - "new_p_binom":np.array([ np.mean(res["new_p_binom"].flatten()[idx]) for idx in merging_state_groups]).reshape(-1,1), \ - "new_alphas":np.array([ np.mean(res["new_alphas"].flatten()[idx]) for idx in merging_state_groups]).reshape(-1,1), \ - "new_taus":np.array([ np.mean(res["new_taus"].flatten()[idx]) for idx in merging_state_groups]).reshape(-1,1)} + simplied_res = { + "new_log_mu": np.array( + [np.mean(res["new_log_mu"].flatten()[idx]) for idx in merging_state_groups] + ).reshape(-1, 1), + "new_p_binom": np.array( + [np.mean(res["new_p_binom"].flatten()[idx]) for idx in merging_state_groups] + ).reshape(-1, 1), + "new_alphas": np.array( + [np.mean(res["new_alphas"].flatten()[idx]) for idx in merging_state_groups] + ).reshape(-1, 1), + "new_taus": np.array( + [np.mean(res["new_taus"].flatten()[idx]) for idx in merging_state_groups] + ).reshape(-1, 1), + } return simplied_res def similarity_components_baf(baf_profiles, res, topk=10, threshold=0.05): n_clones = baf_profiles.shape[0] - adj_baf_profiles = np.where(baf_profiles > 0.5, 1-baf_profiles, baf_profiles) + adj_baf_profiles = np.where(baf_profiles > 0.5, 1 - baf_profiles, baf_profiles) G = nx.Graph() - G.add_nodes_from( np.arange(n_clones) ) + G.add_nodes_from(np.arange(n_clones)) for c1 in range(n_clones): - for c2 in range(c1+1, n_clones): - diff = np.sort(np.abs(baf_profiles[c1,:] - baf_profiles[c2,:]))[::-1][topk] - adj_diff = np.sort(np.abs(adj_baf_profiles[c1,:] - adj_baf_profiles[c2,:]))[::-1][topk] - if diff < 2*threshold and adj_diff < threshold: + for c2 in range(c1 + 1, n_clones): + diff = np.sort(np.abs(baf_profiles[c1, :] - baf_profiles[c2, :]))[::-1][ + topk + ] + adj_diff = np.sort( + np.abs(adj_baf_profiles[c1, :] - adj_baf_profiles[c2, :]) + )[::-1][topk] + if diff < 2 * threshold and adj_diff < threshold: G.add_edge(c1, c2) print(c1, c2, diff) merging_groups = [cc for cc in nx.connected_components(G)] - merging_groups.sort(key = lambda x:np.min(x)) + merging_groups.sort(key=lambda x: np.min(x)) # clone assignment after merging map_clone_id = {} - for i,x in enumerate(merging_groups): + for i, x in enumerate(merging_groups): for z in x: map_clone_id[z] = i new_assignment = np.array([map_clone_id[x] for x in 
res["new_assignment"]]) @@ -365,24 +800,36 @@ def similarity_components_baf(baf_profiles, res, topk=10, threshold=0.05): return merging_groups, merged_res -def similarity_components_rdrbaf(baf_profiles, rdr_profiles, res, topk=10, bafthreshold=0.05, rdrthreshold=0.1): -# def similarity_components_rdrbaf(baf_profiles, rdr_profiles, res, topk=10, bafthreshold=0.05, rdrthreshold=0.15): +def similarity_components_rdrbaf( + baf_profiles, rdr_profiles, res, topk=10, bafthreshold=0.05, rdrthreshold=0.1 +): + # def similarity_components_rdrbaf(baf_profiles, rdr_profiles, res, topk=10, bafthreshold=0.05, rdrthreshold=0.15): n_clones = baf_profiles.shape[0] - adj_baf_profiles = np.where(baf_profiles > 0.5, 1-baf_profiles, baf_profiles) + adj_baf_profiles = np.where(baf_profiles > 0.5, 1 - baf_profiles, baf_profiles) G = nx.Graph() - G.add_nodes_from( np.arange(n_clones) ) + G.add_nodes_from(np.arange(n_clones)) for c1 in range(n_clones): - for c2 in range(c1+1, n_clones): - baf_diff = np.sort(np.abs(baf_profiles[c1,:] - baf_profiles[c2,:]))[::-1][topk] - baf_adj_diff = np.sort(np.abs(adj_baf_profiles[c1,:] - adj_baf_profiles[c2,:]))[::-1][topk] - rdr_diff = np.sort(np.abs(rdr_profiles[c1,:] - rdr_profiles[c2,:]))[::-1][topk] - if baf_diff < 2*bafthreshold and baf_adj_diff < bafthreshold and rdr_diff < rdrthreshold: + for c2 in range(c1 + 1, n_clones): + baf_diff = np.sort(np.abs(baf_profiles[c1, :] - baf_profiles[c2, :]))[::-1][ + topk + ] + baf_adj_diff = np.sort( + np.abs(adj_baf_profiles[c1, :] - adj_baf_profiles[c2, :]) + )[::-1][topk] + rdr_diff = np.sort(np.abs(rdr_profiles[c1, :] - rdr_profiles[c2, :]))[::-1][ + topk + ] + if ( + baf_diff < 2 * bafthreshold + and baf_adj_diff < bafthreshold + and rdr_diff < rdrthreshold + ): G.add_edge(c1, c2) merging_groups = [cc for cc in nx.connected_components(G)] - merging_groups.sort(key = lambda x:np.min(x)) + merging_groups.sort(key=lambda x: np.min(x)) # clone assignment after merging map_clone_id = {} - for i,x in enumerate(merging_groups): + for i, x in enumerate(merging_groups): for z in x: map_clone_id[z] = i new_assignment = np.array([map_clone_id[x] for x in res["new_assignment"]]) @@ -392,81 +839,166 @@ def similarity_components_rdrbaf(baf_profiles, rdr_profiles, res, topk=10, bafth return merging_groups, merged_res -def initialization_rdr_bybaf(n_states, X, base_nb_mean, total_bb_RD, params, prior_p_binom, random_state=None, in_log_space=True): - tmp_log_mu, tmp_p_binom = initialization_by_gmm(n_states, X, base_nb_mean, total_bb_RD, params, random_state=random_state, in_log_space=in_log_space, min_binom_prob=0, max_binom_prob=1) +def initialization_rdr_bybaf( + n_states, + X, + base_nb_mean, + total_bb_RD, + params, + prior_p_binom, + random_state=None, + in_log_space=True, +): + tmp_log_mu, tmp_p_binom = initialization_by_gmm( + n_states, + X, + base_nb_mean, + total_bb_RD, + params, + random_state=random_state, + in_log_space=in_log_space, + min_binom_prob=0, + max_binom_prob=1, + ) prior_log_mu = np.zeros(prior_p_binom.shape) - for i,x in enumerate(prior_p_binom): - idx_nearest = np.argmin( scipy.spatial.distance.cdist(x.reshape(-1,1), tmp_p_binom) ) + for i, x in enumerate(prior_p_binom): + idx_nearest = np.argmin( + scipy.spatial.distance.cdist(x.reshape(-1, 1), tmp_p_binom) + ) prior_log_mu[i] = tmp_log_mu[idx_nearest] return prior_log_mu - def output_integer_CN(): ##### infer integer copy ##### - res_combine = dict(np.load(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True)) + res_combine = dict( + np.load( 
+ f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) n_final_clone = len(np.unique(res_combine["new_assignment"])) medfix = ["", "_diploid", "_triploid", "_tetraploid"] - for o,max_medploidy in enumerate([None, 2, 3, 4]): + for o, max_medploidy in enumerate([None, 2, 3, 4]): # A/B copy number per bin A_copy = np.zeros((n_final_clone, n_obs), dtype=int) B_copy = np.zeros((n_final_clone, n_obs), dtype=int) # A/B copy number per state - state_A_copy = np.zeros((n_final_clone, config['n_states']), dtype=int) - state_B_copy = np.zeros((n_final_clone, config['n_states']), dtype=int) + state_A_copy = np.zeros((n_final_clone, config["n_states"]), dtype=int) + state_B_copy = np.zeros((n_final_clone, config["n_states"]), dtype=int) df_genelevel_cnv = None if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res_combine["new_assignment"]==c)[0] for c in range(n_final_clone)]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res_combine["new_assignment"] == c)[0] + for c in range(n_final_clone) + ], + ) else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res_combine["new_assignment"]==c)[0] for c in range(n_final_clone)], single_tumor_prop) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res_combine["new_assignment"] == c)[0] + for c in range(n_final_clone) + ], + single_tumor_prop, + ) for s in range(n_final_clone): # adjust log_mu such that sum_bin lambda * np.exp(log_mu) = 1 - lambd = base_nb_mean[:,s] / np.sum(base_nb_mean[:,s]) - this_pred_cnv = res_combine["pred_cnv"][:,s] - adjusted_log_mu = np.log( np.exp(res_combine["new_log_mu"][:,s]) / np.sum(np.exp(res_combine["new_log_mu"][this_pred_cnv,s]) * lambd) ) + lambd = base_nb_mean[:, s] / np.sum(base_nb_mean[:, s]) + this_pred_cnv = res_combine["pred_cnv"][:, s] + adjusted_log_mu = np.log( + np.exp(res_combine["new_log_mu"][:, s]) + / np.sum(np.exp(res_combine["new_log_mu"][this_pred_cnv, s]) * lambd) + ) if not max_medploidy is None: - best_integer_copies, _ = hill_climbing_integer_copynumber_oneclone(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, max_medploidy=max_medploidy) + best_integer_copies, _ = hill_climbing_integer_copynumber_oneclone( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + max_medploidy=max_medploidy, + ) else: - best_integer_copies, _ = hill_climbing_integer_copynumber_oneclone(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv) - print(f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}") - - A_copy[s,:] = best_integer_copies[res_combine["pred_cnv"][:,s], 0] - B_copy[s,:] = best_integer_copies[res_combine["pred_cnv"][:,s], 1] - state_A_copy[s,:] = best_integer_copies[:,0] - state_B_copy[s,:] = best_integer_copies[:,1] - tmpdf = get_genelevel_cnv_oneclone(best_integer_copies[res_combine["pred_cnv"][:,s], 0], best_integer_copies[res_combine["pred_cnv"][:,s], 1], x_gene_list) + best_integer_copies, _ = hill_climbing_integer_copynumber_oneclone( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + ) + print( + f"max med ploidy = 
{max_medploidy}, clone {s}, integer copy inference loss = {_}" + ) + + A_copy[s, :] = best_integer_copies[res_combine["pred_cnv"][:, s], 0] + B_copy[s, :] = best_integer_copies[res_combine["pred_cnv"][:, s], 1] + state_A_copy[s, :] = best_integer_copies[:, 0] + state_B_copy[s, :] = best_integer_copies[:, 1] + tmpdf = get_genelevel_cnv_oneclone( + best_integer_copies[res_combine["pred_cnv"][:, s], 0], + best_integer_copies[res_combine["pred_cnv"][:, s], 1], + x_gene_list, + ) tmpdf.columns = [f"clone{s} A", f"clone{s} B"] if df_genelevel_cnv is None: df_genelevel_cnv = copy.copy(tmpdf) else: df_genelevel_cnv = df_genelevel_cnv.join(tmpdf) # output gene-level copy number - df_genelevel_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_genelevel.tsv", header=True, index=True, sep="\t") + df_genelevel_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_genelevel.tsv", header=True, index=True, sep="\t" + ) # output segment-level copy number - df_seglevel_cnv = pd.DataFrame({"CHR":[x[0] for x in sorted_chr_pos], "START":[x[1] for x in sorted_chr_pos], \ - "END":[ (sorted_chr_pos[i+1][1] if i+1 < len(sorted_chr_pos) and x[0]==sorted_chr_pos[i+1][0] else -1) for i,x in enumerate(sorted_chr_pos)] }) + df_seglevel_cnv = pd.DataFrame( + { + "CHR": [x[0] for x in sorted_chr_pos], + "START": [x[1] for x in sorted_chr_pos], + "END": [ + ( + sorted_chr_pos[i + 1][1] + if i + 1 < len(sorted_chr_pos) + and x[0] == sorted_chr_pos[i + 1][0] + else -1 + ) + for i, x in enumerate(sorted_chr_pos) + ], + } + ) for s in range(n_final_clone): - df_seglevel_cnv[f"clone{s} A"] = A_copy[s,:] - df_seglevel_cnv[f"clone{s} B"] = B_copy[s,:] - df_seglevel_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_seglevel.tsv", header=True, index=False, sep="\t") + df_seglevel_cnv[f"clone{s} A"] = A_copy[s, :] + df_seglevel_cnv[f"clone{s} B"] = B_copy[s, :] + df_seglevel_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_seglevel.tsv", header=True, index=False, sep="\t" + ) # output per-state copy number df_state_cnv = {} for s in range(n_final_clone): - df_state_cnv[f"clone{s} logmu"] = res_combine["new_log_mu"][:,s] - df_state_cnv[f"clone{s} p"] = res_combine["new_p_binom"][:,s] - df_state_cnv[f"clone{s} A"] = state_A_copy[s,:] - df_state_cnv[f"clone{s} B"] = state_B_copy[s,:] + df_state_cnv[f"clone{s} logmu"] = res_combine["new_log_mu"][:, s] + df_state_cnv[f"clone{s} p"] = res_combine["new_p_binom"][:, s] + df_state_cnv[f"clone{s} A"] = state_A_copy[s, :] + df_state_cnv[f"clone{s} B"] = state_B_copy[s, :] df_state_cnv = pd.DataFrame.from_dict(df_state_cnv) - df_state_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_perstate.tsv", header=True, index=False, sep="\t") - + df_state_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_perstate.tsv", header=True, index=False, sep="\t" + ) + ##### output clone label ##### adata.obs["clone_label"] = res_combine["new_assignment"] if config["tumorprop_file"] is None: - adata.obs[["clone_label"]].to_csv(f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t") + adata.obs[["clone_label"]].to_csv( + f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t" + ) else: - adata.obs[["tumor_proportion", "clone_label"]].to_csv(f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t") + adata.obs[["tumor_proportion", "clone_label"]].to_csv( + f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t" + ) def set_bin_exp_to_zero(): @@ -475,12 +1007,18 @@ def set_bin_exp_to_zero(): N_STEP = 2 multi_step_smooth = copy.copy(smooth_mat) for _ in range(N_STEP): - multi_step_smooth = (multi_step_smooth + multi_step_smooth @ smooth_mat) + 
multi_step_smooth = multi_step_smooth + multi_step_smooth @ smooth_mat multi_step_smooth = (multi_step_smooth > 0).astype(int) - rdr = (copy_single_X_rdr @ multi_step_smooth) / (copy_single_base_nb_mean @ multi_step_smooth) - rdr[np.sum(copy_single_base_nb_mean,axis=1) == 0] = 0 - bidx_inconfident = np.where(~np.all(rdr <= MAX_RDR, axis=1))[0] + rdr = (copy_single_X_rdr @ multi_step_smooth) / ( + copy_single_base_nb_mean @ multi_step_smooth + ) + rdr[np.sum(copy_single_base_nb_mean, axis=1) == 0] = 0 + bidx_inconfident = np.where(~np.all(rdr <= MAX_RDR, axis=1))[0] rdr_normal[bidx_inconfident] = 0 rdr_normal = rdr_normal / np.sum(rdr_normal) - copy_single_X_rdr[bidx_inconfident, :] = 0 # avoid ill-defined distributions if normal has 0 count in that bin. - copy_single_base_nb_mean = rdr_normal.reshape(-1,1) @ np.sum(copy_single_X_rdr, axis=0).reshape(1,-1) + copy_single_X_rdr[bidx_inconfident, :] = ( + 0 # avoid ill-defined distributions if normal has 0 count in that bin. + ) + copy_single_base_nb_mean = rdr_normal.reshape(-1, 1) @ np.sum( + copy_single_X_rdr, axis=0 + ).reshape(1, -1) diff --git a/src/calicost/parse_input.py b/src/calicost/parse_input.py index 9bdd862..2585923 100644 --- a/src/calicost/parse_input.py +++ b/src/calicost/parse_input.py @@ -7,7 +7,12 @@ import scanpy as sc import anndata import logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) logger = logging.getLogger() import copy from pathlib import Path @@ -20,21 +25,40 @@ def genesnp_to_bininfo(df_gene_snp): - table_bininfo = df_gene_snp[~df_gene_snp.bin_id.isnull()].groupby('bin_id').agg({"CHR":'first', 'START':'first', 'END':'last', 'gene':set, 'snp_id':set}).reset_index() - table_bininfo['ARM'] = '.' - table_bininfo['INCLUDED_GENES'] = [ " ".join([x for x in y if not x is None]) for y in table_bininfo.gene.values ] - table_bininfo['INCLUDED_SNP_IDS'] = [ " ".join([x for x in y if not x is None]) for y in table_bininfo.snp_id.values ] - table_bininfo['NORMAL_COUNT'] = np.nan - table_bininfo['N_SNPS'] = [ len([x for x in y if not x is None]) for y in table_bininfo.snp_id.values ] + table_bininfo = ( + df_gene_snp[~df_gene_snp.bin_id.isnull()] + .groupby("bin_id") + .agg( + { + "CHR": "first", + "START": "first", + "END": "last", + "gene": set, + "snp_id": set, + } + ) + .reset_index() + ) + table_bininfo["ARM"] = "." + table_bininfo["INCLUDED_GENES"] = [ + " ".join([x for x in y if not x is None]) for y in table_bininfo.gene.values + ] + table_bininfo["INCLUDED_SNP_IDS"] = [ + " ".join([x for x in y if not x is None]) for y in table_bininfo.snp_id.values + ] + table_bininfo["NORMAL_COUNT"] = np.nan + table_bininfo["N_SNPS"] = [ + len([x for x in y if not x is None]) for y in table_bininfo.snp_id.values + ] # drop the set columns - table_bininfo.drop(columns=['gene', 'snp_id'], inplace=True) + table_bininfo.drop(columns=["gene", "snp_id"], inplace=True) return table_bininfo def parse_visium(config): """ Read multiple 10X Visium SRT samples and SNP data and generate tables with counts and meta info. - + Attributes: ---------- config : dictionary @@ -61,18 +85,41 @@ def parse_visium(config): KNN smoothing matrix. 
""" if "input_filelist" in config: - adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, across_slice_adjacency_mat = load_joint_data(config["input_filelist"], config["snp_dir"], config["alignment_files"], config["filtergenelist_file"], config["filterregion_file"], config["normalidx_file"], config['min_snpumi_perspot'], config['min_percent_expressed_spots']) + ( + adata, + cell_snp_Aallele, + cell_snp_Ballele, + unique_snp_ids, + across_slice_adjacency_mat, + ) = load_joint_data( + config["input_filelist"], + config["snp_dir"], + config["alignment_files"], + config["filtergenelist_file"], + config["filterregion_file"], + config["normalidx_file"], + config["min_snpumi_perspot"], + config["min_percent_expressed_spots"], + ) sample_list = [adata.obs["sample"][0]] for i in range(1, adata.shape[0]): if adata.obs["sample"][i] != sample_list[-1]: - sample_list.append( adata.obs["sample"][i] ) + sample_list.append(adata.obs["sample"][i]) # convert sample name to index sample_ids = np.zeros(adata.shape[0], dtype=int) - for s,sname in enumerate(sample_list): + for s, sname in enumerate(sample_list): index = np.where(adata.obs["sample"] == sname)[0] sample_ids[index] = s else: - adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids = load_data(config["spaceranger_dir"], config["snp_dir"], config["filtergenelist_file"], config["filterregion_file"], config["normalidx_file"], config['min_snpumi_perspot'], config['min_percent_expressed_spots']) + adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids = load_data( + config["spaceranger_dir"], + config["snp_dir"], + config["filtergenelist_file"], + config["filterregion_file"], + config["normalidx_file"], + config["min_snpumi_perspot"], + config["min_percent_expressed_spots"], + ) adata.obs["sample"] = "unique_sample" sample_list = [adata.obs["sample"][0]] sample_ids = np.zeros(adata.shape[0], dtype=int) @@ -81,38 +128,108 @@ def parse_visium(config): coords = adata.obsm["X_pos"] if not config["tumorprop_file"] is None: - df_tumorprop = pd.read_csv(config["tumorprop_file"], sep="\t", header=0, index_col=0) + df_tumorprop = pd.read_csv( + config["tumorprop_file"], sep="\t", header=0, index_col=0 + ) df_tumorprop = df_tumorprop[["Tumor"]] df_tumorprop.columns = ["tumor_proportion"] adata.obs = adata.obs.join(df_tumorprop) single_tumor_prop = adata.obs["tumor_proportion"] else: single_tumor_prop = None - + # read original data - df_gene_snp = combine_gene_snps(unique_snp_ids, config['hgtable_file'], adata) - df_gene_snp = create_haplotype_block_ranges(df_gene_snp, adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids) - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_blocks(df_gene_snp, \ - adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) + df_gene_snp = combine_gene_snps(unique_snp_ids, config["hgtable_file"], adata) + df_gene_snp = create_haplotype_block_ranges( + df_gene_snp, adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids + ) + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + ) = summarize_counts_for_blocks( + df_gene_snp, + adata, + cell_snp_Aallele, + cell_snp_Ballele, + unique_snp_ids, + nu=config["nu"], + logphase_shift=config["logphase_shift"], + geneticmap_file=config["geneticmap_file"], + ) # infer an initial phase using pseudobulk if not Path(f"{config['output_dir']}/initial_phase.npz").exists(): - 
initial_clone_for_phasing = perform_partition(coords, sample_ids, x_part=config["npart_phasing"], y_part=config["npart_phasing"], single_tumor_prop=single_tumor_prop, threshold=config["tumorprop_threshold"]) - phase_indicator, refined_lengths = initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_for_phasing, 5, log_sitewise_transmat, \ - "sp", config["t_phaseing"], config["gmm_random_state"], config["fix_NB_dispersion"], config["shared_NB_dispersion"], config["fix_BB_dispersion"], config["shared_BB_dispersion"], 30, 1e-3, threshold=config["tumorprop_threshold"]) - np.savez(f"{config['output_dir']}/initial_phase.npz", phase_indicator=phase_indicator, refined_lengths=refined_lengths) + initial_clone_for_phasing = perform_partition( + coords, + sample_ids, + x_part=config["npart_phasing"], + y_part=config["npart_phasing"], + single_tumor_prop=single_tumor_prop, + threshold=config["tumorprop_threshold"], + ) + phase_indicator, refined_lengths = initial_phase_given_partition( + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + initial_clone_for_phasing, + 5, + log_sitewise_transmat, + "sp", + config["t_phaseing"], + config["gmm_random_state"], + config["fix_NB_dispersion"], + config["shared_NB_dispersion"], + config["fix_BB_dispersion"], + config["shared_BB_dispersion"], + 30, + 1e-3, + threshold=config["tumorprop_threshold"], + ) + np.savez( + f"{config['output_dir']}/initial_phase.npz", + phase_indicator=phase_indicator, + refined_lengths=refined_lengths, + ) # map phase indicator to individual snps - df_gene_snp['phase'] = np.where(df_gene_snp.snp_id.isnull(), None, df_gene_snp.block_id.map({i:x for i,x in enumerate(phase_indicator)}) ) + df_gene_snp["phase"] = np.where( + df_gene_snp.snp_id.isnull(), + None, + df_gene_snp.block_id.map({i: x for i, x in enumerate(phase_indicator)}), + ) else: tmp = dict(np.load(f"{config['output_dir']}/initial_phase.npz")) - phase_indicator, refined_lengths = tmp["phase_indicator"], tmp["refined_lengths"] + phase_indicator, refined_lengths = ( + tmp["phase_indicator"], + tmp["refined_lengths"], + ) # binning - df_gene_snp = create_bin_ranges(df_gene_snp, single_total_bb_RD, refined_lengths, config['secondary_min_umi']) - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_bins(df_gene_snp, \ - adata, single_X, single_total_bb_RD, phase_indicator, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) + df_gene_snp = create_bin_ranges( + df_gene_snp, single_total_bb_RD, refined_lengths, config["secondary_min_umi"] + ) + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + ) = summarize_counts_for_bins( + df_gene_snp, + adata, + single_X, + single_total_bb_RD, + phase_indicator, + nu=config["nu"], + logphase_shift=config["logphase_shift"], + geneticmap_file=config["geneticmap_file"], + ) # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps = perform_binning_new(lengths, single_X, \ # single_base_nb_mean, single_total_bb_RD, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps, phase_indicator, refined_lengths, config["binsize"], config["rdrbinsize"], config["nu"], config["logphase_shift"], secondary_min_umi=secondary_min_umi) - + # # remove bins where normal spots have imbalanced SNPs # if not config["tumorprop_file"] is 
None: # for prop_threshold in np.arange(0, 0.6, 0.05): @@ -122,24 +239,41 @@ def parse_visium(config): # index_normal = np.where(normal_candidate)[0] # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \ # single_X, single_base_nb_mean, single_total_bb_RD, config["nu"], config["logphase_shift"], index_normal, config['geneticmap_file']) - # assert np.sum(lengths) == single_X.shape[0] + # assert np.sum(lengths) == single_X.shape[0] # assert single_X.shape[0] == single_total_bb_RD.shape[0] # assert single_X.shape[0] == len(log_sitewise_transmat) # expression count dataframe - exp_counts = pd.DataFrame.sparse.from_spmatrix( scipy.sparse.csc_matrix(adata.layers["count"]), index=adata.obs.index, columns=adata.var.index) + exp_counts = pd.DataFrame.sparse.from_spmatrix( + scipy.sparse.csc_matrix(adata.layers["count"]), + index=adata.obs.index, + columns=adata.var.index, + ) # smooth and adjacency matrix for each sample - adjacency_mat, smooth_mat = multislice_adjacency(sample_ids, sample_list, coords, single_total_bb_RD, exp_counts, - across_slice_adjacency_mat, construct_adjacency_method=config['construct_adjacency_method'], - maxspots_pooling=config['maxspots_pooling'], construct_adjacency_w=config['construct_adjacency_w']) + adjacency_mat, smooth_mat = multislice_adjacency( + sample_ids, + sample_list, + coords, + single_total_bb_RD, + exp_counts, + across_slice_adjacency_mat, + construct_adjacency_method=config["construct_adjacency_method"], + maxspots_pooling=config["maxspots_pooling"], + construct_adjacency_w=config["construct_adjacency_w"], + ) n_pooled = np.median(np.sum(smooth_mat > 0, axis=0).A.flatten()) print(f"Set up number of spots to pool in HMRF: {n_pooled}") # If adjacency matrix is only constructed using gene expression similarity (e.g. 
scRNA-seq data) # Then, directly replace coords by the umap of gene expression, to avoid potential inconsistency in HMRF initialization - if config["construct_adjacency_method"] == "KNN" and config["construct_adjacency_w"] == 0: - sc.pp.normalize_total(adata, target_sum=np.median(np.sum(exp_counts.values,axis=1)) ) + if ( + config["construct_adjacency_method"] == "KNN" + and config["construct_adjacency_w"] == 0 + ): + sc.pp.normalize_total( + adata, target_sum=np.median(np.sum(exp_counts.values, axis=1)) + ) sc.pp.log1p(adata) sc.tl.pca(adata) sc.pp.neighbors(adata) @@ -148,35 +282,83 @@ def parse_visium(config): # create RDR-BAF table table_bininfo = genesnp_to_bininfo(df_gene_snp) - table_bininfo['LOG_PHASE_TRANSITION'] = log_sitewise_transmat + table_bininfo["LOG_PHASE_TRANSITION"] = log_sitewise_transmat table_rdrbaf = [] for i in range(single_X.shape[2]): - table_rdrbaf.append( pd.DataFrame({"BARCODES":adata.obs.index[i], "EXP":single_X[:,0,i], "TOT":single_total_bb_RD[:,i], "B":single_X[:,1,i]}) ) + table_rdrbaf.append( + pd.DataFrame( + { + "BARCODES": adata.obs.index[i], + "EXP": single_X[:, 0, i], + "TOT": single_total_bb_RD[:, i], + "B": single_X[:, 1, i], + } + ) + ) table_rdrbaf = pd.concat(table_rdrbaf, ignore_index=True) # create meta info table # note that table_meta.BARCODES is equal to the unique ones of table_rdrbaf.BARCODES in the original order - table_meta = pd.DataFrame({"BARCODES":adata.obs.index, "SAMPLE":adata.obs["sample"], "X":coords[:,0], "Y":coords[:,1]}) + table_meta = pd.DataFrame( + { + "BARCODES": adata.obs.index, + "SAMPLE": adata.obs["sample"], + "X": coords[:, 0], + "Y": coords[:, 1], + } + ) if not single_tumor_prop is None: table_meta["TUMOR_PROPORTION"] = single_tumor_prop - - return table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat, df_gene_snp + + return ( + table_bininfo, + table_rdrbaf, + table_meta, + exp_counts, + adjacency_mat, + smooth_mat, + df_gene_snp, + ) def load_tables_to_matrices(config): """ Load tables and adjacency from parse_visium_joint or parse_visium_single, and convert to HMM input matrices. 
""" - table_bininfo = pd.read_csv(f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", header=0, index_col=None, sep="\t") - table_rdrbaf = pd.read_csv(f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", header=0, index_col=None, sep="\t") - table_meta = pd.read_csv(f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", header=0, index_col=None, sep="\t") - adjacency_mat = scipy.sparse.load_npz( f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz" ) - smooth_mat = scipy.sparse.load_npz( f"{config['output_dir']}/parsed_inputs/smooth_mat.npz" ) + table_bininfo = pd.read_csv( + f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", + header=0, + index_col=None, + sep="\t", + ) + table_rdrbaf = pd.read_csv( + f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", + header=0, + index_col=None, + sep="\t", + ) + table_meta = pd.read_csv( + f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", + header=0, + index_col=None, + sep="\t", + ) + adjacency_mat = scipy.sparse.load_npz( + f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz" + ) + smooth_mat = scipy.sparse.load_npz( + f"{config['output_dir']}/parsed_inputs/smooth_mat.npz" + ) # - df_gene_snp = pd.read_csv(f"{config['output_dir']}/parsed_inputs/gene_snp_info.csv.gz", header=0, index_col=None, sep="\t") + df_gene_snp = pd.read_csv( + f"{config['output_dir']}/parsed_inputs/gene_snp_info.csv.gz", + header=0, + index_col=None, + sep="\t", + ) df_gene_snp = df_gene_snp.replace(np.nan, None) - + n_spots = table_meta.shape[0] n_bins = table_bininfo.shape[0] @@ -187,18 +369,26 @@ def load_tables_to_matrices(config): single_X[:, 1, :] = table_rdrbaf["B"].values.reshape((n_bins, n_spots), order="F") # construct single_base_nb_mean, lengths - single_base_nb_mean = table_bininfo["NORMAL_COUNT"].values.reshape(-1,1) / np.sum(table_bininfo["NORMAL_COUNT"].values) @ np.sum(single_X[:,0,:], axis=0).reshape(1,-1) + single_base_nb_mean = ( + table_bininfo["NORMAL_COUNT"].values.reshape(-1, 1) + / np.sum(table_bininfo["NORMAL_COUNT"].values) + @ np.sum(single_X[:, 0, :], axis=0).reshape(1, -1) + ) # construct single_total_bb_RD - single_total_bb_RD = table_rdrbaf["TOT"].values.reshape((n_bins, n_spots), order="F") + single_total_bb_RD = table_rdrbaf["TOT"].values.reshape( + (n_bins, n_spots), order="F" + ) # construct log_sitewise_transmat log_sitewise_transmat = table_bininfo["LOG_PHASE_TRANSITION"].values # construct bin info and lengths and x_gene_list df_bininfo = table_bininfo - lengths = np.array([ np.sum(table_bininfo.CHR == c) for c in df_bininfo.CHR.unique() ]) - + lengths = np.array( + [np.sum(table_bininfo.CHR == c) for c in df_bininfo.CHR.unique()] + ) + # construct barcodes barcodes = table_meta["BARCODES"] @@ -206,49 +396,109 @@ def load_tables_to_matrices(config): coords = table_meta[["X", "Y"]].values # construct single_tumor_prop - single_tumor_prop = table_meta["TUMOR_PROPORTION"].values if "TUMOR_PROPORTION" in table_meta.columns else None + single_tumor_prop = ( + table_meta["TUMOR_PROPORTION"].values + if "TUMOR_PROPORTION" in table_meta.columns + else None + ) # construct sample_list and sample_ids sample_list = [table_meta["SAMPLE"].values[0]] for i in range(1, table_meta.shape[0]): if table_meta["SAMPLE"].values[i] != sample_list[-1]: - sample_list.append( table_meta["SAMPLE"].values[i] ) + sample_list.append(table_meta["SAMPLE"].values[i]) sample_ids = np.zeros(table_meta.shape[0], dtype=int) - for s,sname in enumerate(sample_list): + for s, sname in enumerate(sample_list): index = 
np.where(table_meta["SAMPLE"].values == sname)[0] sample_ids[index] = s # expression UMI count matrix - exp_counts = pd.read_pickle( f"{config['output_dir']}/parsed_inputs/exp_counts.pkl" ) - - return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ - barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts + exp_counts = pd.read_pickle(f"{config['output_dir']}/parsed_inputs/exp_counts.pkl") + + return ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + df_bininfo, + df_gene_snp, + barcodes, + coords, + single_tumor_prop, + sample_list, + sample_ids, + adjacency_mat, + smooth_mat, + exp_counts, + ) def run_parse_n_load(config): - file_exists = np.array([ Path(f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz").exists(), \ - Path(f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz").exists(), \ - Path(f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz").exists(), \ - Path(f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz").exists(), \ - Path(f"{config['output_dir']}/parsed_inputs/smooth_mat.npz").exists(), \ - Path(f"{config['output_dir']}/parsed_inputs/exp_counts.pkl").exists() ]) + file_exists = np.array( + [ + Path(f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz").exists(), + Path(f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz").exists(), + Path(f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz").exists(), + Path(f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz").exists(), + Path(f"{config['output_dir']}/parsed_inputs/smooth_mat.npz").exists(), + Path(f"{config['output_dir']}/parsed_inputs/exp_counts.pkl").exists(), + ] + ) if not np.all(file_exists): # process to tables - table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat, df_gene_snp = parse_visium(config) + ( + table_bininfo, + table_rdrbaf, + table_meta, + exp_counts, + adjacency_mat, + smooth_mat, + df_gene_snp, + ) = parse_visium(config) # table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat = parse_hatchetblock(config, cellsnplite_dir, bb_file) # save file - p = subprocess.Popen(f"mkdir -p {config['output_dir']}/parsed_inputs", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - out,err = p.communicate() - - table_bininfo.to_csv( f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", header=True, index=False, sep="\t" ) - table_rdrbaf.to_csv( f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", header=True, index=False, sep="\t" ) - table_meta.to_csv( f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", header=True, index=False, sep="\t" ) - exp_counts.to_pickle( f"{config['output_dir']}/parsed_inputs/exp_counts.pkl" ) - scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz", adjacency_mat ) - scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/smooth_mat.npz", smooth_mat ) + p = subprocess.Popen( + f"mkdir -p {config['output_dir']}/parsed_inputs", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + ) + out, err = p.communicate() + + table_bininfo.to_csv( + f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", + header=True, + index=False, + sep="\t", + ) + table_rdrbaf.to_csv( + f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", + header=True, + index=False, + sep="\t", + ) + table_meta.to_csv( + f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", + 
header=True, + index=False, + sep="\t", + ) + exp_counts.to_pickle(f"{config['output_dir']}/parsed_inputs/exp_counts.pkl") + scipy.sparse.save_npz( + f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz", adjacency_mat + ) + scipy.sparse.save_npz( + f"{config['output_dir']}/parsed_inputs/smooth_mat.npz", smooth_mat + ) # - df_gene_snp.to_csv( f"{config['output_dir']}/parsed_inputs/gene_snp_info.csv.gz", header=True, index=False, sep="\t" ) + df_gene_snp.to_csv( + f"{config['output_dir']}/parsed_inputs/gene_snp_info.csv.gz", + header=True, + index=False, + sep="\t", + ) # load and parse data return load_tables_to_matrices(config) @@ -256,7 +506,13 @@ def run_parse_n_load(config): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str) + parser.add_argument( + "-c", + "--configfile", + help="configuration file of CalicoST", + required=True, + type=str, + ) args = parser.parse_args() try: diff --git a/src/calicost/phasing.py b/src/calicost/phasing.py index d582ec5..e4c9447 100644 --- a/src/calicost/phasing.py +++ b/src/calicost/phasing.py @@ -19,26 +19,60 @@ from statsmodels.tools.sm_exceptions import ValueWarning -def infer_initial_phase(single_X, lengths, single_base_nb_mean, single_total_bb_RD, n_states, log_sitewise_transmat, \ - params, t, random_state, fix_NB_dispersion, shared_NB_dispersion, fix_BB_dispersion, shared_BB_dispersion, max_iter, tol): +def infer_initial_phase( + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + n_states, + log_sitewise_transmat, + params, + t, + random_state, + fix_NB_dispersion, + shared_NB_dispersion, + fix_BB_dispersion, + shared_BB_dispersion, + max_iter, + tol, +): # pseudobulk HMM for phase_prob - res = pipeline_baum_welch(None, np.sum(single_X, axis=2, keepdims=True), lengths, n_states, \ - np.sum(single_base_nb_mean, axis=1, keepdims=True), np.sum(single_total_bb_RD, axis=1, keepdims=True), log_sitewise_transmat, \ - hmmclass=hmm_sitewise, params=params, t=t, random_state=random_state, only_minor=True, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, is_diag=True, \ - init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=max_iter, tol=tol) + res = pipeline_baum_welch( + None, + np.sum(single_X, axis=2, keepdims=True), + lengths, + n_states, + np.sum(single_base_nb_mean, axis=1, keepdims=True), + np.sum(single_total_bb_RD, axis=1, keepdims=True), + log_sitewise_transmat, + hmmclass=hmm_sitewise, + params=params, + t=t, + random_state=random_state, + only_minor=True, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=True, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + max_iter=max_iter, + tol=tol, + ) # phase_prob = np.exp(scipy.special.logsumexp(res["log_gamma"][:n_states, :], axis=0)) # return phase_prob pred = np.argmax(res["log_gamma"], axis=0) pred_cnv = pred % n_states - phase_indicator = (pred < n_states) + phase_indicator = pred < n_states refined_lengths = [] cumlen = 0 for le in lengths: s = 0 - for i, k in enumerate(pred_cnv[cumlen:(cumlen+le)]): - if i > 0 and pred_cnv[i] != pred_cnv[i-1]: + for i, k in enumerate(pred_cnv[cumlen : (cumlen + le)]): + if i > 0 and pred_cnv[i] != pred_cnv[i - 1]: 
refined_lengths.append(i - s) s = i refined_lengths.append(le - s) @@ -47,48 +81,119 @@ def infer_initial_phase(single_X, lengths, single_base_nb_mean, single_total_bb_ return phase_indicator, refined_lengths -def initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \ - params, t, random_state, fix_NB_dispersion, shared_NB_dispersion, fix_BB_dispersion, shared_BB_dispersion, max_iter, tol, threshold, min_snpumi=2e3): +def initial_phase_given_partition( + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + initial_clone_index, + n_states, + log_sitewise_transmat, + params, + t, + random_state, + fix_NB_dispersion, + shared_NB_dispersion, + fix_BB_dispersion, + shared_BB_dispersion, + max_iter, + tol, + threshold, + min_snpumi=2e3, +): EPS_BAF = 0.05 if single_tumor_prop is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=threshold) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + single_tumor_prop, + threshold=threshold, + ) # pseudobulk HMM for phase_prob baf_profiles = np.zeros((X.shape[2], X.shape[0])) pred_cnv = np.zeros((X.shape[2], X.shape[0])) for i in range(X.shape[2]): - if np.sum(total_bb_RD[:,i]) < min_snpumi: - baf_profiles[i,:] = 0.5 + if np.sum(total_bb_RD[:, i]) < min_snpumi: + baf_profiles[i, :] = 0.5 else: - res = pipeline_baum_welch(None, X[:,:,i:(i+1)], lengths, n_states, base_nb_mean[:,i:(i+1)], total_bb_RD[:,i:(i+1)], log_sitewise_transmat, \ - hmmclass=hmm_sitewise, params=params, t=t, random_state=random_state, only_minor=True, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, is_diag=True, \ - init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=max_iter, tol=tol) + res = pipeline_baum_welch( + None, + X[:, :, i : (i + 1)], + lengths, + n_states, + base_nb_mean[:, i : (i + 1)], + total_bb_RD[:, i : (i + 1)], + log_sitewise_transmat, + hmmclass=hmm_sitewise, + params=params, + t=t, + random_state=random_state, + only_minor=True, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=True, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + max_iter=max_iter, + tol=tol, + ) # pred = np.argmax(res["log_gamma"], axis=0) - this_baf_profiles = np.where(pred < n_states, res["new_p_binom"][pred%n_states, 0], 1-res["new_p_binom"][pred%n_states, 0]) + this_baf_profiles = np.where( + pred < n_states, + res["new_p_binom"][pred % n_states, 0], + 1 - res["new_p_binom"][pred % n_states, 0], + ) this_baf_profiles[np.abs(this_baf_profiles - 0.5) < EPS_BAF] = 0.5 - baf_profiles[i,:] = this_baf_profiles - pred_cnv[i,:] = (pred % n_states) + baf_profiles[i, :] = this_baf_profiles + pred_cnv[i, :] = pred % n_states if 
single_tumor_prop is None: - n_total_spots = np.sum([ len(x) for x in initial_clone_index ]) - population_baf = np.array([ 1.0*len(x)/n_total_spots for x in initial_clone_index]) @ baf_profiles + n_total_spots = np.sum([len(x) for x in initial_clone_index]) + population_baf = ( + np.array([1.0 * len(x) / n_total_spots for x in initial_clone_index]) + @ baf_profiles + ) else: - n_total_spots = np.sum([ len(x) * tumor_prop[i] for i,x in enumerate(initial_clone_index) ]) - population_baf = np.array([ 1.0*len(x)*tumor_prop[i]/n_total_spots for i,x in enumerate(initial_clone_index) ]) @ baf_profiles - adj_baf_profiles = np.where(baf_profiles < 0.5, baf_profiles, 1-baf_profiles) - phase_indicator = (population_baf < 0.5) + n_total_spots = np.sum( + [len(x) * tumor_prop[i] for i, x in enumerate(initial_clone_index)] + ) + population_baf = ( + np.array( + [ + 1.0 * len(x) * tumor_prop[i] / n_total_spots + for i, x in enumerate(initial_clone_index) + ] + ) + @ baf_profiles + ) + adj_baf_profiles = np.where(baf_profiles < 0.5, baf_profiles, 1 - baf_profiles) + phase_indicator = population_baf < 0.5 refined_lengths = [] cumlen = 0 for le in lengths: s = 0 for i in range(le): - if i > s + 10 and np.any(np.abs(adj_baf_profiles[:,i+cumlen] - adj_baf_profiles[:,i+cumlen-1]) > 0.1): + if i > s + 10 and np.any( + np.abs( + adj_baf_profiles[:, i + cumlen] + - adj_baf_profiles[:, i + cumlen - 1] + ) + > 0.1 + ): refined_lengths.append(i - s) s = i refined_lengths.append(le - s) @@ -99,13 +204,21 @@ def initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single def perform_partition(coords, sample_ids, x_part, y_part, single_tumor_prop, threshold): initial_clone_index = [] - for s in range(np.max(sample_ids)+1): + for s in range(np.max(sample_ids) + 1): index = np.where(sample_ids == s)[0] assert len(index) > 0 if single_tumor_prop is None: - tmp_clone_index = fixed_rectangle_initialization(coords[index,:], x_part, y_part) + tmp_clone_index = fixed_rectangle_initialization( + coords[index, :], x_part, y_part + ) else: - tmp_clone_index = fixed_rectangle_initialization_mix(coords[index,:], x_part, y_part, single_tumor_prop[index], threshold=threshold) + tmp_clone_index = fixed_rectangle_initialization_mix( + coords[index, :], + x_part, + y_part, + single_tumor_prop[index], + threshold=threshold, + ) for x in tmp_clone_index: - initial_clone_index.append( index[x] ) + initial_clone_index.append(index[x]) return initial_clone_index diff --git a/src/calicost/phylogeny_startle.py b/src/calicost/phylogeny_startle.py index 9265224..2b916e9 100644 --- a/src/calicost/phylogeny_startle.py +++ b/src/calicost/phylogeny_startle.py @@ -28,32 +28,36 @@ def get_LoH_for_phylogeny(df_seglevel_cnv, min_segments): ---------- df_loh : pd.DataFrame, (n_clones, n_segments) """ + def get_shared_intervals(acn_profile): - ''' + """ Takes in allele-specific copy numbers, output a segmentation of genome such that all clones are in the same CN state within each segment. anc_profile : array, (n_obs, 2*n_clones) Allele-specific integer copy numbers for each genomic bin (obs) across all clones. 
- ''' + """ intervals = [] seg_acn = [] s = 0 while s < acn_profile.shape[0]: - t = np.where( ~np.all(acn_profile[s:,] == acn_profile[s,:], axis=1) )[0] + t = np.where(~np.all(acn_profile[s:,] == acn_profile[s, :], axis=1))[0] if len(t) == 0: - intervals.append( (s, acn_profile.shape[0]) ) - seg_acn.append( acn_profile[s,:] ) + intervals.append((s, acn_profile.shape[0])) + seg_acn.append(acn_profile[s, :]) s = acn_profile.shape[0] else: t = t[0] - intervals.append( (s,s+t) ) - seg_acn.append( acn_profile[s,:] ) - s = s+t + intervals.append((s, s + t)) + seg_acn.append(acn_profile[s, :]) + s = s + t return intervals, seg_acn - - clone_ids = [x.split(" ")[0] for x in df_seglevel_cnv.columns[ np.arange(3, df_seglevel_cnv.shape[1], 2) ] ] - - acn_profile = df_seglevel_cnv.iloc[:,3:].values + + clone_ids = [ + x.split(" ")[0] + for x in df_seglevel_cnv.columns[np.arange(3, df_seglevel_cnv.shape[1], 2)] + ] + + acn_profile = df_seglevel_cnv.iloc[:, 3:].values intervals, seg_acn = get_shared_intervals(acn_profile) df_loh = [] for i, acn in enumerate(seg_acn): @@ -63,18 +67,24 @@ def get_shared_intervals(acn_profile): continue idx_zero = np.where(acn == 0)[0] idx_clones = (idx_zero / 2).astype(int) - is_A = (idx_zero % 2 == 0) + is_A = idx_zero % 2 == 0 # vector of mutation states - mut = np.zeros( int(len(acn) / 2), dtype=int ) + mut = np.zeros(int(len(acn) / 2), dtype=int) mut[idx_clones] = np.where(is_A, 1, 2) - df_loh.append( pd.DataFrame(mut.reshape(1, -1), index=[f"bin_{intervals[i][0]}_{intervals[i][1]}"], columns=clone_ids) ) + df_loh.append( + pd.DataFrame( + mut.reshape(1, -1), + index=[f"bin_{intervals[i][0]}_{intervals[i][1]}"], + columns=clone_ids, + ) + ) df_loh = pd.concat(df_loh).T return df_loh def get_binary_matrix(df_character_matrix): - + ncells = len(df_character_matrix) binary_col_dict = {} for column in df_character_matrix.columns: @@ -85,38 +95,40 @@ def get_binary_matrix(df_character_matrix): state_col[df_character_matrix[column] == s] = 1 state_col[df_character_matrix[column] == -1] = -1 - binary_col_dict[f'{column}_{s}'] = state_col + binary_col_dict[f"{column}_{s}"] = state_col - df_binary = pd.DataFrame(binary_col_dict, index = df_character_matrix.index, dtype=int) + df_binary = pd.DataFrame( + binary_col_dict, index=df_character_matrix.index, dtype=int + ) return df_binary def generate_perfect_phylogeny(df_binary): solT_mut = nx.DiGraph() - solT_mut.add_node('root') + solT_mut.add_node("root") solT_cell = nx.DiGraph() - solT_cell.add_node('root') + solT_cell.add_node("root") - df_binary = df_binary[df_binary.sum().sort_values(ascending=False).index] + df_binary = df_binary[df_binary.sum().sort_values(ascending=False).index] for cell_id, row in df_binary.iterrows(): - if cell_id == 'root': + if cell_id == "root": continue - curr_node = 'root' + curr_node = "root" for column in df_binary.columns[row.values == 1]: if column in solT_mut[curr_node]: curr_node = column else: if column in solT_mut.nodes: - raise NameError(f'{column} is being repeated') + raise NameError(f"{column} is being repeated") solT_mut.add_edge(curr_node, column) solT_cell.add_edge(curr_node, column) curr_node = column - solT_cell.add_edge(curr_node, cell_id) + solT_cell.add_edge(curr_node, cell_id) return solT_mut, solT_cell @@ -138,17 +150,21 @@ def tree_to_newick(T, root=None): pathlen += 1 subgs.append(tree_to_newick(T, root=child) + f":{pathlen}") else: - subgs.append( f"{child}:{pathlen}" ) - return "(" + ','.join(map(str, subgs)) + ")" + subgs.append(f"{child}:{pathlen}") + return "(" + 
",".join(map(str, subgs)) + ")" -def output_startle_input_files(calicostdir, outdir, midfix="", startle_bin="startle", min_segments=3): +def output_startle_input_files( + calicostdir, outdir, midfix="", startle_bin="startle", min_segments=3 +): # get LoH data frame # rows are clones, columns are bins, entries are 0 (no LoH) or 1 (A allele LoH) of 2 (B allele LoH) - df_seglevel_cnv = pd.read_csv(f"{calicostdir}/cnv{midfix}_seglevel.tsv", header=0, sep="\t") + df_seglevel_cnv = pd.read_csv( + f"{calicostdir}/cnv{midfix}_seglevel.tsv", header=0, sep="\t" + ) df_loh = get_LoH_for_phylogeny(df_seglevel_cnv, min_segments) df_loh.to_csv(f"{outdir}/loh_matrix.tsv", header=True, index=True, sep="\t") - + # binarize df_binary = get_binary_matrix(df_loh) @@ -163,36 +179,40 @@ def output_startle_input_files(calicostdir, outdir, midfix="", startle_bin="star for mut_idx, mut in enumerate(mutation_list): if df_binary.loc[cell][mut] == 1: one_cell_mut_list.append((cell_idx, mut_idx)) - with open(f'{outdir}/loh_one_indices.txt', 'w') as out: + with open(f"{outdir}/loh_one_indices.txt", "w") as out: for cell_idx, mut_idx in one_cell_mut_list: - out.write(f'{cell_idx} {mut_idx}\n') + out.write(f"{cell_idx} {mut_idx}\n") # missimg imdices - character_list = list(set(['_'.join(x.split('_')[:-1]) for x in df_binary.columns])) + character_list = list(set(["_".join(x.split("_")[:-1]) for x in df_binary.columns])) missing_cell_character_list = [] for character_idx, character in enumerate(character_list): for cell_idx, cell in enumerate(cell_list): if df_loh.loc[cell][character] == -1: missing_cell_character_list.append((cell_idx, character_idx)) - with open(f'{outdir}/loh_missing_indices.txt', 'w') as out: + with open(f"{outdir}/loh_missing_indices.txt", "w") as out: for cell_idx, character_idx in missing_cell_character_list: - out.write(f'{cell_idx} {character_idx}\n') + out.write(f"{cell_idx} {character_idx}\n") # character mutation mapping - with open(f'{outdir}/loh_character_mutation_mapping.txt', 'w') as out: + with open(f"{outdir}/loh_character_mutation_mapping.txt", "w") as out: for _, character in enumerate(character_list): - character_mutation_list = [mutation_to_index[x] for x in mutation_list if x.startswith(f'{character}_')] - out.write(' '.join(map(str, character_mutation_list)) + '\n') + character_mutation_list = [ + mutation_to_index[x] + for x in mutation_list + if x.startswith(f"{character}_") + ] + out.write(" ".join(map(str, character_mutation_list)) + "\n") # count of character states of mutations max_allowed_homoplasy = {} for mutation in mutation_list: max_allowed_homoplasy[mutation] = 2 - with open(f'{outdir}/loh_counts.txt', 'w') as out: + with open(f"{outdir}/loh_counts.txt", "w") as out: for mutation in mutation_list: - out.write(f'{max_allowed_homoplasy[mutation]}\n') - + out.write(f"{max_allowed_homoplasy[mutation]}\n") + # weights - with open(f'{outdir}/loh_weights.txt', 'w') as out: + with open(f"{outdir}/loh_weights.txt", "w") as out: for mutation in mutation_list: out.write(f"1\n") @@ -200,35 +220,71 @@ def output_startle_input_files(calicostdir, outdir, midfix="", startle_bin="star m_mutations = df_binary.shape[1] n_clones = df_binary.shape[0] command = f"{startle_bin} -m {m_mutations} -n {n_clones} {outdir}/loh_one_indices.txt {outdir}/loh_missing_indices.txt {outdir}/loh_counts.txt {outdir}/loh_character_mutation_mapping.txt {outdir}/loh_weights.txt {outdir}/loh_cpp_output.txt" - print( command ) - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, 
shell=True) - out,err = p.communicate() + print(command) + p = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True + ) + out, err = p.communicate() # parse output - df_cpp_output = pd.read_csv(f'{outdir}/loh_cpp_output.txt', header=None, sep=' ') - df_cpp_output = df_cpp_output.rename(columns={0:'cell_idx', 1:'mut_idx', 2:'state_idx', 3:'entry'}) - df_cpp_output['name'] = df_cpp_output.apply(lambda x: f"{mutation_list[x['mut_idx']]}_{x['state_idx']}", axis =1) - - sol_columns = list(df_cpp_output['name'].unique()) + df_cpp_output = pd.read_csv(f"{outdir}/loh_cpp_output.txt", header=None, sep=" ") + df_cpp_output = df_cpp_output.rename( + columns={0: "cell_idx", 1: "mut_idx", 2: "state_idx", 3: "entry"} + ) + df_cpp_output["name"] = df_cpp_output.apply( + lambda x: f"{mutation_list[x['mut_idx']]}_{x['state_idx']}", axis=1 + ) + + sol_columns = list(df_cpp_output["name"].unique()) nsol_columns = len(sol_columns) sol_entries = np.zeros((n_clones, nsol_columns), dtype=int) for mut_idx, mut in enumerate(sol_columns): - for cell_idx in df_cpp_output[(df_cpp_output['entry'] == 1) & (df_cpp_output['name'] == mut)]['cell_idx']: + for cell_idx in df_cpp_output[ + (df_cpp_output["entry"] == 1) & (df_cpp_output["name"] == mut) + ]["cell_idx"]: sol_entries[cell_idx][mut_idx] = 1 df_sol_binary = pd.DataFrame(sol_entries, columns=sol_columns, index=cell_list) solT_mut, solT_cell = generate_perfect_phylogeny(df_sol_binary) - with open(f'{outdir}/loh_tree.newick', 'w') as out: + with open(f"{outdir}/loh_tree.newick", "w") as out: out.write(f"{tree_to_newick(solT_cell)};") if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-c", "--calicost_dir", help="Directory of a specific random initialization of CalicoST", type=str) - parser.add_argument("-s", "--startle_bin", help="The startle executable path", default="startle", type=str) - parser.add_argument("-p", "--ploidy", help="Ploidy of allele-specific integer copy numbers.", default="", type=str) - parser.add_argument("--min_segments", help="Minimum number of genome segment to keep an LOH event in phylogenetic tree reconstruction.", default=3, type=int) + parser.add_argument( + "-c", + "--calicost_dir", + help="Directory of a specific random initialization of CalicoST", + type=str, + ) + parser.add_argument( + "-s", + "--startle_bin", + help="The startle executable path", + default="startle", + type=str, + ) + parser.add_argument( + "-p", + "--ploidy", + help="Ploidy of allele-specific integer copy numbers.", + default="", + type=str, + ) + parser.add_argument( + "--min_segments", + help="Minimum number of genome segment to keep an LOH event in phylogenetic tree reconstruction.", + default=3, + type=int, + ) parser.add_argument("-o", "--outputdir", help="output directory", type=str) args = parser.parse_args() - output_startle_input_files(args.calicost_dir, args.outputdir, midfix=args.ploidy, startle_bin=args.startle_bin, min_segments=args.min_segments) \ No newline at end of file + output_startle_input_files( + args.calicost_dir, + args.outputdir, + midfix=args.ploidy, + startle_bin=args.startle_bin, + min_segments=args.min_segments, + ) diff --git a/src/calicost/phylogeography.py b/src/calicost/phylogeography.py index e859350..8b188f2 100644 --- a/src/calicost/phylogeography.py +++ b/src/calicost/phylogeography.py @@ -8,40 +8,66 @@ import networkx as nx -def clone_centers(coords, clone_label, single_tumor_prop=None, sample_list=None, sample_ids=None, tumorprop_threshold=0.6): +def 
clone_centers( + coords, + clone_label, + single_tumor_prop=None, + sample_list=None, + sample_ids=None, + tumorprop_threshold=0.6, +): df_centers = [] for l in np.unique(clone_label): # get spot indices of this clone - index = np.where(clone_label == l)[0] if single_tumor_prop is None else np.where((clone_label == l) & (single_tumor_prop > tumorprop_threshold))[0] + index = ( + np.where(clone_label == l)[0] + if single_tumor_prop is None + else np.where( + (clone_label == l) & (single_tumor_prop > tumorprop_threshold) + )[0] + ) # if the index contains multiple slices, get the most abundance slice if not sample_ids is None: most_abundance_slice = pd.Series(sample_ids[index]).mode().values[0] - index = index[ sample_ids[index] == most_abundance_slice ] + index = index[sample_ids[index] == most_abundance_slice] # get clone cencer if single_tumor_prop is None: center = np.mean(coords[index], axis=0) else: - center = single_tumor_prop[index].dot(coords[index]) / np.sum(single_tumor_prop[index]) - df_centers.append( pd.DataFrame({'clone':l, 'x':center[0], 'y':center[1]}, index=[0]) ) + center = single_tumor_prop[index].dot(coords[index]) / np.sum( + single_tumor_prop[index] + ) + df_centers.append( + pd.DataFrame({"clone": l, "x": center[0], "y": center[1]}, index=[0]) + ) df_centers = pd.concat(df_centers, ignore_index=True) return df_centers -def project_phylogeneny_space(newick_file, coords, clone_label, single_tumor_prop=None, sample_list=None, sample_ids=None): +def project_phylogeneny_space( + newick_file, + coords, + clone_label, + single_tumor_prop=None, + sample_list=None, + sample_ids=None, +): # load tree - with open(newick_file, 'r') as fp: + with open(newick_file, "r") as fp: t = Tree(fp.readline()) - - # get the + + # get the list_leaf_nodes = [] list_internal_nodes = [] - rootnode = np.sort( [leaf.name.replace('clone','') for leaf in t.iter_leaves() ] ) - rootnode = "ancestor" + "_".join( rootnode ) + rootnode = np.sort([leaf.name.replace("clone", "") for leaf in t.iter_leaves()]) + rootnode = "ancestor" + "_".join(rootnode) for node in t.traverse(): - leafnames = np.sort( [leaf.name.replace('clone','') for leaf in node.iter_leaves() ] ) + leafnames = np.sort( + [leaf.name.replace("clone", "") for leaf in node.iter_leaves()] + ) if node.name == "": - node.name = "ancestor" + "_".join( leafnames ) - + node.name = "ancestor" + "_".join(leafnames) + if node.is_leaf(): list_leaf_nodes.append(node.name) else: @@ -50,27 +76,27 @@ def project_phylogeneny_space(newick_file, coords, clone_label, single_tumor_pro print(f"root node is {rootnode}") print(f"a list of leaf nodes: {list_leaf_nodes}") print(f"a list of internal nodes: {list_internal_nodes}") - + # set up multivariate Gaussian distribution to estimate internal node location N_nodes = len(list_leaf_nodes) + len(list_internal_nodes) # pairwise distance G = nx.Graph() - G.add_nodes_from( list_leaf_nodes + list_internal_nodes ) + G.add_nodes_from(list_leaf_nodes + list_internal_nodes) for nodename in list_leaf_nodes: - node = t&f"{nodename}" + node = t & f"{nodename}" while not node.is_root(): p = node.up G.add_edge(node.name, p.name, weight=node.dist) node = p - + G.edges(data=True) - nx_pdc = dict( nx.all_pairs_dijkstra(G) ) + nx_pdc = dict(nx.all_pairs_dijkstra(G)) # covariance matrix based on pairwise distance N_nodes = len(list_leaf_nodes) + len(list_internal_nodes) Sigma_square = np.zeros((N_nodes, N_nodes)) - base_var = max( np.max(np.abs(coords[:,0])), np.max(np.abs(coords[:,1])) ) - + base_var = max(np.max(np.abs(coords[:, 
0])), np.max(np.abs(coords[:, 1]))) + for n1, name1 in enumerate(list_leaf_nodes + list_internal_nodes): for n2, name2 in enumerate(list_leaf_nodes + list_internal_nodes): if n1 == n2: @@ -84,26 +110,42 @@ def project_phylogeneny_space(newick_file, coords, clone_label, single_tumor_pro Sigma_square[n1, n2] = base_var + nx_pdc[rootnode][0][lca_node.name] # mean position - mu_1 = np.zeros(( len(list_leaf_nodes),2 )) - mu_2 = np.zeros(( len(list_internal_nodes),2 )) + mu_1 = np.zeros((len(list_leaf_nodes), 2)) + mu_2 = np.zeros((len(list_internal_nodes), 2)) # partition covariance matrix - Sigma_11 = Sigma_square[:len(list_leaf_nodes), :len(list_leaf_nodes)] - Sigma_12 = Sigma_square[:len(list_leaf_nodes), :][:, len(list_leaf_nodes):] - Sigma_22 = Sigma_square[len(list_leaf_nodes):, len(list_leaf_nodes):] + Sigma_11 = Sigma_square[: len(list_leaf_nodes), : len(list_leaf_nodes)] + Sigma_12 = Sigma_square[: len(list_leaf_nodes), :][:, len(list_leaf_nodes) :] + Sigma_22 = Sigma_square[len(list_leaf_nodes) :, len(list_leaf_nodes) :] # get leaf node locations - df_centers = clone_centers(coords, clone_label, single_tumor_prop=single_tumor_prop, - sample_list=sample_list, sample_ids=sample_ids) - obs_1 = df_centers.set_index('clone').loc[list_leaf_nodes].values + df_centers = clone_centers( + coords, + clone_label, + single_tumor_prop=single_tumor_prop, + sample_list=sample_list, + sample_ids=sample_ids, + ) + obs_1 = df_centers.set_index("clone").loc[list_leaf_nodes].values # conditional expectation internal node position | leaf node position = mu_1 expected_internal = mu_2 + Sigma_12.T @ (np.linalg.inv(Sigma_11) @ (obs_1 - mu_1)) - df_centers = pd.concat([ df_centers, pd.DataFrame({'clone':list_internal_nodes, 'x':expected_internal[:,0], 'y':expected_internal[:,1]}) ]) + df_centers = pd.concat( + [ + df_centers, + pd.DataFrame( + { + "clone": list_internal_nodes, + "x": expected_internal[:, 0], + "y": expected_internal[:, 1], + } + ), + ] + ) # add to tree features for node in t.traverse(): i = np.where(df_centers.clone.values == node.name)[0][0] - node.add_features( x=df_centers.x.values[i], y=df_centers.y.values[i] ) + node.add_features(x=df_centers.x.values[i], y=df_centers.y.values[i]) - return t \ No newline at end of file + return t diff --git a/src/calicost/simple_sctransform.py b/src/calicost/simple_sctransform.py index 1a011b1..ca7666c 100644 --- a/src/calicost/simple_sctransform.py +++ b/src/calicost/simple_sctransform.py @@ -7,42 +7,70 @@ # copied from sctransformPy -def theta_ml(y,mu): +def theta_ml(y, mu): n = y.size weights = np.ones(n) limit = 10 _EPS = np.finfo(float).eps - eps = (_EPS)**0.25 + eps = (_EPS) ** 0.25 + # inner function - def score(n,th,mu,y,w): - return sum(w*(psi(th + y) - psi(th) + np.log(th) + 1 - np.log(th + mu) - (y + th)/(mu + th))) + def score(n, th, mu, y, w): + return sum( + w + * ( + psi(th + y) + - psi(th) + + np.log(th) + + 1 + - np.log(th + mu) + - (y + th) / (mu + th) + ) + ) + # inner function - def info(n,th,mu,y,w): - return sum(w*( - polygamma(1,th + y) + polygamma(1,th) - 1/th + 2/(mu + th) - (y + th)/(mu + th)**2)) + def info(n, th, mu, y, w): + return sum( + w + * ( + -polygamma(1, th + y) + + polygamma(1, th) + - 1 / th + + 2 / (mu + th) + - (y + th) / (mu + th) ** 2 + ) + ) + # initialize gradient descent - t0 = n/sum(weights*(y/mu - 1)**2) + t0 = n / sum(weights * (y / mu - 1) ** 2) it = 0 de = 1 # gradient descent - while(it + 1 < limit and abs(de) > eps): - it+=1 + while it + 1 < limit and abs(de) > eps: + it += 1 t0 = abs(t0) i = info(n, 
t0, mu, y, weights) - de = score(n, t0, mu, y, weights)/i - t0 += de - t0 = max(t0,0) + de = score(n, t0, mu, y, weights) / i + t0 += de + t0 = max(t0, 0) # note that t0 is the dispersion parameter: var = mu + mu^2 / t0 return t0 def sample_gene_indices(log_geometric_mean, n_subsample, n_partitions=10): - bounds = np.linspace(np.min(log_geometric_mean), np.max(log_geometric_mean), n_partitions+1) + bounds = np.linspace( + np.min(log_geometric_mean), np.max(log_geometric_mean), n_partitions + 1 + ) bounds[-1] += 1e-4 idx_subsample = [] for p in range(1, n_partitions): - tmpidx = np.where(np.logical_and(log_geometric_mean >= bounds[p-1], log_geometric_mean < bounds[p]))[0] + tmpidx = np.where( + np.logical_and( + log_geometric_mean >= bounds[p - 1], log_geometric_mean < bounds[p] + ) + )[0] np.random.shuffle(tmpidx) - idx_subsample.append(tmpidx[:int(n_subsample/n_partitions)]) + idx_subsample.append(tmpidx[: int(n_subsample / n_partitions)]) idx_subsample = np.sort(np.concatenate(idx_subsample)) if len(idx_subsample) < n_subsample: mask = np.array([True] * len(log_geometric_mean)) @@ -55,120 +83,128 @@ def sample_gene_indices(log_geometric_mean, n_subsample, n_partitions=10): def estimate_logmu_dispersion(counts, bw=None): - ''' + """ counts of size number spots * number genes. - ''' + """ N = counts.shape[0] G = counts.shape[1] eps = 1 - geometric_mean = np.exp(np.log(counts+eps).mean(axis=0).flatten()) - eps - log_geometric_mean = np.log( geometric_mean ) + geometric_mean = np.exp(np.log(counts + eps).mean(axis=0).flatten()) - eps + log_geometric_mean = np.log(geometric_mean) spot_umi = counts.sum(axis=1) # fitting logmu and theta (dispersion) logmu = np.zeros(G) theta = np.zeros(G) for i in range(G): - y = counts[:,i] - logmu[i] = np.log( np.sum(y) / np.sum(spot_umi) ) + y = counts[:, i] + logmu[i] = np.log(np.sum(y) / np.sum(spot_umi)) mu = spot_umi * np.exp(logmu[i]) theta[i] = theta_ml(y, mu) # ratio between geometric mean and dispersion parameter theta log_ratio = np.log(1 + geometric_mean / theta) # smoothing parameter for kernel ridge regression if bw is None: - z = FFTKDE(kernel='gaussian', bw='ISJ').fit(log_geometric_mean) - z.evaluate(); + z = FFTKDE(kernel="gaussian", bw="ISJ").fit(log_geometric_mean) + z.evaluate() bw_adjust = 3 - bw = z.bw*bw_adjust + bw = z.bw * bw_adjust # kernel ridge regression for log_ratio (the log ratio between geometric mean expression and dispersion) - kr = statsmodels.nonparametric.kernel_regression.KernelReg(log_ratio, log_geometric_mean[:,None], ['c'], reg_type='ll', bw=[bw]) - pred_log_ratio = kr.fit(data_predict = log_geometric_mean[:,None])[0] + kr = statsmodels.nonparametric.kernel_regression.KernelReg( + log_ratio, log_geometric_mean[:, None], ["c"], reg_type="ll", bw=[bw] + ) + pred_log_ratio = kr.fit(data_predict=log_geometric_mean[:, None])[0] pred_theta = geometric_mean / (np.exp(pred_log_ratio) - 1) return logmu, pred_theta def pearson_residual(counts, logmu, pred_theta): - ''' + """ counts of size number spots * number genes. 
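+
+    A minimal usage sketch, assuming counts is the spots-by-genes array described above:
+
+        logmu, pred_theta = estimate_logmu_dispersion(counts)
+        X = pearson_residual(counts, logmu, pred_theta)
+
+    Each entry is the negative binomial Pearson residual
+        X[i, j] = (counts[i, j] - mu_ij) / sqrt(mu_ij + mu_ij**2 / pred_theta[j])
+    with mu_ij = spot_umi[i] * exp(logmu[j]), clipped to +/- sqrt(n_spots / 30).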
- ''' + """ N = counts.shape[0] G = counts.shape[1] spot_umi = counts.sum(axis=1) # predicted mean and variance under NB model - mud = np.exp(logmu.reshape(1,-1)) * spot_umi.reshape(-1,1) - vard = mud + mud**2 / pred_theta.reshape(1,-1) + mud = np.exp(logmu.reshape(1, -1)) * spot_umi.reshape(-1, 1) + vard = mud + mud**2 / pred_theta.reshape(1, -1) X = (counts * 1.0 - mud) / vard**0.5 # clipping - clip = np.sqrt(counts.shape[0]/30) + clip = np.sqrt(counts.shape[0] / 30) X[X > clip] = clip X[X < -clip] = -clip return X def deviance_residual(counts, logmu, pred_theta): - ''' + """ Equation is taken from Analytic Pearson Residual paper by Lause et al. counts of size number spots * number genes. - ''' + """ N = counts.shape[0] G = counts.shape[1] spot_umi = counts.sum(axis=1) # predicted mean - mud = np.exp(logmu.reshape(1,-1)) * spot_umi.reshape(-1,1) - sign = (counts > mud) + mud = np.exp(logmu.reshape(1, -1)) * spot_umi.reshape(-1, 1) + sign = counts > mud part1 = counts * np.log(counts / mud) - part1[counts==0] = 0 - part2 = (counts + pred_theta) * np.log( (counts + pred_theta) / (mud + pred_theta) ) + part1[counts == 0] = 0 + part2 = (counts + pred_theta) * np.log((counts + pred_theta) / (mud + pred_theta)) X = sign * np.sqrt(2 * (part1 - part2)) return X def estimate_logmu_dispersion2(counts, n_subsample=None, bw=None): - ''' + """ counts of size number spots * number genes. - ''' + """ N = counts.shape[0] G = counts.shape[1] eps = 1 - geometric_mean = np.exp(np.log(counts+eps).mean(axis=0).flatten()) - eps - log_geometric_mean = np.log( geometric_mean ) + geometric_mean = np.exp(np.log(counts + eps).mean(axis=0).flatten()) - eps + log_geometric_mean = np.log(geometric_mean) spot_umi = counts.sum(axis=1) - logmu = np.log( np.sum(counts, axis=0) / np.sum(spot_umi) ) + logmu = np.log(np.sum(counts, axis=0) / np.sum(spot_umi)) # fitting theta (dispersion) genes_subsample = np.array([i for i in range(G) if geometric_mean[i] > 0]) if not (n_subsample is None): np.random.seed(0) genes_subsample = sample_gene_indices(log_geometric_mean, n_subsample) theta = np.zeros(len(genes_subsample)) - for idx,i in enumerate(genes_subsample): - y = counts[:,i] + for idx, i in enumerate(genes_subsample): + y = counts[:, i] mu = spot_umi * np.exp(logmu[i]) theta[idx] = theta_ml(y, mu) # ratio between geometric mean and dispersion parameter theta log_ratio = np.log(1 + geometric_mean[genes_subsample] / theta) # smoothing parameter for kernel ridge regression if bw is None: - z = FFTKDE(kernel='gaussian', bw='ISJ').fit(log_geometric_mean[genes_subsample]) - z.evaluate(); + z = FFTKDE(kernel="gaussian", bw="ISJ").fit(log_geometric_mean[genes_subsample]) + z.evaluate() bw_adjust = 3 - bw = z.bw*bw_adjust + bw = z.bw * bw_adjust # kernel ridge regression for log_ratio (the log ratio between geometric mean expression and dispersion) - kr = statsmodels.nonparametric.kernel_regression.KernelReg(log_ratio, log_geometric_mean[genes_subsample][:,None], ['c'], reg_type='ll', bw=[bw]) - pred_log_ratio = kr.fit(data_predict = log_geometric_mean[:,None])[0] + kr = statsmodels.nonparametric.kernel_regression.KernelReg( + log_ratio, + log_geometric_mean[genes_subsample][:, None], + ["c"], + reg_type="ll", + bw=[bw], + ) + pred_log_ratio = kr.fit(data_predict=log_geometric_mean[:, None])[0] pred_theta = geometric_mean / (np.exp(pred_log_ratio) - 1) return logmu, pred_theta def pearson_residual2(counts, logmu, pred_theta): - ''' + """ counts of size number spots * number genes. 
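+
+    Same residual computation as pearson_residual above, except that the clipping
+    threshold is the looser sqrt(n_spots) instead of sqrt(n_spots / 30). A sketch of
+    the intended pairing (an assumption, based on the matching names): obtain logmu
+    and pred_theta from estimate_logmu_dispersion2, which fits the dispersion on a
+    subsample of genes and then kernel-regresses it over all genes.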
- ''' + """ N = counts.shape[0] G = counts.shape[1] spot_umi = counts.sum(axis=1) # predicted mean and variance under NB model - mud = np.exp(logmu.reshape(1,-1)) * spot_umi.reshape(-1,1) - vard = mud + mud**2 / pred_theta.reshape(1,-1) + mud = np.exp(logmu.reshape(1, -1)) * spot_umi.reshape(-1, 1) + vard = mud + mud**2 / pred_theta.reshape(1, -1) X = (counts * 1.0 - mud) / vard**0.5 # clipping clip = np.sqrt(counts.shape[0]) diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index f248036..82138a2 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -11,7 +11,12 @@ import scanpy as sc import anndata import logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) logger = logging.getLogger() from calicost.utils_phase_switch import * @@ -19,28 +24,48 @@ import subprocess -def load_data(spaceranger_dir, snp_dir, filtergenelist_file, filterregion_file, normalidx_file, min_snpumis=50, min_percent_expressed_spots=0.005): +def load_data( + spaceranger_dir, + snp_dir, + filtergenelist_file, + filterregion_file, + normalidx_file, + min_snpumis=50, + min_percent_expressed_spots=0.005, +): ##### read raw UMI count matrix ##### if Path(f"{spaceranger_dir}/filtered_feature_bc_matrix.h5").exists(): adata = sc.read_10x_h5(f"{spaceranger_dir}/filtered_feature_bc_matrix.h5") elif Path(f"{spaceranger_dir}/filtered_feature_bc_matrix.h5ad").exists(): adata = sc.read_h5ad(f"{spaceranger_dir}/filtered_feature_bc_matrix.h5ad") else: - logging.error(f"{spaceranger_dir} directory doesn't have a filtered_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5ad file!") + logging.error( + f"{spaceranger_dir} directory doesn't have a filtered_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5ad file!" 
+ ) adata.layers["count"] = adata.X.A.astype(int) cell_snp_Aallele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Aallele.npz") cell_snp_Ballele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Ballele.npz") unique_snp_ids = np.load(f"{snp_dir}/unique_snp_ids.npy", allow_pickle=True) - snp_barcodes = pd.read_csv(f"{snp_dir}/barcodes.txt", header=None, names=["barcodes"]) + snp_barcodes = pd.read_csv( + f"{snp_dir}/barcodes.txt", header=None, names=["barcodes"] + ) # add position if Path(f"{spaceranger_dir}/spatial/tissue_positions.csv").exists(): - df_pos = pd.read_csv(f"{spaceranger_dir}/spatial/tissue_positions.csv", sep=",", header=0, \ - names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"]) + df_pos = pd.read_csv( + f"{spaceranger_dir}/spatial/tissue_positions.csv", + sep=",", + header=0, + names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"], + ) elif Path(f"{spaceranger_dir}/spatial/tissue_positions_list.csv").exists(): - df_pos = pd.read_csv(f"{spaceranger_dir}/spatial/tissue_positions_list.csv", sep=",", header=None, \ - names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"]) + df_pos = pd.read_csv( + f"{spaceranger_dir}/spatial/tissue_positions_list.csv", + sep=",", + header=None, + names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"], + ) else: raise Exception("No spatial coordinate file!") df_pos = df_pos[df_pos.in_tissue == True] @@ -50,7 +75,9 @@ def load_data(spaceranger_dir, snp_dir, filtergenelist_file, filterregion_file, adata = adata[adata.obs.index.isin(shared_barcodes), :] df_pos = df_pos[df_pos.barcode.isin(shared_barcodes)] # sort and match - df_pos.barcode = pd.Categorical(df_pos.barcode, categories=list(adata.obs.index), ordered=True) + df_pos.barcode = pd.Categorical( + df_pos.barcode, categories=list(adata.obs.index), ordered=True + ) df_pos.sort_values(by="barcode", inplace=True) adata.obsm["X_pos"] = np.vstack([df_pos.x, df_pos.y]).T @@ -60,114 +87,192 @@ def load_data(spaceranger_dir, snp_dir, filtergenelist_file, filterregion_file, cell_snp_Ballele = cell_snp_Ballele[snp_barcodes.barcodes.isin(shared_barcodes), :] snp_barcodes = snp_barcodes[snp_barcodes.barcodes.isin(shared_barcodes)] adata = adata[adata.obs.index.isin(shared_barcodes), :] - adata = adata[ pd.Categorical(adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True).argsort(), : ] + adata = adata[ + pd.Categorical( + adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True + ).argsort(), + :, + ] # filter out spots with too small number of UMIs - indicator = (np.sum(adata.layers["count"], axis=1) > min_snpumis) + indicator = np.sum(adata.layers["count"], axis=1) > min_snpumis adata = adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] # filter out spots with too small number of SNP-covering UMIs - indicator = ( np.sum(cell_snp_Aallele, axis=1).A.flatten() + np.sum(cell_snp_Ballele, axis=1).A.flatten() >= min_snpumis ) + indicator = ( + np.sum(cell_snp_Aallele, axis=1).A.flatten() + + np.sum(cell_snp_Ballele, axis=1).A.flatten() + >= min_snpumis + ) adata = adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] # filter out genes that are expressed in <0.5% cells - indicator = (np.sum(adata.X > 0, axis=0) >= min_percent_expressed_spots * adata.shape[0]).A.flatten() + indicator = ( + np.sum(adata.X > 0, axis=0) >= min_percent_expressed_spots * adata.shape[0] + ).A.flatten() genenames = 
set(list(adata.var.index[indicator])) adata = adata[:, indicator] print(adata) - print("median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print( + "median UMI after filtering out genes < 0.5% of cells = {}".format( + np.median(np.sum(adata.layers["count"], axis=1)) + ) + ) # remove genes in filtergenelist_file # ig_gene_list = pd.read_csv("/n/fs/ragr-data/users/congma/references/cellranger_refdata-gex-GRCh38-2020-A/genes/ig_gene_list.txt", header=None) if not filtergenelist_file is None: filter_gene_list = pd.read_csv(filtergenelist_file, header=None) - filter_gene_list = set(list( filter_gene_list.iloc[:,0] )) - indicator_filter = np.array([ (not x in filter_gene_list) for x in adata.var.index ]) + filter_gene_list = set(list(filter_gene_list.iloc[:, 0])) + indicator_filter = np.array( + [(not x in filter_gene_list) for x in adata.var.index] + ) adata = adata[:, indicator_filter] - print("median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print( + "median UMI after filtering out genes in filtergenelist_file = {}".format( + np.median(np.sum(adata.layers["count"], axis=1)) + ) + ) if not filterregion_file is None: - regions = pd.read_csv(filterregion_file, header=None, sep="\t", names=["Chrname", "Start", "End"]) + regions = pd.read_csv( + filterregion_file, header=None, sep="\t", names=["Chrname", "Start", "End"] + ) if "chr" in regions.Chrname.iloc[0]: regions["CHR"] = [int(x[3:]) for x in regions.Chrname.values] else: - regions.rename(columns={'Chrname':'CHR'}, inplace=True) + regions.rename(columns={"Chrname": "CHR"}, inplace=True) regions.sort_values(by=["CHR", "Start"], inplace=True) indicator_filter = np.array([True] * cell_snp_Aallele.shape[1]) j = 0 for i in range(cell_snp_Aallele.shape[1]): this_chr = int(unique_snp_ids[i].split("_")[0]) this_pos = int(unique_snp_ids[i].split("_")[1]) - while j < regions.shape[0] and ( (regions.CHR.values[j] < this_chr) or ((regions.CHR.values[j] == this_chr) and (regions.End.values[j] <= this_pos)) ): + while j < regions.shape[0] and ( + (regions.CHR.values[j] < this_chr) + or ( + (regions.CHR.values[j] == this_chr) + and (regions.End.values[j] <= this_pos) + ) + ): j += 1 - if j < regions.shape[0] and (regions.CHR.values[j] == this_chr) and (regions.Start.values[j] <= this_pos) and (regions.End.values[j] > this_pos): + if ( + j < regions.shape[0] + and (regions.CHR.values[j] == this_chr) + and (regions.Start.values[j] <= this_pos) + and (regions.End.values[j] > this_pos) + ): indicator_filter[i] = False cell_snp_Aallele = cell_snp_Aallele[:, indicator_filter] cell_snp_Ballele = cell_snp_Ballele[:, indicator_filter] unique_snp_ids = unique_snp_ids[indicator_filter] clf = LocalOutlierFactor(n_neighbors=200) - label = clf.fit_predict( np.sum(adata.layers["count"], axis=0).reshape(-1,1) ) - adata.layers["count"][:, np.where(label==-1)[0]] = 0 - print("filter out {} outlier genes.".format( np.sum(label==-1) )) + label = clf.fit_predict(np.sum(adata.layers["count"], axis=0).reshape(-1, 1)) + adata.layers["count"][:, np.where(label == -1)[0]] = 0 + print("filter out {} outlier genes.".format(np.sum(label == -1))) if not normalidx_file is None: - normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:,0].values + normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:, 0].values adata.obs["tumor_annotation"] = "tumor" adata.obs["tumor_annotation"][adata.obs.index.isin(normal_barcodes)] = 
"normal" - print( adata.obs["tumor_annotation"].value_counts() ) - + print(adata.obs["tumor_annotation"].value_counts()) + return adata, cell_snp_Aallele.A, cell_snp_Ballele.A, unique_snp_ids -def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_file, filterregion_file, normalidx_file, min_snpumis=50, min_percent_expressed_spots=0.005): +def load_joint_data( + input_filelist, + snp_dir, + alignment_files, + filtergenelist_file, + filterregion_file, + normalidx_file, + min_snpumis=50, + min_percent_expressed_spots=0.005, +): ##### read meta sample info ##### df_meta = pd.read_csv(input_filelist, sep="\t", header=None) - df_meta.rename(columns=dict(zip( df_meta.columns[:3], ["bam", "sample_id", "spaceranger_dir"] )), inplace=True) + df_meta.rename( + columns=dict(zip(df_meta.columns[:3], ["bam", "sample_id", "spaceranger_dir"])), + inplace=True, + ) logger.info(f"Input spaceranger file list {input_filelist} contains:") logger.info(df_meta) - df_barcode = pd.read_csv(f"{snp_dir}/barcodes.txt", header=None, names=["combined_barcode"]) - df_barcode["sample_id"] = [x.split("_")[-1] for x in df_barcode.combined_barcode.values] - df_barcode["barcode"] = [x.split("_")[0] for x in df_barcode.combined_barcode.values] + df_barcode = pd.read_csv( + f"{snp_dir}/barcodes.txt", header=None, names=["combined_barcode"] + ) + df_barcode["sample_id"] = [ + x.split("_")[-1] for x in df_barcode.combined_barcode.values + ] + df_barcode["barcode"] = [ + x.split("_")[0] for x in df_barcode.combined_barcode.values + ] ##### read SNP count ##### cell_snp_Aallele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Aallele.npz") cell_snp_Ballele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Ballele.npz") unique_snp_ids = np.load(f"{snp_dir}/unique_snp_ids.npy", allow_pickle=True) - snp_barcodes = pd.read_csv(f"{snp_dir}/barcodes.txt", header=None, names=["barcodes"]) + snp_barcodes = pd.read_csv( + f"{snp_dir}/barcodes.txt", header=None, names=["barcodes"] + ) assert (len(alignment_files) == 0) or (len(alignment_files) + 1 == df_meta.shape[0]) ##### read anndata and coordinate ##### # add position adata = None - for i,sname in enumerate(df_meta.sample_id.values): + for i, sname in enumerate(df_meta.sample_id.values): # locate the corresponding rows in df_meta index = np.where(df_barcode["sample_id"] == sname)[0] df_this_barcode = copy.copy(df_barcode.iloc[index, :]) df_this_barcode.index = df_this_barcode.barcode # read adata count info - if Path(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5").exists(): - adatatmp = sc.read_10x_h5(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5") - elif Path(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5ad").exists(): - adatatmp = sc.read_h5ad(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5ad") + if Path( + f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5" + ).exists(): + adatatmp = sc.read_10x_h5( + f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5" + ) + elif Path( + f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5ad" + ).exists(): + adatatmp = sc.read_h5ad( + f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5ad" + ) else: - logging.error(f"{df_meta['spaceranger_dir'].iloc[i]} directory doesn't have a filtered_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5ad file!") + logging.error( + f"{df_meta['spaceranger_dir'].iloc[i]} directory doesn't have a filtered_feature_bc_matrix.h5 or 
filtered_feature_bc_matrix.h5ad file!" + ) adatatmp.layers["count"] = adatatmp.X.A # reorder anndata spots to have the same order as df_this_barcode - idx_argsort = pd.Categorical(adatatmp.obs.index, categories=list(df_this_barcode.barcode), ordered=True).argsort() + idx_argsort = pd.Categorical( + adatatmp.obs.index, categories=list(df_this_barcode.barcode), ordered=True + ).argsort() adatatmp = adatatmp[idx_argsort, :] # read position info - if Path(f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions.csv").exists(): - df_this_pos = pd.read_csv(f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions.csv", sep=",", header=0, \ - names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"]) - elif Path(f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions_list.csv").exists(): - df_this_pos = pd.read_csv(f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions_list.csv", sep=",", header=None, \ - names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"]) + if Path( + f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions.csv" + ).exists(): + df_this_pos = pd.read_csv( + f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions.csv", + sep=",", + header=0, + names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"], + ) + elif Path( + f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions_list.csv" + ).exists(): + df_this_pos = pd.read_csv( + f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions_list.csv", + sep=",", + header=None, + names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"], + ) else: raise Exception("No spatial coordinate file!") df_this_pos = df_this_pos[df_this_pos.in_tissue == True] @@ -177,7 +282,9 @@ def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_fil df_this_pos = df_this_pos[df_this_pos.barcode.isin(shared_barcodes)] # # df_this_pos.barcode = pd.Categorical(df_this_pos.barcode, categories=list(df_this_barcode.barcode), ordered=True) - df_this_pos.barcode = pd.Categorical(df_this_pos.barcode, categories=list(adatatmp.obs.index), ordered=True) + df_this_pos.barcode = pd.Categorical( + df_this_pos.barcode, categories=list(adatatmp.obs.index), ordered=True + ) df_this_pos.sort_values(by="barcode", inplace=True) adatatmp.obsm["X_pos"] = np.vstack([df_this_pos.x, df_this_pos.y]).T adatatmp.obs["sample"] = sname @@ -197,7 +304,12 @@ def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_fil cell_snp_Ballele = cell_snp_Ballele[snp_barcodes.barcodes.isin(shared_barcodes), :] snp_barcodes = snp_barcodes[snp_barcodes.barcodes.isin(shared_barcodes)] adata = adata[adata.obs.index.isin(shared_barcodes), :] - adata = adata[ pd.Categorical(adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True).argsort(), : ] + adata = adata[ + pd.Categorical( + adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True + ).argsort(), + :, + ] ##### load pairwise alignments ##### # TBD: directly convert to big "adjacency" matrix @@ -208,87 +320,132 @@ def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_fil col_ind = [] dat = [] offset = 0 - for i,f in enumerate(alignment_files): + for i, f in enumerate(alignment_files): pi = np.load(f) # normalize p such that max( rowsum(pi), colsum(pi) ) = 1, max alignment weight = 1 - pi = pi / np.max( np.append(np.sum(pi,axis=0), np.sum(pi,axis=1)) ) + pi = pi / np.max(np.append(np.sum(pi, axis=0), np.sum(pi, axis=1))) sname1 = df_meta.sample_id.values[i] - 
sname2 = df_meta.sample_id.values[i+1] - assert pi.shape[0] == np.sum(df_barcode["sample_id"] == sname1) # double check whether this is correct - assert pi.shape[1] == np.sum(df_barcode["sample_id"] == sname2) # or the dimension should be flipped + sname2 = df_meta.sample_id.values[i + 1] + assert pi.shape[0] == np.sum( + df_barcode["sample_id"] == sname1 + ) # double check whether this is correct + assert pi.shape[1] == np.sum( + df_barcode["sample_id"] == sname2 + ) # or the dimension should be flipped # for each spot s in sname1, select {t: spot t in sname2 and pi[s,t] >= np.max(pi[s,:])} as the corresponding spot in the other slice for row in range(pi.shape[0]): - cutoff = np.max(pi[row,:]) if np.max(pi[row,:]) > EPS else 1+EPS + cutoff = np.max(pi[row, :]) if np.max(pi[row, :]) > EPS else 1 + EPS list_cols = np.where(pi[row, :] >= cutoff - EPS)[0] row_ind += [offset + row] * len(list_cols) - col_ind += list( offset + pi.shape[0] + list_cols ) + col_ind += list(offset + pi.shape[0] + list_cols) dat += list(pi[row, list_cols]) offset += pi.shape[0] - across_slice_adjacency_mat = scipy.sparse.csr_matrix((dat, (row_ind, col_ind) ), shape=(adata.shape[0], adata.shape[0])) + across_slice_adjacency_mat = scipy.sparse.csr_matrix( + (dat, (row_ind, col_ind)), shape=(adata.shape[0], adata.shape[0]) + ) across_slice_adjacency_mat += across_slice_adjacency_mat.T - + # filter out spots with too small number of UMIs - indicator = (np.sum(adata.layers["count"], axis=1) >= min_snpumis) + indicator = np.sum(adata.layers["count"], axis=1) >= min_snpumis adata = adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] if not (across_slice_adjacency_mat is None): - across_slice_adjacency_mat = across_slice_adjacency_mat[indicator,:][:,indicator] + across_slice_adjacency_mat = across_slice_adjacency_mat[indicator, :][ + :, indicator + ] # filter out spots with too small number of SNP-covering UMIs - indicator = ( np.sum(cell_snp_Aallele, axis=1).A.flatten() + np.sum(cell_snp_Ballele, axis=1).A.flatten() >= min_snpumis ) + indicator = ( + np.sum(cell_snp_Aallele, axis=1).A.flatten() + + np.sum(cell_snp_Ballele, axis=1).A.flatten() + >= min_snpumis + ) adata = adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] if not (across_slice_adjacency_mat is None): - across_slice_adjacency_mat = across_slice_adjacency_mat[indicator,:][:,indicator] + across_slice_adjacency_mat = across_slice_adjacency_mat[indicator, :][ + :, indicator + ] # filter out genes that are expressed in 0, axis=0) >= min_percent_expressed_spots * adata.shape[0]).A.flatten() + indicator = ( + np.sum(adata.X > 0, axis=0) >= min_percent_expressed_spots * adata.shape[0] + ).A.flatten() genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] print(adata) - print("median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print( + "median UMI after filtering out genes < 0.5% of cells = {}".format( + np.median(np.sum(adata.layers["count"], axis=1)) + ) + ) if not filtergenelist_file is None: filter_gene_list = pd.read_csv(filtergenelist_file, header=None) - filter_gene_list = set(list( filter_gene_list.iloc[:,0] )) - indicator_filter = np.array([ (not x in filter_gene_list) for x in adata.var.index ]) + filter_gene_list = set(list(filter_gene_list.iloc[:, 0])) + indicator_filter = np.array( + [(not x in filter_gene_list) for x in 
adata.var.index] + ) adata = adata[:, indicator_filter] - print("median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print( + "median UMI after filtering out genes in filtergenelist_file = {}".format( + np.median(np.sum(adata.layers["count"], axis=1)) + ) + ) if not filterregion_file is None: - regions = pd.read_csv(filterregion_file, header=None, sep="\t", names=["Chrname", "Start", "End"]) + regions = pd.read_csv( + filterregion_file, header=None, sep="\t", names=["Chrname", "Start", "End"] + ) if "chr" in regions.Chrname.iloc[0]: regions["CHR"] = [int(x[3:]) for x in regions.Chrname.values] else: - regions.rename(columns={'Chrname':'CHR'}, inplace=True) + regions.rename(columns={"Chrname": "CHR"}, inplace=True) regions.sort_values(by=["CHR", "Start"], inplace=True) indicator_filter = np.array([True] * cell_snp_Aallele.shape[1]) j = 0 for i in range(cell_snp_Aallele.shape[1]): this_chr = int(unique_snp_ids[i].split("_")[0]) this_pos = int(unique_snp_ids[i].split("_")[1]) - while j < regions.shape[0] and ( (regions.CHR.values[j] < this_chr) or ((regions.CHR.values[j] == this_chr) and (regions.End.values[j] <= this_pos)) ): + while j < regions.shape[0] and ( + (regions.CHR.values[j] < this_chr) + or ( + (regions.CHR.values[j] == this_chr) + and (regions.End.values[j] <= this_pos) + ) + ): j += 1 - if j < regions.shape[0] and (regions.CHR.values[j] == this_chr) and (regions.Start.values[j] <= this_pos) and (regions.End.values[j] > this_pos): + if ( + j < regions.shape[0] + and (regions.CHR.values[j] == this_chr) + and (regions.Start.values[j] <= this_pos) + and (regions.End.values[j] > this_pos) + ): indicator_filter[i] = False cell_snp_Aallele = cell_snp_Aallele[:, indicator_filter] cell_snp_Ballele = cell_snp_Ballele[:, indicator_filter] unique_snp_ids = unique_snp_ids[indicator_filter] - + clf = LocalOutlierFactor(n_neighbors=200) - label = clf.fit_predict( np.sum(adata.layers["count"], axis=0).reshape(-1,1) ) - adata.layers["count"][:, np.where(label==-1)[0]] = 0 - print("filter out {} outlier genes.".format( np.sum(label==-1) )) + label = clf.fit_predict(np.sum(adata.layers["count"], axis=0).reshape(-1, 1)) + adata.layers["count"][:, np.where(label == -1)[0]] = 0 + print("filter out {} outlier genes.".format(np.sum(label == -1))) if not normalidx_file is None: - normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:,0].values + normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:, 0].values adata.obs["tumor_annotation"] = "tumor" adata.obs["tumor_annotation"][adata.obs.index.isin(normal_barcodes)] = "normal" - print( adata.obs["tumor_annotation"].value_counts() ) + print(adata.obs["tumor_annotation"].value_counts()) - return adata, cell_snp_Aallele.A, cell_snp_Ballele.A, unique_snp_ids, across_slice_adjacency_mat + return ( + adata, + cell_snp_Aallele.A, + cell_snp_Ballele.A, + unique_snp_ids, + across_slice_adjacency_mat, + ) def load_slidedna_data(snp_dir, bead_file, filterregion_bedfile): @@ -296,14 +453,19 @@ def load_slidedna_data(snp_dir, bead_file, filterregion_bedfile): cell_snp_Ballele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Ballele.npz") unique_snp_ids = np.load(f"{snp_dir}/unique_snp_ids.npy", allow_pickle=True) barcodes = pd.read_csv(f"{snp_dir}/barcodes.txt", header=None, index_col=None) - barcodes = barcodes.iloc[:,0].values + barcodes = barcodes.iloc[:, 0].values # add spatial position df_pos = pd.read_csv(bead_file, header=0, sep=",", index_col=None) coords = 
np.vstack([df_pos.xcoord, df_pos.ycoord]).T # remove SNPs within filterregion_bedfile if not filterregion_bedfile is None: - df_filter = pd.read_csv(filterregion_bedfile, header=None, sep="\t", names=["chrname", "start", "end"]) - df_filter = df_filter[df_filter.chrname.isin( [f"chr{i}" for i in range(1,23)] )] + df_filter = pd.read_csv( + filterregion_bedfile, + header=None, + sep="\t", + names=["chrname", "start", "end"], + ) + df_filter = df_filter[df_filter.chrname.isin([f"chr{i}" for i in range(1, 23)])] df_filter["CHR"] = [int(x[3:]) for x in df_filter.chrname] df_filter.sort_values(by=["CHR", "start"]) # check whether unique_snp_ids are within the regions in df_filter @@ -315,9 +477,15 @@ def load_slidedna_data(snp_dir, bead_file, filterregion_bedfile): is_within_filterregion = [] j = 0 for i in range(len(unique_snp_ids)): - while (filter_chrs[j] < snp_chrs[i]) or ((filter_chrs[j] == snp_chrs[i]) and (filter_end[j] < snp_pos[i])): + while (filter_chrs[j] < snp_chrs[i]) or ( + (filter_chrs[j] == snp_chrs[i]) and (filter_end[j] < snp_pos[i]) + ): j += 1 - if filter_chrs[j] == snp_chrs[i] and filter_start[j] <= snp_pos[i] and filter_end[j] >= snp_pos[i]: + if ( + filter_chrs[j] == snp_chrs[i] + and filter_start[j] <= snp_pos[i] + and filter_end[j] >= snp_pos[i] + ): is_within_filterregion.append(True) else: is_within_filterregion.append(False) @@ -329,45 +497,88 @@ def load_slidedna_data(snp_dir, bead_file, filterregion_bedfile): return coords, cell_snp_Aallele, cell_snp_Ballele, barcodes, unique_snp_ids -def taking_shared_barcodes(snp_barcodes, cell_snp_Aallele, cell_snp_Ballele, adata, df_pos): +def taking_shared_barcodes( + snp_barcodes, cell_snp_Aallele, cell_snp_Ballele, adata, df_pos +): # shared barcodes between adata and SNPs - shared_barcodes = set(list(snp_barcodes.barcodes)) & set(list(adata.obs.index)) & set(list(df_pos.barcode)) + shared_barcodes = ( + set(list(snp_barcodes.barcodes)) + & set(list(adata.obs.index)) + & set(list(df_pos.barcode)) + ) cell_snp_Aallele = cell_snp_Aallele[snp_barcodes.barcodes.isin(shared_barcodes), :] cell_snp_Ballele = cell_snp_Ballele[snp_barcodes.barcodes.isin(shared_barcodes), :] snp_barcodes = snp_barcodes[snp_barcodes.barcodes.isin(shared_barcodes)] adata = adata[adata.obs.index.isin(shared_barcodes), :] - adata = adata[ pd.Categorical(adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True).argsort(), : ] + adata = adata[ + pd.Categorical( + adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True + ).argsort(), + :, + ] df_pos = df_pos[df_pos.barcode.isin(shared_barcodes)] - df_pos = df_pos.iloc[ pd.Categorical(df_pos.barcode, categories=list(snp_barcodes.barcodes), ordered=True).argsort(), : ] + df_pos = df_pos.iloc[ + pd.Categorical( + df_pos.barcode, categories=list(snp_barcodes.barcodes), ordered=True + ).argsort(), + :, + ] return snp_barcodes, cell_snp_Aallele, cell_snp_Ballele, adata, df_pos -def filter_genes_barcodes_hatchetblock(adata, cell_snp_Aallele, cell_snp_Ballele, snp_barcodes, unique_snp_ids, config, min_umi=100, min_spot_percent=0.005, ordered_chr=[str(c) for c in range(1,23)]): +def filter_genes_barcodes_hatchetblock( + adata, + cell_snp_Aallele, + cell_snp_Ballele, + snp_barcodes, + unique_snp_ids, + config, + min_umi=100, + min_spot_percent=0.005, + ordered_chr=[str(c) for c in range(1, 23)], +): # filter out spots with too small number of UMIs - indicator = (np.sum(adata.layers["count"], axis=1) > min_umi) + indicator = np.sum(adata.layers["count"], axis=1) > min_umi adata = 
adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] # filter out genes that are expressed in <0.5% cells - indicator = (np.sum(adata.X > 0, axis=0) >= min_spot_percent * adata.shape[0]).A.flatten() + indicator = ( + np.sum(adata.X > 0, axis=0) >= min_spot_percent * adata.shape[0] + ).A.flatten() genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] print(adata) - print("median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print( + "median UMI after filtering out genes < 0.5% of cells = {}".format( + np.median(np.sum(adata.layers["count"], axis=1)) + ) + ) if not config["filtergenelist_file"] is None: filter_gene_list = pd.read_csv(config["filtergenelist_file"], header=None) - filter_gene_list = set(list( filter_gene_list.iloc[:,0] )) - indicator_filter = np.array([ (not x in filter_gene_list) for x in adata.var.index ]) + filter_gene_list = set(list(filter_gene_list.iloc[:, 0])) + indicator_filter = np.array( + [(not x in filter_gene_list) for x in adata.var.index] + ) adata = adata[:, indicator_filter] - print("median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print( + "median UMI after filtering out genes in filtergenelist_file = {}".format( + np.median(np.sum(adata.layers["count"], axis=1)) + ) + ) if not config["filterregion_file"] is None: - regions = pd.read_csv(config["filterregion_file"], header=None, sep="\t", names=["Chrname", "Start", "End"]) - ordered_chr_map = {ordered_chr[i]:i for i in range(len(ordered_chr))} + regions = pd.read_csv( + config["filterregion_file"], + header=None, + sep="\t", + names=["Chrname", "Start", "End"], + ) + ordered_chr_map = {ordered_chr[i]: i for i in range(len(ordered_chr))} # retain only chromosomes in ordered_chr - if ~np.any( regions.Chrname.isin(ordered_chr) ): + if ~np.any(regions.Chrname.isin(ordered_chr)): regions["Chrname"] = regions.Chrname.map(lambda x: x.replace("chr", "")) regions = regions[regions.Chrname.isin(ordered_chr)] regions["int_chrom"] = regions.Chrname.map(ordered_chr_map) @@ -377,9 +588,20 @@ def filter_genes_barcodes_hatchetblock(adata, cell_snp_Aallele, cell_snp_Ballele for i in range(cell_snp_Aallele.shape[1]): this_chr = int(unique_snp_ids[i].split("_")[0]) this_pos = int(unique_snp_ids[i].split("_")[1]) - while j < regions.shape[0] and ( (regions.int_chrom.values[j] < this_chr) or ((regions.int_chrom.values[j] == this_chr) and (regions.End.values[j] <= this_pos)) ): + while j < regions.shape[0] and ( + (regions.int_chrom.values[j] < this_chr) + or ( + (regions.int_chrom.values[j] == this_chr) + and (regions.End.values[j] <= this_pos) + ) + ): j += 1 - if j < regions.shape[0] and (regions.int_chrom.values[j] == this_chr) and (regions.Start.values[j] <= this_pos) and (regions.End.values[j] > this_pos): + if ( + j < regions.shape[0] + and (regions.int_chrom.values[j] == this_chr) + and (regions.Start.values[j] <= this_pos) + and (regions.End.values[j] > this_pos) + ): indicator_filter[i] = False cell_snp_Aallele = cell_snp_Aallele[:, indicator_filter] cell_snp_Ballele = cell_snp_Ballele[:, indicator_filter] @@ -393,7 +615,7 @@ def read_bias_correction_info(bc_file): df_info = pd.read_csv(bc_file, header=None, sep="\t") except: df_info = pd.read_csv(bc_file, header=0, sep="\t") - return df_info.iloc[:,-1].values + return df_info.iloc[:, -1].values def binning_readcount_using_SNP(df_bins, 
sorted_chr_pos_first): @@ -413,37 +635,62 @@ def binning_readcount_using_SNP(df_bins, sorted_chr_pos_first): # move the cursort on sorted_chr_pos_first such that the chr matches that in df_bins while this_chr != sorted_chr_pos_first[idx][0]: idx += 1 - while idx + 1 < len(sorted_chr_pos_first) and this_chr == sorted_chr_pos_first[idx+1][0] and mid > sorted_chr_pos_first[idx+1][1]: + while ( + idx + 1 < len(sorted_chr_pos_first) + and this_chr == sorted_chr_pos_first[idx + 1][0] + and mid > sorted_chr_pos_first[idx + 1][1] + ): idx += 1 multiplier[i, idx] = 1 return multiplier - -def load_slidedna_readcount(countfile, bead_file, binfile, normalfile, bias_correction_filelist, retained_barcodes, retain_chr_list=np.arange(1,23)): + +def load_slidedna_readcount( + countfile, + bead_file, + binfile, + normalfile, + bias_correction_filelist, + retained_barcodes, + retain_chr_list=np.arange(1, 23), +): # load counts and the corresponding barcodes per spot in counts tmpcounts = np.loadtxt(countfile) - counts = scipy.sparse.csr_matrix(( tmpcounts[:,2], (tmpcounts[:,0].astype(int)-1, tmpcounts[:,1].astype(int)-1) )) + counts = scipy.sparse.csr_matrix( + ( + tmpcounts[:, 2], + (tmpcounts[:, 0].astype(int) - 1, tmpcounts[:, 1].astype(int) - 1), + ) + ) tmpdf = pd.read_csv(bead_file, header=0, sep=",", index_col=0) - tmpdf = tmpdf.join( pd.DataFrame(counts.A, index=tmpdf.index)) + tmpdf = tmpdf.join(pd.DataFrame(counts.A, index=tmpdf.index)) # keep only the spots in retained_barcodes tmpdf = tmpdf[tmpdf.index.isin(retained_barcodes)] # reorder by retained_barcodes - tmpdf.index = pd.Categorical(tmpdf.index, categories=retained_barcodes, ordered=True) + tmpdf.index = pd.Categorical( + tmpdf.index, categories=retained_barcodes, ordered=True + ) tmpdf.sort_index(inplace=True) counts = tmpdf.values[:, 2:] # load normal counts - normal_cov = pd.read_csv(normalfile, header=None, sep="\t").values[:,-1].astype(float) + normal_cov = ( + pd.read_csv(normalfile, header=None, sep="\t").values[:, -1].astype(float) + ) # load bin info df_bins = pd.read_csv(binfile, comment="#", header=None, index_col=None, sep="\t") old_names = df_bins.columns[:3] df_bins.rename(columns=dict(zip(old_names, ["CHR", "START", "END"])), inplace=True) - + # select bins according to retain_chr_list - retain_chr_list_append = list(retain_chr_list) + [str(x) for x in retain_chr_list] + [f"chr{x}" for x in retain_chr_list] + retain_chr_list_append = ( + list(retain_chr_list) + + [str(x) for x in retain_chr_list] + + [f"chr{x}" for x in retain_chr_list] + ) bidx = np.where(df_bins.CHR.isin(retain_chr_list_append))[0] - df_bins = df_bins.iloc[bidx,:] + df_bins = df_bins.iloc[bidx, :] counts = counts[:, bidx] normal_cov = normal_cov[bidx] @@ -458,40 +705,75 @@ def load_slidedna_readcount(countfile, bead_file, binfile, normalfile, bias_corr bias_features = [] for f in bias_correction_filelist: this_feature = read_bias_correction_info(f) - bias_features.append( this_feature[bidx] ) + bias_features.append(this_feature[bidx]) bias_features = np.array(bias_features).T # kernel ridge regression to predict the read count per bin # the prediction serves as a baseline of the expected read count, and plays a role in base_nb_mean krr = KernelRidge(alpha=0.2) - krr.fit( bias_features, np.sum(counts, axis=0) / np.sum(counts) ) - pred = krr.predict( bias_features ) + krr.fit(bias_features, np.sum(counts, axis=0) / np.sum(counts)) + pred = krr.predict(bias_features) # single_base_nb_mean from bias correction + expected normal - single_base_nb_mean = (pred * 
normal_cov).reshape(-1,1) / np.sum(pred * normal_cov) * np.sum(counts, axis=1).reshape(1,-1) + single_base_nb_mean = ( + (pred * normal_cov).reshape(-1, 1) + / np.sum(pred * normal_cov) + * np.sum(counts, axis=1).reshape(1, -1) + ) # single_base_nb_mean = pred.reshape(-1,1) / np.sum(pred) * np.sum(counts, axis=1).reshape(1,-1) # remove too low baseline - threshold = np.median( np.sum(single_base_nb_mean, axis=1) / df_bins.iloc[:,3].values.astype(float) ) * 0.5 - idx_filter = np.where( np.sum(single_base_nb_mean, axis=1) / df_bins.iloc[:,3].values.astype(float) < threshold )[0] + threshold = ( + np.median( + np.sum(single_base_nb_mean, axis=1) + / df_bins.iloc[:, 3].values.astype(float) + ) + * 0.5 + ) + idx_filter = np.where( + np.sum(single_base_nb_mean, axis=1) / df_bins.iloc[:, 3].values.astype(float) + < threshold + )[0] single_base_nb_mean[idx_filter, :] = 0 counts[:, idx_filter] = 0 return counts, single_base_nb_mean, df_bins, normal_cov - -def get_slidednaseq_rdr(countfile, bead_file, binfile, normalfile, bias_correction_filelist, retained_barcodes, sorted_chr_pos_first, single_X, single_base_nb_mean, retain_chr_list=np.arange(1,23)): - counts, single_base_nb_mean, df_bins, _ = load_slidedna_readcount(countfile, bead_file, binfile, normalfile, bias_correction_filelist, retained_barcodes) + +def get_slidednaseq_rdr( + countfile, + bead_file, + binfile, + normalfile, + bias_correction_filelist, + retained_barcodes, + sorted_chr_pos_first, + single_X, + single_base_nb_mean, + retain_chr_list=np.arange(1, 23), +): + counts, single_base_nb_mean, df_bins, _ = load_slidedna_readcount( + countfile, + bead_file, + binfile, + normalfile, + bias_correction_filelist, + retained_barcodes, + ) # remove bins with low-coverage single_base_nb_mean - + multiplier = binning_readcount_using_SNP(df_bins, sorted_chr_pos_first) - single_X[:,0,:] = multiplier.T @ counts.T + single_X[:, 0, :] = multiplier.T @ counts.T single_base_nb_mean = multiplier.T @ single_base_nb_mean return single_X, single_base_nb_mean -def filter_slidedna_spot_by_adjacency(coords, cell_snp_Aallele, cell_snp_Ballele, barcodes): +def filter_slidedna_spot_by_adjacency( + coords, cell_snp_Aallele, cell_snp_Ballele, barcodes +): # distance to center - dist = np.sqrt(np.sum(np.square(coords - np.median(coords, axis=0, keepdims=True)), axis=1)) + dist = np.sqrt( + np.sum(np.square(coords - np.median(coords, axis=0, keepdims=True)), axis=1) + ) idx_keep = np.where(dist < 2500)[0] # remove spots coords = coords[idx_keep, :] @@ -504,15 +786,38 @@ def filter_slidedna_spot_by_adjacency(coords, cell_snp_Aallele, cell_snp_Ballele def combine_gene_snps(unique_snp_ids, hgtable_file, adata): # read gene info and keep only chr1-chr22 and genes appearing in adata df_hgtable = pd.read_csv(hgtable_file, header=0, index_col=0, sep="\t") - df_hgtable = df_hgtable[df_hgtable.chrom.isin( [f"chr{i}" for i in range(1, 23)] )] + df_hgtable = df_hgtable[df_hgtable.chrom.isin([f"chr{i}" for i in range(1, 23)])] df_hgtable = df_hgtable[df_hgtable.name2.isin(adata.var.index)] # a data frame including both gene and SNP info: CHR, START, END, snp_id, gene, is_interval - df_gene_snp = pd.DataFrame({"CHR":[int(x[3:]) for x in df_hgtable.chrom.values], "START":df_hgtable.cdsStart.values, "END":df_hgtable.cdsEnd.values, \ - "snp_id":None, "gene":df_hgtable.name2.values, "is_interval":True}) + df_gene_snp = pd.DataFrame( + { + "CHR": [int(x[3:]) for x in df_hgtable.chrom.values], + "START": df_hgtable.cdsStart.values, + "END": df_hgtable.cdsEnd.values, + "snp_id": None, 
+ "gene": df_hgtable.name2.values, + "is_interval": True, + } + ) # add SNP info snp_chr = np.array([int(x.split("_")[0]) for x in unique_snp_ids]) snp_pos = np.array([int(x.split("_")[1]) for x in unique_snp_ids]) - df_gene_snp = pd.concat([df_gene_snp, pd.DataFrame({"CHR":snp_chr, "START":snp_pos, "END":snp_pos+1, "snp_id":unique_snp_ids, "gene":None, "is_interval":False}) ], ignore_index=True) + df_gene_snp = pd.concat( + [ + df_gene_snp, + pd.DataFrame( + { + "CHR": snp_chr, + "START": snp_pos, + "END": snp_pos + 1, + "snp_id": unique_snp_ids, + "gene": None, + "is_interval": False, + } + ), + ], + ignore_index=True, + ) df_gene_snp.sort_values(by=["CHR", "START"], inplace=True) # check the what gene each SNP belongs to @@ -526,18 +831,29 @@ def combine_gene_snps(unique_snp_ids, hgtable_file, adata): continue this_pos = vec_start[i] j = i - 1 - while j >= 0 and j >= i-50 and vec_chr[i] == vec_chr[j]: - if vec_is_interval[j] and vec_start[j] <= this_pos and vec_end[j] > this_pos: + while j >= 0 and j >= i - 50 and vec_chr[i] == vec_chr[j]: + if ( + vec_is_interval[j] + and vec_start[j] <= this_pos + and vec_end[j] > this_pos + ): df_gene_snp.iloc[i, 4] = df_gene_snp.iloc[j]["gene"] break j -= 1 - + # remove SNPs that have no corresponding genes df_gene_snp = df_gene_snp[~df_gene_snp.gene.isnull()] return df_gene_snp -def create_haplotype_block_ranges(df_gene_snp, adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, initial_min_umi=15): +def create_haplotype_block_ranges( + df_gene_snp, + adata, + cell_snp_Aallele, + cell_snp_Ballele, + unique_snp_ids, + initial_min_umi=15, +): """ Initially block SNPs along genome. @@ -547,73 +863,124 @@ def create_haplotype_block_ranges(df_gene_snp, adata, cell_snp_Aallele, cell_snp Gene and SNP info combined into a single data frame sorted by genomic positions. "is_interval" suggest whether the entry is a gene or a SNP. "gene" column either contain gene name if the entry is a gene, or the gene a SNP belongs to if the entry is a SNP. 
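+
+    Returns the same data frame with an added "block_id" column. Summarizing the
+    logic below: gene regions are first merged whenever they overlap on the same
+    chromosome, and consecutive regions are then concatenated until the running
+    block has at least initial_min_umi SNP-covering UMIs summed over all spots
+    (a trailing block that falls short is merged into the previous block on the
+    same chromosome when possible).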
""" # first level: partition of genome: by gene regions (if two genes overlap, they are grouped to one region) - tmp_block_genome_intervals = list(zip( df_gene_snp[df_gene_snp.is_interval].CHR.values, df_gene_snp[df_gene_snp.is_interval].START.values, df_gene_snp[df_gene_snp.is_interval].END.values )) + tmp_block_genome_intervals = list( + zip( + df_gene_snp[df_gene_snp.is_interval].CHR.values, + df_gene_snp[df_gene_snp.is_interval].START.values, + df_gene_snp[df_gene_snp.is_interval].END.values, + ) + ) block_genome_intervals = [tmp_block_genome_intervals[0]] for x in tmp_block_genome_intervals[1:]: # check whether overlap with previous block - if x[0] == block_genome_intervals[-1][0] and max(x[1], block_genome_intervals[-1][1]) < min(x[2], block_genome_intervals[-1][2]): - block_genome_intervals[-1] = (x[0], min(x[1], block_genome_intervals[-1][1]), max(x[2], block_genome_intervals[-1][2])) + if x[0] == block_genome_intervals[-1][0] and max( + x[1], block_genome_intervals[-1][1] + ) < min(x[2], block_genome_intervals[-1][2]): + block_genome_intervals[-1] = ( + x[0], + min(x[1], block_genome_intervals[-1][1]), + max(x[2], block_genome_intervals[-1][2]), + ) else: block_genome_intervals.append(x) # get block_ranges in the index of df_gene_snp block_ranges = [] for x in block_genome_intervals: - indexes = np.where((df_gene_snp.CHR.values == x[0]) & \ - (np.maximum(df_gene_snp.START.values, x[1]) < np.minimum(df_gene_snp.END.values, x[2])) )[0] - block_ranges.append( (indexes[0], indexes[-1]+1) ) - assert np.all( np.array(np.array([x[1] for x in block_ranges[:-1]])) == np.array(np.array([x[0] for x in block_ranges[1:]])) ) + indexes = np.where( + (df_gene_snp.CHR.values == x[0]) + & ( + np.maximum(df_gene_snp.START.values, x[1]) + < np.minimum(df_gene_snp.END.values, x[2]) + ) + )[0] + block_ranges.append((indexes[0], indexes[-1] + 1)) + assert np.all( + np.array(np.array([x[1] for x in block_ranges[:-1]])) + == np.array(np.array([x[0] for x in block_ranges[1:]])) + ) # record the initial block id in df_gene_snps df_gene_snp["initial_block_id"] = 0 - for i,x in enumerate(block_ranges): - df_gene_snp.iloc[x[0]:x[1], -1] = i + for i, x in enumerate(block_ranges): + df_gene_snp.iloc[x[0] : x[1], -1] = i # second level: group the first level blocks into haplotype blocks such that the minimum SNP-covering UMI counts >= initial_min_umi - map_snp_index = {x:i for i,x in enumerate(unique_snp_ids)} - initial_block_chr = df_gene_snp.CHR.values[ np.array([x[0] for x in block_ranges]) ] + map_snp_index = {x: i for i, x in enumerate(unique_snp_ids)} + initial_block_chr = df_gene_snp.CHR.values[np.array([x[0] for x in block_ranges])] block_ranges_new = [] s = 0 while s < len(block_ranges): t = s while t <= len(block_ranges): t += 1 - reach_end = (t == len(block_ranges)) - change_chr = (initial_block_chr[s] != initial_block_chr[t-1]) + reach_end = t == len(block_ranges) + change_chr = initial_block_chr[s] != initial_block_chr[t - 1] # count SNP-covering UMI - involved_snps_ids = df_gene_snp[ (df_gene_snp.initial_block_id>=s) & (df_gene_snp.initial_block_id= s) & (df_gene_snp.initial_block_id < t) + ].snp_id involved_snps_ids = involved_snps_ids[~involved_snps_ids.isnull()].values involved_snp_idx = np.array([map_snp_index[x] for x in involved_snps_ids]) - this_snp_umis = 0 if len(involved_snp_idx) == 0 else np.sum(cell_snp_Aallele[:, involved_snp_idx]) + np.sum(cell_snp_Ballele[:, involved_snp_idx]) + this_snp_umis = ( + 0 + if len(involved_snp_idx) == 0 + else np.sum(cell_snp_Aallele[:, involved_snp_idx]) 
+ + np.sum(cell_snp_Ballele[:, involved_snp_idx]) + ) if reach_end: break if change_chr: t -= 1 # re-count SNP-covering UMIs - involved_snps_ids = df_gene_snp.snp_id.iloc[block_ranges[s][0]:block_ranges[t-1][1]] - involved_snps_ids = involved_snps_ids[~involved_snps_ids.isnull()].values - involved_snp_idx = np.array([map_snp_index[x] for x in involved_snps_ids]) - this_snp_umis = 0 if len(involved_snp_idx) == 0 else np.sum( cell_snp_Aallele[:, involved_snp_idx]) + np.sum(cell_snp_Ballele[:, involved_snp_idx]) + involved_snps_ids = df_gene_snp.snp_id.iloc[ + block_ranges[s][0] : block_ranges[t - 1][1] + ] + involved_snps_ids = involved_snps_ids[ + ~involved_snps_ids.isnull() + ].values + involved_snp_idx = np.array( + [map_snp_index[x] for x in involved_snps_ids] + ) + this_snp_umis = ( + 0 + if len(involved_snp_idx) == 0 + else np.sum(cell_snp_Aallele[:, involved_snp_idx]) + + np.sum(cell_snp_Ballele[:, involved_snp_idx]) + ) break if this_snp_umis >= initial_min_umi: break # - if this_snp_umis < initial_min_umi and s > 0 and initial_block_chr[s-1] == initial_block_chr[s]: + if ( + this_snp_umis < initial_min_umi + and s > 0 + and initial_block_chr[s - 1] == initial_block_chr[s] + ): indexes = np.where(df_gene_snp.initial_block_id.isin(np.arange(s, t)))[0] - block_ranges_new[-1] = (block_ranges_new[-1][0], indexes[-1]+1) + block_ranges_new[-1] = (block_ranges_new[-1][0], indexes[-1] + 1) else: indexes = np.where(df_gene_snp.initial_block_id.isin(np.arange(s, t)))[0] - block_ranges_new.append( (indexes[0], indexes[-1]+1) ) + block_ranges_new.append((indexes[0], indexes[-1] + 1)) s = t - + # record the block id in df_gene_snps df_gene_snp["block_id"] = 0 - for i,x in enumerate(block_ranges_new): - df_gene_snp.iloc[x[0]:x[1], -1] = i + for i, x in enumerate(block_ranges_new): + df_gene_snp.iloc[x[0] : x[1], -1] = i df_gene_snp = df_gene_snp.drop(columns=["initial_block_id"]) return df_gene_snp -def summarize_counts_for_blocks(df_gene_snp, adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, nu, logphase_shift, geneticmap_file): +def summarize_counts_for_blocks( + df_gene_snp, + adata, + cell_snp_Aallele, + cell_snp_Ballele, + unique_snp_ids, + nu, + logphase_shift, + geneticmap_file, +): """ Attributes: ---------- @@ -630,7 +997,7 @@ def summarize_counts_for_blocks(df_gene_snp, adata, cell_snp_Aallele, cell_snp_B single_base_nb_mean : array, (n_blocks, n_spots) Baseline transcript counts in normal diploid per block per cell. - + single_total_bb_RD : array, (n_blocks, n_spots) Total allele count per block per cell. 
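As a rough, standalone illustration of the second-level grouping reformatted above -- greedily extending a block until its pooled SNP-covering UMI count reaches initial_min_umi, never extending across a chromosome boundary, and folding an undersized trailing block into its predecessor -- the sketch below uses simplified stand-in arrays (block_umi, block_chr) instead of the per-block counts derived from cell_snp_Aallele/cell_snp_Ballele; it is not the exact CalicoST implementation.

import numpy as np

def greedy_umi_blocks(block_umi, block_chr, initial_min_umi):
    # Merge consecutive first-level blocks until each merged block carries at
    # least `initial_min_umi` SNP-covering UMIs, without crossing chromosomes.
    ranges = []
    s, n = 0, len(block_umi)
    while s < n:
        t = s + 1
        while t < n and block_chr[t] == block_chr[s] and np.sum(block_umi[s:t]) < initial_min_umi:
            t += 1
        if ranges and block_chr[s] == block_chr[ranges[-1][0]] and np.sum(block_umi[s:t]) < initial_min_umi:
            # undersized leftover on the same chromosome: absorb it into the previous block
            ranges[-1] = (ranges[-1][0], t)
        else:
            ranges.append((s, t))
        s = t
    return ranges

# e.g. greedy_umi_blocks([120, 30, 40, 500], ["chr1"] * 4, 150) -> [(0, 2), (2, 4)]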
@@ -642,42 +1009,76 @@ def summarize_counts_for_blocks(df_gene_snp, adata, cell_snp_Aallele, cell_snp_B single_base_nb_mean = np.zeros((len(blocks), adata.shape[0])) single_total_bb_RD = np.zeros((len(blocks), adata.shape[0]), dtype=int) # summarize counts of involved genes and SNPs within each block - map_snp_index = {x:i for i,x in enumerate(unique_snp_ids)} - df_block_contents = df_gene_snp.groupby('block_id').agg({"snp_id":list, "gene":list}) + map_snp_index = {x: i for i, x in enumerate(unique_snp_ids)} + df_block_contents = df_gene_snp.groupby("block_id").agg( + {"snp_id": list, "gene": list} + ) for b in range(df_block_contents.shape[0]): # BAF (SNPs) - involved_snps_ids = [x for x in df_block_contents.snp_id.values[b] if not x is None] + involved_snps_ids = [ + x for x in df_block_contents.snp_id.values[b] if not x is None + ] involved_snp_idx = np.array([map_snp_index[x] for x in involved_snps_ids]) if len(involved_snp_idx) > 0: - single_X[b, 1, :] = np.sum( cell_snp_Aallele[:, involved_snp_idx], axis=1 ) - single_total_bb_RD[b, :] = np.sum( cell_snp_Aallele[:, involved_snp_idx], axis=1 ) + np.sum( cell_snp_Ballele[:, involved_snp_idx], axis=1 ) + single_X[b, 1, :] = np.sum(cell_snp_Aallele[:, involved_snp_idx], axis=1) + single_total_bb_RD[b, :] = np.sum( + cell_snp_Aallele[:, involved_snp_idx], axis=1 + ) + np.sum(cell_snp_Ballele[:, involved_snp_idx], axis=1) # RDR (genes) - involved_genes = list(set([x for x in df_block_contents.gene.values[b] if not x is None])) + involved_genes = list( + set([x for x in df_block_contents.gene.values[b] if not x is None]) + ) if len(involved_genes) > 0: - single_X[b, 0, :] = np.sum( adata.layers['count'][:, adata.var.index.isin(involved_genes)], axis=1 ) + single_X[b, 0, :] = np.sum( + adata.layers["count"][:, adata.var.index.isin(involved_genes)], axis=1 + ) # lengths lengths = np.zeros(len(df_gene_snp.CHR.unique()), dtype=int) - for i,c in enumerate( df_gene_snp.CHR.unique() ): - lengths[i] = len( df_gene_snp[df_gene_snp.CHR == c].block_id.unique() ) + for i, c in enumerate(df_gene_snp.CHR.unique()): + lengths[i] = len(df_gene_snp[df_gene_snp.CHR == c].block_id.unique()) # phase switch probability from genetic distance - sorted_chr_pos_first = df_gene_snp.groupby('block_id').agg({'CHR': 'first', 'START': 'first'}) - sorted_chr_pos_first = list(zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values)) - sorted_chr_pos_last = df_gene_snp.groupby('block_id').agg({'CHR': 'last', 'END': 'last'}) - sorted_chr_pos_last = list(zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values)) + sorted_chr_pos_first = df_gene_snp.groupby("block_id").agg( + {"CHR": "first", "START": "first"} + ) + sorted_chr_pos_first = list( + zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values) + ) + sorted_chr_pos_last = df_gene_snp.groupby("block_id").agg( + {"CHR": "last", "END": "last"} + ) + sorted_chr_pos_last = list( + zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values) + ) # - tmp_sorted_chr_pos = [val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair] - position_cM = get_position_cM_table( tmp_sorted_chr_pos, geneticmap_file ) - phase_switch_prob = compute_phase_switch_probability_position(position_cM, tmp_sorted_chr_pos, nu) - log_sitewise_transmat = np.minimum(np.log(0.5), np.log(phase_switch_prob) - logphase_shift) + tmp_sorted_chr_pos = [ + val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair + ] + position_cM = 
get_position_cM_table(tmp_sorted_chr_pos, geneticmap_file) + phase_switch_prob = compute_phase_switch_probability_position( + position_cM, tmp_sorted_chr_pos, nu + ) + log_sitewise_transmat = np.minimum( + np.log(0.5), np.log(phase_switch_prob) - logphase_shift + ) # log_sitewise_transmat = log_sitewise_transmat[np.arange(0, len(log_sitewise_transmat), 2)] - log_sitewise_transmat = log_sitewise_transmat[np.arange(1, len(log_sitewise_transmat), 2)] - - return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat - - -def choose_umithreshold_given_nbins(single_total_bb_RD, refined_lengths, expected_nbins): + log_sitewise_transmat = log_sitewise_transmat[ + np.arange(1, len(log_sitewise_transmat), 2) + ] + + return ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + ) + + +def choose_umithreshold_given_nbins( + single_total_bb_RD, refined_lengths, expected_nbins +): def count_num_bins(per_snp_umi, refined_lengths, secondary_min_umi): cumlen = 0 s = 0 @@ -693,6 +1094,7 @@ def count_num_bins(per_snp_umi, refined_lengths, secondary_min_umi): s = t cumlen += le return bin_counter + per_snp_umi = np.sum(single_total_bb_RD, axis=1) # candicate range lo = np.sort(per_snp_umi)[-expected_nbins] @@ -711,7 +1113,25 @@ def count_num_bins(per_snp_umi, refined_lengths, secondary_min_umi): return mid -def perform_binning_new(lengths, single_X, single_base_nb_mean, single_total_bb_RD, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps, phase_indicator, refined_lengths, binsize, rdrbinsize, nu, logphase_shift, geneticmap_file, secondary_min_umi=1000, max_binlength=5e6): +def perform_binning_new( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + sorted_chr_pos, + sorted_chr_pos_last, + x_gene_list, + n_snps, + phase_indicator, + refined_lengths, + binsize, + rdrbinsize, + nu, + logphase_shift, + geneticmap_file, + secondary_min_umi=1000, + max_binlength=5e6, +): per_snp_umi = np.sum(single_total_bb_RD, axis=1) # secondary_min_umi = np.percentile(per_snp_umi, secondary_percentile) # bin both RDR and BAF @@ -731,12 +1151,15 @@ def perform_binning_new(lengths, single_X, single_base_nb_mean, single_total_bb_ t = s + 1 while t < cumlen + le and np.sum(per_snp_umi[s:t]) < secondary_min_umi: t += 1 - if sorted_chr_pos_last[t-1][1] - sorted_chr_pos[s][1] >= max_binlength: - t = max(t-1, s+1) + if ( + sorted_chr_pos_last[t - 1][1] - sorted_chr_pos[s][1] + >= max_binlength + ): + t = max(t - 1, s + 1) break # expand binsize by minimum number of genes - this_genes = sum([ x_gene_list[i].split(" ") for i in range(s,t) ], []) - this_genes = [z for z in this_genes if z!=""] + this_genes = sum([x_gene_list[i].split(" ") for i in range(s, t)], []) + this_genes = [z for z in this_genes if z != ""] idx_A = np.where(phase_indicator[s:t])[0] idx_B = np.where(~phase_indicator[s:t])[0] # if np.sum(per_snp_umi[s:t]) >= secondary_min_umi or sorted_chr_pos[s][0] != bin_sorted_chr_pos_last[-1][0]: @@ -749,36 +1172,65 @@ def perform_binning_new(lengths, single_X, single_base_nb_mean, single_total_bb_ # bin_x_gene_list.append( " ".join(this_genes) ) # bin_n_snps.append( np.sum(n_snps[s:t]) ) # else: - # bin_single_X_rdr[-1] += np.sum(single_X[s:t, 0, :], axis=0) + # bin_single_X_rdr[-1] += np.sum(single_X[s:t, 0, :], axis=0) # bin_single_X_baf[-1] += np.sum(single_X[s:t, 1, :][idx_A,:], axis=0) + np.sum(single_total_bb_RD[s:t, :][idx_B,:] - single_X[s:t, 1, :][idx_B,:], axis=0) # bin_single_base_nb_mean[-1] += 
np.sum(single_base_nb_mean[s:t, :], axis=0) # bin_single_total_bb_RD[-1] += np.sum(single_total_bb_RD[s:t, :], axis=0) # bin_sorted_chr_pos_last[-1] = sorted_chr_pos_last[t-1] # if len(this_genes) > 0: # bin_x_gene_list[-1] += " " + " ".join(this_genes) - # bin_n_snps[-1] += np.sum(n_snps[s:t]) - if len(bin_sorted_chr_pos_last) > 0 and sorted_chr_pos[s][0] == bin_sorted_chr_pos_last[-1][0] and \ - np.sum(per_snp_umi[s:t]) < 0.5*secondary_min_umi and sorted_chr_pos_last[t-1][1] - sorted_chr_pos[s][1] < 0.5*max_binlength: - bin_single_X_rdr[-1] += np.sum(single_X[s:t, 0, :], axis=0) - bin_single_X_baf[-1] += np.sum(single_X[s:t, 1, :][idx_A,:], axis=0) + np.sum(single_total_bb_RD[s:t, :][idx_B,:] - single_X[s:t, 1, :][idx_B,:], axis=0) - bin_single_base_nb_mean[-1] += np.sum(single_base_nb_mean[s:t, :], axis=0) + # bin_n_snps[-1] += np.sum(n_snps[s:t]) + if ( + len(bin_sorted_chr_pos_last) > 0 + and sorted_chr_pos[s][0] == bin_sorted_chr_pos_last[-1][0] + and np.sum(per_snp_umi[s:t]) < 0.5 * secondary_min_umi + and sorted_chr_pos_last[t - 1][1] - sorted_chr_pos[s][1] + < 0.5 * max_binlength + ): + bin_single_X_rdr[-1] += np.sum(single_X[s:t, 0, :], axis=0) + bin_single_X_baf[-1] += np.sum( + single_X[s:t, 1, :][idx_A, :], axis=0 + ) + np.sum( + single_total_bb_RD[s:t, :][idx_B, :] + - single_X[s:t, 1, :][idx_B, :], + axis=0, + ) + bin_single_base_nb_mean[-1] += np.sum( + single_base_nb_mean[s:t, :], axis=0 + ) bin_single_total_bb_RD[-1] += np.sum(single_total_bb_RD[s:t, :], axis=0) - bin_sorted_chr_pos_last[-1] = sorted_chr_pos_last[t-1] + bin_sorted_chr_pos_last[-1] = sorted_chr_pos_last[t - 1] if len(this_genes) > 0: - bin_x_gene_list[-1] += " " + " ".join(this_genes) + bin_x_gene_list[-1] += " " + " ".join(this_genes) bin_n_snps[-1] += np.sum(n_snps[s:t]) else: - bin_single_X_rdr.append( np.sum(single_X[s:t, 0, :], axis=0) ) - bin_single_X_baf.append( np.sum(single_X[s:t, 1, :][idx_A,:], axis=0) + np.sum(single_total_bb_RD[s:t, :][idx_B,:] - single_X[s:t, 1, :][idx_B,:], axis=0) ) - bin_single_base_nb_mean.append( np.sum(single_base_nb_mean[s:t, :], axis=0) ) - bin_single_total_bb_RD.append( np.sum(single_total_bb_RD[s:t, :], axis=0) ) - bin_sorted_chr_pos_first.append( sorted_chr_pos[s] ) - bin_sorted_chr_pos_last.append( sorted_chr_pos_last[t-1] ) - bin_x_gene_list.append( " ".join(this_genes) ) - bin_n_snps.append( np.sum(n_snps[s:t]) ) + bin_single_X_rdr.append(np.sum(single_X[s:t, 0, :], axis=0)) + bin_single_X_baf.append( + np.sum(single_X[s:t, 1, :][idx_A, :], axis=0) + + np.sum( + single_total_bb_RD[s:t, :][idx_B, :] + - single_X[s:t, 1, :][idx_B, :], + axis=0, + ) + ) + bin_single_base_nb_mean.append( + np.sum(single_base_nb_mean[s:t, :], axis=0) + ) + bin_single_total_bb_RD.append( + np.sum(single_total_bb_RD[s:t, :], axis=0) + ) + bin_sorted_chr_pos_first.append(sorted_chr_pos[s]) + bin_sorted_chr_pos_last.append(sorted_chr_pos_last[t - 1]) + bin_x_gene_list.append(" ".join(this_genes)) + bin_n_snps.append(np.sum(n_snps[s:t])) s = t cumlen += le - single_X = np.stack([ np.vstack([bin_single_X_rdr[i], bin_single_X_baf[i]]) for i in range(len(bin_single_X_rdr)) ]) + single_X = np.stack( + [ + np.vstack([bin_single_X_rdr[i], bin_single_X_baf[i]]) + for i in range(len(bin_single_X_rdr)) + ] + ) single_base_nb_mean = np.vstack(bin_single_base_nb_mean) single_total_bb_RD = np.vstack(bin_single_total_bb_RD) sorted_chr_pos_first = bin_sorted_chr_pos_first @@ -787,44 +1239,66 @@ def perform_binning_new(lengths, single_X, single_base_nb_mean, single_total_bb_ n_snps = bin_n_snps # phase 
switch probability from genetic distance - tmp_sorted_chr_pos = [val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair] + tmp_sorted_chr_pos = [ + val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair + ] sorted_chr = np.array([x[0] for x in tmp_sorted_chr_pos]) - position_cM = get_position_cM_table( tmp_sorted_chr_pos, geneticmap_file ) - phase_switch_prob = compute_phase_switch_probability_position(position_cM, tmp_sorted_chr_pos, nu) + position_cM = get_position_cM_table(tmp_sorted_chr_pos, geneticmap_file) + phase_switch_prob = compute_phase_switch_probability_position( + position_cM, tmp_sorted_chr_pos, nu + ) log_sitewise_transmat = np.log(phase_switch_prob) - logphase_shift # log_sitewise_transmat = log_sitewise_transmat[np.arange(0, len(log_sitewise_transmat), 2)] - log_sitewise_transmat = log_sitewise_transmat[np.arange(1, len(log_sitewise_transmat), 2)] + log_sitewise_transmat = log_sitewise_transmat[ + np.arange(1, len(log_sitewise_transmat), 2) + ] sorted_chr = np.array([x[0] for x in sorted_chr_pos_first]) unique_chrs = [sorted_chr[0]] for x in sorted_chr[1:]: if x != unique_chrs[-1]: - unique_chrs.append( x ) - lengths = np.array([ np.sum(sorted_chr == chrname) for chrname in unique_chrs ]) - + unique_chrs.append(x) + lengths = np.array([np.sum(sorted_chr == chrname) for chrname in unique_chrs]) + # bin RDR s = 0 while s < single_X.shape[0]: - t = s+1 - this_genes = sum([ x_gene_list[i].split(" ") for i in range(s,t) ], []) - this_genes = [z for z in this_genes if z!=""] + t = s + 1 + this_genes = sum([x_gene_list[i].split(" ") for i in range(s, t)], []) + this_genes = [z for z in this_genes if z != ""] while t < single_X.shape[0] and len(this_genes) < rdrbinsize: t += 1 - this_genes += x_gene_list[t-1].split(" ") - this_genes = [z for z in this_genes if z!=""] + this_genes += x_gene_list[t - 1].split(" ") + this_genes = [z for z in this_genes if z != ""] single_X[s, 0, :] = np.sum(single_X[s:t, 0, :], axis=0) - single_X[(s+1):t, 0, :] = 0 + single_X[(s + 1) : t, 0, :] = 0 single_base_nb_mean[s, :] = np.sum(single_base_nb_mean[s:t, :], axis=0) - single_base_nb_mean[(s+1):t, :] = 0 + single_base_nb_mean[(s + 1) : t, :] = 0 x_gene_list[s] = " ".join(this_genes) - for k in range(s+1,t): + for k in range(s + 1, t): x_gene_list[k] = "" s = t - return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, sorted_chr_pos_first, sorted_chr_pos_last, x_gene_list, n_snps - - -def create_bin_ranges(df_gene_snp, single_total_bb_RD, refined_lengths, secondary_min_umi, max_binlength=5e6): + return ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + sorted_chr_pos_first, + sorted_chr_pos_last, + x_gene_list, + n_snps, + ) + + +def create_bin_ranges( + df_gene_snp, + single_total_bb_RD, + refined_lengths, + secondary_min_umi, + max_binlength=5e6, +): """ Aggregate haplotype blocks to bins @@ -844,7 +1318,10 @@ def create_bin_ranges(df_gene_snp, single_total_bb_RD, refined_lengths, secondar df_gene_snp : data frame, (CHR, START, END, snp_id, gene, is_interval, block_id, bin_id) The newly added bin_id column indicates which bin each gene or SNP belongs to. 
""" - def greedy_binning_nobreak(block_lengths, block_umi, secondary_min_umi, max_binlength): + + def greedy_binning_nobreak( + block_lengths, block_umi, secondary_min_umi, max_binlength + ): """ Returns ------- @@ -859,54 +1336,80 @@ def greedy_binning_nobreak(block_lengths, block_umi, secondary_min_umi, max_binl while t < len(block_lengths) and np.sum(block_umi[s:t]) < secondary_min_umi: t += 1 if np.sum(block_lengths[s:t]) >= max_binlength: - t = max(t-1, s+1) + t = max(t - 1, s + 1) break # check whether it is a very small bin in the end - if s > 0 and t == len(block_lengths) and np.sum(block_umi[s:t]) < 0.5*secondary_min_umi and np.sum(block_lengths[s:t]) < 0.5*max_binlength: + if ( + s > 0 + and t == len(block_lengths) + and np.sum(block_umi[s:t]) < 0.5 * secondary_min_umi + and np.sum(block_lengths[s:t]) < 0.5 * max_binlength + ): bin_ranges[-1][1] = t else: - bin_ranges.append( [s,t] ) + bin_ranges.append([s, t]) s = t bin_ids = np.zeros(len(block_lengths), dtype=int) - for i,x in enumerate(bin_ranges): - bin_ids[x[0]:x[1]] = i + for i, x in enumerate(bin_ranges): + bin_ids[x[0] : x[1]] = i return bin_ids - + # block lengths and block umis - sorted_chr_pos_both = df_gene_snp.groupby('block_id').agg({'CHR': 'first', 'START': 'first', 'END': 'last'}) + sorted_chr_pos_both = df_gene_snp.groupby("block_id").agg( + {"CHR": "first", "START": "first", "END": "last"} + ) block_lengths = sorted_chr_pos_both.END.values - sorted_chr_pos_both.START.values block_umi = np.sum(single_total_bb_RD, axis=1) n_blocks = len(block_lengths) - + # get a list of breakpoints where bin much break - breakpoints = np.concatenate([ np.cumsum(refined_lengths), np.where(block_lengths > max_binlength)[0], np.where(block_lengths > max_binlength)[0]+1 ]) - breakpoints =np.sort(np.unique(breakpoints)) + breakpoints = np.concatenate( + [ + np.cumsum(refined_lengths), + np.where(block_lengths > max_binlength)[0], + np.where(block_lengths > max_binlength)[0] + 1, + ] + ) + breakpoints = np.sort(np.unique(breakpoints)) # append 0 in the front of breakpoints so that each pair of adjacent breakpoints can be an input to greedy_binning_nobreak if breakpoints[0] != 0: - breakpoints = np.append( [0], breakpoints ) + breakpoints = np.append([0], breakpoints) assert np.all(breakpoints[:-1] < breakpoints[1:]) # loop over breakpoints and bin each block bin_ids = np.zeros(n_blocks, dtype=int) offset = 0 - for i in range(len(breakpoints)-1): + for i in range(len(breakpoints) - 1): b1 = breakpoints[i] - b2 = breakpoints[i+1] + b2 = breakpoints[i + 1] if b2 - b1 == 1: bin_ids[b1:b2] = offset offset += 1 else: - this_bin_ids = greedy_binning_nobreak(block_lengths[b1:b2], block_umi[b1:b2], secondary_min_umi, max_binlength) + this_bin_ids = greedy_binning_nobreak( + block_lengths[b1:b2], block_umi[b1:b2], secondary_min_umi, max_binlength + ) bin_ids[b1:b2] = offset + this_bin_ids offset += np.max(this_bin_ids) + 1 - + # append bin_ids to df_gene_snp - df_gene_snp["bin_id"] = df_gene_snp.block_id.map({i:x for i,x in enumerate(bin_ids)}) - + df_gene_snp["bin_id"] = df_gene_snp.block_id.map( + {i: x for i, x in enumerate(bin_ids)} + ) + return df_gene_snp -def summarize_counts_for_bins(df_gene_snp, adata, single_X, single_total_bb_RD, phase_indicator, nu, logphase_shift, geneticmap_file): +def summarize_counts_for_bins( + df_gene_snp, + adata, + single_X, + single_total_bb_RD, + phase_indicator, + nu, + logphase_shift, + geneticmap_file, +): """ Attributes: ---------- @@ -923,7 +1426,7 @@ def summarize_counts_for_bins(df_gene_snp, adata, 
single_X, single_total_bb_RD, single_base_nb_mean : array, (n_blocks, n_spots) Baseline transcript counts in normal diploid per block per cell. - + single_total_bb_RD : array, (n_blocks, n_spots) Total allele count per block per cell. @@ -935,62 +1438,127 @@ def summarize_counts_for_bins(df_gene_snp, adata, single_X, single_total_bb_RD, bin_single_base_nb_mean = np.zeros((len(bins), adata.shape[0])) bin_single_total_bb_RD = np.zeros((len(bins), adata.shape[0]), dtype=int) # summarize counts of involved genes and SNPs within each block - df_bin_contents = df_gene_snp[~df_gene_snp.bin_id.isnull()].groupby('bin_id').agg({"block_id":set, "gene":set}) + df_bin_contents = ( + df_gene_snp[~df_gene_snp.bin_id.isnull()] + .groupby("bin_id") + .agg({"block_id": set, "gene": set}) + ) for b in range(df_bin_contents.shape[0]): # BAF (SNPs) - involved_blocks = [x for x in df_bin_contents.block_id.values[b] if not x is None] - this_phased = np.where(phase_indicator[involved_blocks].reshape(-1,1), single_X[involved_blocks, 1, :], single_total_bb_RD[involved_blocks, :] - single_X[involved_blocks, 1, :]) + involved_blocks = [ + x for x in df_bin_contents.block_id.values[b] if not x is None + ] + this_phased = np.where( + phase_indicator[involved_blocks].reshape(-1, 1), + single_X[involved_blocks, 1, :], + single_total_bb_RD[involved_blocks, :] - single_X[involved_blocks, 1, :], + ) bin_single_X[b, 1, :] = np.sum(this_phased, axis=0) - bin_single_total_bb_RD[b, :] = np.sum( single_total_bb_RD[involved_blocks, :], axis=0 ) + bin_single_total_bb_RD[b, :] = np.sum( + single_total_bb_RD[involved_blocks, :], axis=0 + ) # RDR (genes) involved_genes = [x for x in df_bin_contents.gene.values[b] if not x is None] - bin_single_X[b, 0, :] = np.sum( adata.layers['count'][:, adata.var.index.isin(involved_genes)], axis=1 ) + bin_single_X[b, 0, :] = np.sum( + adata.layers["count"][:, adata.var.index.isin(involved_genes)], axis=1 + ) # lengths lengths = np.zeros(len(df_gene_snp.CHR.unique()), dtype=int) - for i,c in enumerate( df_gene_snp.CHR.unique() ): - lengths[i] = len( df_gene_snp[ (df_gene_snp.CHR == c) & (~df_gene_snp.bin_id.isnull()) ].bin_id.unique() ) + for i, c in enumerate(df_gene_snp.CHR.unique()): + lengths[i] = len( + df_gene_snp[ + (df_gene_snp.CHR == c) & (~df_gene_snp.bin_id.isnull()) + ].bin_id.unique() + ) # phase switch probability from genetic distance - sorted_chr_pos_first = df_gene_snp.groupby('bin_id').agg({'CHR': 'first', 'START': 'first'}) - sorted_chr_pos_first = list(zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values)) - sorted_chr_pos_last = df_gene_snp.groupby('bin_id').agg({'CHR': 'last', 'END': 'last'}) - sorted_chr_pos_last = list(zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values)) + sorted_chr_pos_first = df_gene_snp.groupby("bin_id").agg( + {"CHR": "first", "START": "first"} + ) + sorted_chr_pos_first = list( + zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values) + ) + sorted_chr_pos_last = df_gene_snp.groupby("bin_id").agg( + {"CHR": "last", "END": "last"} + ) + sorted_chr_pos_last = list( + zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values) + ) # - tmp_sorted_chr_pos = [val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair] - position_cM = get_position_cM_table( tmp_sorted_chr_pos, geneticmap_file ) - phase_switch_prob = compute_phase_switch_probability_position(position_cM, tmp_sorted_chr_pos, nu) - log_sitewise_transmat = np.minimum(np.log(0.5), np.log(phase_switch_prob) - 
logphase_shift) + tmp_sorted_chr_pos = [ + val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair + ] + position_cM = get_position_cM_table(tmp_sorted_chr_pos, geneticmap_file) + phase_switch_prob = compute_phase_switch_probability_position( + position_cM, tmp_sorted_chr_pos, nu + ) + log_sitewise_transmat = np.minimum( + np.log(0.5), np.log(phase_switch_prob) - logphase_shift + ) # log_sitewise_transmat = log_sitewise_transmat[np.arange(0, len(log_sitewise_transmat), 2)] - log_sitewise_transmat = log_sitewise_transmat[np.arange(1, len(log_sitewise_transmat), 2)] - - return lengths, bin_single_X, bin_single_base_nb_mean, bin_single_total_bb_RD, log_sitewise_transmat - - -def bin_selection_basedon_normal(df_gene_snp, single_X, single_base_nb_mean, single_total_bb_RD, nu, logphase_shift, index_normal, geneticmap_file, confidence_interval=[0.05, 0.95], min_betabinom_tau=30): + log_sitewise_transmat = log_sitewise_transmat[ + np.arange(1, len(log_sitewise_transmat), 2) + ] + + return ( + lengths, + bin_single_X, + bin_single_base_nb_mean, + bin_single_total_bb_RD, + log_sitewise_transmat, + ) + + +def bin_selection_basedon_normal( + df_gene_snp, + single_X, + single_base_nb_mean, + single_total_bb_RD, + nu, + logphase_shift, + index_normal, + geneticmap_file, + confidence_interval=[0.05, 0.95], + min_betabinom_tau=30, +): """ Filter out bins that potential contain somatic mutations based on BAF of normal spots. """ # pool B allele counts for each bin across all normal spots tmpX = np.sum(single_X[:, 1, index_normal], axis=1) tmptotal_bb_RD = np.sum(single_total_bb_RD[:, index_normal], axis=1) - model = Weighted_BetaBinom(tmpX, np.ones(len(tmpX)), weights=np.ones(len(tmpX)), exposure=tmptotal_bb_RD) + model = Weighted_BetaBinom( + tmpX, np.ones(len(tmpX)), weights=np.ones(len(tmpX)), exposure=tmptotal_bb_RD + ) tmpres = model.fit(disp=0) tmpres.params[0] = 0.5 tmpres.params[-1] = max(tmpres.params[-1], min_betabinom_tau) # remove bins if normal B allele frequencies fall out of 5%-95% probability range - removal_indicator1 = (tmpX < scipy.stats.betabinom.ppf(confidence_interval[0], tmptotal_bb_RD, tmpres.params[0] * tmpres.params[1], (1-tmpres.params[0]) * tmpres.params[1])) - removal_indicator2 = (tmpX > scipy.stats.betabinom.ppf(confidence_interval[1], tmptotal_bb_RD, tmpres.params[0] * tmpres.params[1], (1-tmpres.params[0]) * tmpres.params[1])) - print( np.sum(removal_indicator1 | removal_indicator2) ) + removal_indicator1 = tmpX < scipy.stats.betabinom.ppf( + confidence_interval[0], + tmptotal_bb_RD, + tmpres.params[0] * tmpres.params[1], + (1 - tmpres.params[0]) * tmpres.params[1], + ) + removal_indicator2 = tmpX > scipy.stats.betabinom.ppf( + confidence_interval[1], + tmptotal_bb_RD, + tmpres.params[0] * tmpres.params[1], + (1 - tmpres.params[0]) * tmpres.params[1], + ) + print(np.sum(removal_indicator1 | removal_indicator2)) index_removal = np.where(removal_indicator1 | removal_indicator2)[0] index_remaining = np.where(~(removal_indicator1 | removal_indicator2))[0] # # change df_gene_snp col = np.where(df_gene_snp.columns == "bin_id")[0][0] - df_gene_snp.iloc[ np.where(df_gene_snp.bin_id.isin(index_removal))[0], col] = None + df_gene_snp.iloc[np.where(df_gene_snp.bin_id.isin(index_removal))[0], col] = None # remap bin_id to existing list - df_gene_snp['bin_id'] = df_gene_snp['bin_id'].map({x:i for i,x in enumerate(index_remaining)}) - df_gene_snp.bin_id = df_gene_snp.bin_id.astype('Int64') + df_gene_snp["bin_id"] = df_gene_snp["bin_id"].map( + {x: i for i, x in 
enumerate(index_remaining)} + ) + df_gene_snp.bin_id = df_gene_snp.bin_id.astype("Int64") # change the related data matrices single_X = single_X[index_remaining, :, :] @@ -999,26 +1567,61 @@ def bin_selection_basedon_normal(df_gene_snp, single_X, single_base_nb_mean, sin # lengths lengths = np.zeros(len(df_gene_snp.CHR.unique()), dtype=int) - for i,c in enumerate( df_gene_snp.CHR.unique() ): - lengths[i] = len( df_gene_snp[ (df_gene_snp.CHR == c) & (~df_gene_snp.bin_id.isnull()) ].bin_id.unique() ) + for i, c in enumerate(df_gene_snp.CHR.unique()): + lengths[i] = len( + df_gene_snp[ + (df_gene_snp.CHR == c) & (~df_gene_snp.bin_id.isnull()) + ].bin_id.unique() + ) ## phase switch probability from genetic distance - sorted_chr_pos_first = df_gene_snp.groupby('bin_id').agg({'CHR': 'first', 'START': 'first'}) - sorted_chr_pos_first = list(zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values)) - sorted_chr_pos_last = df_gene_snp.groupby('bin_id').agg({'CHR': 'last', 'END': 'last'}) - sorted_chr_pos_last = list(zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values)) + sorted_chr_pos_first = df_gene_snp.groupby("bin_id").agg( + {"CHR": "first", "START": "first"} + ) + sorted_chr_pos_first = list( + zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values) + ) + sorted_chr_pos_last = df_gene_snp.groupby("bin_id").agg( + {"CHR": "last", "END": "last"} + ) + sorted_chr_pos_last = list( + zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values) + ) # - tmp_sorted_chr_pos = [val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair] - position_cM = get_position_cM_table( tmp_sorted_chr_pos, geneticmap_file ) - phase_switch_prob = compute_phase_switch_probability_position(position_cM, tmp_sorted_chr_pos, nu) - log_sitewise_transmat = np.minimum(np.log(0.5), np.log(phase_switch_prob) - logphase_shift) + tmp_sorted_chr_pos = [ + val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair + ] + position_cM = get_position_cM_table(tmp_sorted_chr_pos, geneticmap_file) + phase_switch_prob = compute_phase_switch_probability_position( + position_cM, tmp_sorted_chr_pos, nu + ) + log_sitewise_transmat = np.minimum( + np.log(0.5), np.log(phase_switch_prob) - logphase_shift + ) # log_sitewise_transmat = log_sitewise_transmat[np.arange(0, len(log_sitewise_transmat), 2)] - log_sitewise_transmat = log_sitewise_transmat[np.arange(1, len(log_sitewise_transmat), 2)] - - return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp - - -def filter_de_genes(exp_counts, x_gene_list, normal_candidate, sample_list=None, sample_ids=None, logfcthreshold=4, quantile_threshold=80): + log_sitewise_transmat = log_sitewise_transmat[ + np.arange(1, len(log_sitewise_transmat), 2) + ] + + return ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + df_gene_snp, + ) + + +def filter_de_genes( + exp_counts, + x_gene_list, + normal_candidate, + sample_list=None, + sample_ids=None, + logfcthreshold=4, + quantile_threshold=80, +): adata = anndata.AnnData(exp_counts) adata.layers["count"] = exp_counts.values adata.obs["normal_candidate"] = normal_candidate @@ -1026,7 +1629,7 @@ def filter_de_genes(exp_counts, x_gene_list, normal_candidate, sample_list=None, map_gene_adatavar = {} map_gene_umi = {} list_gene_umi = np.sum(adata.layers["count"], axis=0) - for i,x in enumerate(adata.var.index): + for i, x in enumerate(adata.var.index): map_gene_adatavar[x] = i 
map_gene_umi[x] = list_gene_umi[i] # @@ -1034,18 +1637,20 @@ def filter_de_genes(exp_counts, x_gene_list, normal_candidate, sample_list=None, sample_list = [None] # filtered_out_set = set() - for s,sname in enumerate(sample_list): + for s, sname in enumerate(sample_list): if sname is None: index = np.arange(adata.shape[0]) else: index = np.where(sample_ids == s)[0] tmpadata = adata[index, :].copy() # - umi_threshold = np.percentile( np.sum(tmpadata.layers["count"], axis=0), quantile_threshold ) + umi_threshold = np.percentile( + np.sum(tmpadata.layers["count"], axis=0), quantile_threshold + ) # sc.pp.filter_cells(tmpadata, min_genes=200) sc.pp.filter_genes(tmpadata, min_cells=10) - med = np.median( np.sum(tmpadata.layers["count"], axis=1) ) + med = np.median(np.sum(tmpadata.layers["count"], axis=1)) # sc.pp.normalize_total(tmpadata, target_sum=1e4) sc.pp.normalize_total(tmpadata, target_sum=med) sc.pp.log1p(tmpadata) @@ -1053,29 +1658,56 @@ def filter_de_genes(exp_counts, x_gene_list, normal_candidate, sample_list=None, sc.pp.pca(tmpadata, n_comps=4) kmeans = KMeans(n_clusters=2, random_state=0).fit(tmpadata.obsm["X_pca"]) kmeans_labels = kmeans.predict(tmpadata.obsm["X_pca"]) - idx_kmeans_label = np.argmax(np.bincount( kmeans_labels[tmpadata.obs["normal_candidate"]], minlength=2 )) + idx_kmeans_label = np.argmax( + np.bincount(kmeans_labels[tmpadata.obs["normal_candidate"]], minlength=2) + ) clone = np.array(["normal"] * tmpadata.shape[0]) - clone[ (kmeans_labels != idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) ] = "tumor" + clone[ + (kmeans_labels != idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) + ] = "tumor" tmpadata.obs["clone"] = clone # end added - sc.tl.rank_genes_groups(tmpadata, 'clone', groups=["tumor"], reference="normal", method='wilcoxon') - genenames = np.array([ x[0] for x in tmpadata.uns["rank_genes_groups"]["names"] ]) - logfc = np.array([ x[0] for x in tmpadata.uns["rank_genes_groups"]["logfoldchanges"] ]) - geneumis = np.array([ map_gene_umi[x] for x in genenames]) - this_filtered_out_set = set(list(genenames[ (np.abs(logfc) > logfcthreshold) & (geneumis > umi_threshold) ])) + sc.tl.rank_genes_groups( + tmpadata, "clone", groups=["tumor"], reference="normal", method="wilcoxon" + ) + genenames = np.array([x[0] for x in tmpadata.uns["rank_genes_groups"]["names"]]) + logfc = np.array( + [x[0] for x in tmpadata.uns["rank_genes_groups"]["logfoldchanges"]] + ) + geneumis = np.array([map_gene_umi[x] for x in genenames]) + this_filtered_out_set = set( + list( + genenames[(np.abs(logfc) > logfcthreshold) & (geneumis > umi_threshold)] + ) + ) filtered_out_set = filtered_out_set | this_filtered_out_set print(f"Filter out {len(filtered_out_set)} DE genes") # new_single_X_rdr = np.zeros((len(x_gene_list), adata.shape[0])) - for i,x in enumerate(x_gene_list): + for i, x in enumerate(x_gene_list): g_list = [z for z in x.split() if z != ""] - idx_genes = np.array([ map_gene_adatavar[g] for g in g_list if (not g in filtered_out_set) and (g in map_gene_adatavar)]) + idx_genes = np.array( + [ + map_gene_adatavar[g] + for g in g_list + if (not g in filtered_out_set) and (g in map_gene_adatavar) + ] + ) if len(idx_genes) > 0: new_single_X_rdr[i, :] = np.sum(adata.layers["count"][:, idx_genes], axis=1) return new_single_X_rdr, filtered_out_set -def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=None, sample_ids=None, logfcthreshold_u=2, logfcthreshold_t=4, quantile_threshold=80): +def filter_de_genes_tri( + exp_counts, + df_bininfo, + 
normal_candidate, + sample_list=None, + sample_ids=None, + logfcthreshold_u=2, + logfcthreshold_t=4, + quantile_threshold=80, +): """ Attributes ---------- @@ -1089,7 +1721,7 @@ def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=No map_gene_adatavar = {} map_gene_umi = {} list_gene_umi = np.sum(adata.layers["count"], axis=0) - for i,x in enumerate(adata.var.index): + for i, x in enumerate(adata.var.index): map_gene_adatavar[x] = i map_gene_umi[x] = list_gene_umi[i] # @@ -1097,20 +1729,25 @@ def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=No sample_list = [None] # filtered_out_set = set() - for s,sname in enumerate(sample_list): + for s, sname in enumerate(sample_list): if sname is None: index = np.arange(adata.shape[0]) else: index = np.where(sample_ids == s)[0] tmpadata = adata[index, :].copy() - if np.sum(tmpadata.layers["count"][tmpadata.obs["normal_candidate"], :]) < tmpadata.shape[1] * 10: + if ( + np.sum(tmpadata.layers["count"][tmpadata.obs["normal_candidate"], :]) + < tmpadata.shape[1] * 10 + ): continue # - umi_threshold = np.percentile( np.sum(tmpadata.layers["count"], axis=0), quantile_threshold ) + umi_threshold = np.percentile( + np.sum(tmpadata.layers["count"], axis=0), quantile_threshold + ) # # sc.pp.filter_cells(tmpadata, min_genes=200) sc.pp.filter_genes(tmpadata, min_cells=10) - med = np.median( np.sum(tmpadata.layers["count"], axis=1) ) + med = np.median(np.sum(tmpadata.layers["count"], axis=1)) # sc.pp.normalize_total(tmpadata, target_sum=1e4) sc.pp.normalize_total(tmpadata, target_sum=med) sc.pp.log1p(tmpadata) @@ -1118,11 +1755,17 @@ def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=No sc.pp.pca(tmpadata, n_comps=4) kmeans = KMeans(n_clusters=2, random_state=0).fit(tmpadata.obsm["X_pca"]) kmeans_labels = kmeans.predict(tmpadata.obsm["X_pca"]) - idx_kmeans_label = np.argmax(np.bincount( kmeans_labels[tmpadata.obs["normal_candidate"]], minlength=2 )) + idx_kmeans_label = np.argmax( + np.bincount(kmeans_labels[tmpadata.obs["normal_candidate"]], minlength=2) + ) clone = np.array(["normal"] * tmpadata.shape[0]) - clone[ (kmeans_labels != idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) ] = "tumor" + clone[ + (kmeans_labels != idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) + ] = "tumor" ### third part ### - clone[ (kmeans_labels == idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) ] = "unsure" + clone[ + (kmeans_labels == idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) + ] = "unsure" tmpadata.obs["clone"] = clone # end added # sc.tl.rank_genes_groups(tmpadata, 'clone', groups=["tumor", "unsure"], reference="normal", method='wilcoxon') @@ -1136,21 +1779,48 @@ def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=No # geneumis_u = np.array([ map_gene_umi[x] for x in genenames_u]) # this_filtered_out_set = set(list(genenames_t[ (np.abs(logfc_t) > logfcthreshold) & (geneumis_t > umi_threshold) ])) | set(list(genenames_u[ (np.abs(logfc_u) > logfcthreshold) & (geneumis_u > umi_threshold) ])) # - agg_counts = np.vstack([ np.sum(tmpadata.layers["count"][tmpadata.obs['clone']==c,:], axis=0) for c in ['normal', 'unsure', 'tumor'] ]) + agg_counts = np.vstack( + [ + np.sum(tmpadata.layers["count"][tmpadata.obs["clone"] == c, :], axis=0) + for c in ["normal", "unsure", "tumor"] + ] + ) agg_counts = agg_counts / np.sum(agg_counts, axis=1, keepdims=True) * 1e6 - geneumis = np.array([ map_gene_umi[x] for x in tmpadata.var.index]) - logfc_u = np.where( 
((agg_counts[1,:]==0) | (agg_counts[0,:]==0)), 10, np.log2(agg_counts[1,:] / agg_counts[0,:]) ) - logfc_t = np.where( ((agg_counts[2,:]==0) | (agg_counts[0,:]==0)), 10, np.log2(agg_counts[2,:] / agg_counts[0,:]) ) - this_filtered_out_set = set(list(tmpadata.var.index[ (np.abs(logfc_u)>logfcthreshold_u) & (geneumis>umi_threshold) ])) | set(list(tmpadata.var.index[ (np.abs(logfc_t)>logfcthreshold_t) & (geneumis>umi_threshold) ])) + geneumis = np.array([map_gene_umi[x] for x in tmpadata.var.index]) + logfc_u = np.where( + ((agg_counts[1, :] == 0) | (agg_counts[0, :] == 0)), + 10, + np.log2(agg_counts[1, :] / agg_counts[0, :]), + ) + logfc_t = np.where( + ((agg_counts[2, :] == 0) | (agg_counts[0, :] == 0)), + 10, + np.log2(agg_counts[2, :] / agg_counts[0, :]), + ) + this_filtered_out_set = set( + list( + tmpadata.var.index[ + (np.abs(logfc_u) > logfcthreshold_u) & (geneumis > umi_threshold) + ] + ) + ) | set( + list( + tmpadata.var.index[ + (np.abs(logfc_t) > logfcthreshold_t) & (geneumis > umi_threshold) + ] + ) + ) filtered_out_set = filtered_out_set | this_filtered_out_set print(f"Filter out {len(filtered_out_set)} DE genes") # # remove genes that are in filtered_out_set new_single_X_rdr = np.zeros((df_bininfo.shape[0], adata.shape[0])) - for b,genestr in enumerate(df_bininfo.INCLUDED_GENES.values): + for b, genestr in enumerate(df_bininfo.INCLUDED_GENES.values): # RDR (genes) involved_genes = set(genestr.split(" ")) - filtered_out_set - new_single_X_rdr[b, :] = np.sum( adata.layers['count'][:, adata.var.index.isin(involved_genes)], axis=1 ) + new_single_X_rdr[b, :] = np.sum( + adata.layers["count"][:, adata.var.index.isin(involved_genes)], axis=1 + ) return new_single_X_rdr, filtered_out_set @@ -1161,15 +1831,48 @@ def get_lengths_by_arm(sorted_chr_pos, centromere_file): """ # read and process centromere file unique_chrs = [f"chr{i}" for i in range(1, 23)] - df = pd.read_csv(centromere_file, sep="\t", header=None, index_col=None, names=["CHRNAME", "START", "END", "LABEL", "SOURCE"]) + df = pd.read_csv( + centromere_file, + sep="\t", + header=None, + index_col=None, + names=["CHRNAME", "START", "END", "LABEL", "SOURCE"], + ) df = df[df.CHRNAME.isin(unique_chrs)] df["CHR"] = [int(x[3:]) for x in df.CHRNAME] - df = df.groupby("CHR").agg({"CHRNAME":"first", "START":"min", "END":"min", "LABEL":"first", "SOURCE":"first"}) + df = df.groupby("CHR").agg( + { + "CHRNAME": "first", + "START": "min", + "END": "min", + "LABEL": "first", + "SOURCE": "first", + } + ) df.sort_index(inplace=True) # count lengths - mat_chr_pos = np.vstack([ np.array([x[0] for x in sorted_chr_pos]), np.array([x[1] for x in sorted_chr_pos]) ]).T - armlengths = sum([ [np.sum((mat_chr_pos[:,0] == df.index[i]) & (mat_chr_pos[:,1] <= df.END.iloc[i])), \ - np.sum((mat_chr_pos[:,0] == df.index[i]) & (mat_chr_pos[:,1] > df.END.iloc[i]))] for i in range(df.shape[0])], []) + mat_chr_pos = np.vstack( + [ + np.array([x[0] for x in sorted_chr_pos]), + np.array([x[1] for x in sorted_chr_pos]), + ] + ).T + armlengths = sum( + [ + [ + np.sum( + (mat_chr_pos[:, 0] == df.index[i]) + & (mat_chr_pos[:, 1] <= df.END.iloc[i]) + ), + np.sum( + (mat_chr_pos[:, 0] == df.index[i]) + & (mat_chr_pos[:, 1] > df.END.iloc[i]) + ), + ] + for i in range(df.shape[0]) + ], + [], + ) armlengths = np.array(armlengths, dtype=int) return armlengths @@ -1190,12 +1893,20 @@ def get_lengths_by_arm(sorted_chr_pos, centromere_file): def expand_df_cnv(df_cnv, binsize=2e5, fillmissing=True): # get CHR and its END - df_chr_end = 
df_cnv.groupby("CHR").agg({"END":"max"}).reset_index() + df_chr_end = df_cnv.groupby("CHR").agg({"END": "max"}).reset_index() # initialize df_expand as a dataframe containing CHR, START, END such that END-START = binsize df_expand = [] - for i,c in enumerate(df_chr_end.CHR.values): - df_expand.append( pd.DataFrame({"CHR":c, "START":np.arange(0, df_chr_end.END.values[i], binsize), "END":binsize + np.arange(0, df_chr_end.END.values[i], binsize)}) ) + for i, c in enumerate(df_chr_end.CHR.values): + df_expand.append( + pd.DataFrame( + { + "CHR": c, + "START": np.arange(0, df_chr_end.END.values[i], binsize), + "END": binsize + np.arange(0, df_chr_end.END.values[i], binsize), + } + ) + ) df_expand = pd.concat(df_expand, ignore_index=True) # find the index in df_cnv such that each entry in df_expand overlaps with the largest length @@ -1208,29 +1919,39 @@ def expand_df_cnv(df_cnv, binsize=2e5, fillmissing=True): for i, this_chr in enumerate(df_expand.CHR.values): this_start = df_expand.START.values[i] this_end = df_expand.END.values[i] - while j < df_cnv.shape[0] and (vec_cnv_chr[j] < this_chr or (vec_cnv_chr[j] == this_chr and vec_cnv_end[j] <= this_start)): + while j < df_cnv.shape[0] and ( + vec_cnv_chr[j] < this_chr + or (vec_cnv_chr[j] == this_chr and vec_cnv_end[j] <= this_start) + ): j += 1 # overlap length of the j-th segment to (j+3)-th segment in df_cnv overlap_lengths = [] - for k in range(j, min(j+3, df_cnv.shape[0])): + for k in range(j, min(j + 3, df_cnv.shape[0])): if vec_cnv_chr[k] > this_chr or vec_cnv_start[k] > this_end: break - overlap_lengths.append( min(vec_cnv_end[k], this_end) - max(vec_cnv_start[k], this_start) ) + overlap_lengths.append( + min(vec_cnv_end[k], this_end) - max(vec_cnv_start[k], this_start) + ) if len(overlap_lengths) > 0: seg_index[i] = j + np.argmax(overlap_lengths) for col in df_cnv.columns[df_cnv.columns.str.startswith("clone")]: df_expand[col] = np.nan - df_expand[col].iloc[seg_index>=0] = df_cnv[col].values[ seg_index[seg_index>=0] ] + df_expand[col].iloc[seg_index >= 0] = df_cnv[col].values[ + seg_index[seg_index >= 0] + ] df_expand[col] = df_expand[col].astype("Int64") if fillmissing: # for each nan row, fill it with the closest non-nan row - nan_rows = np.where( df_expand.iloc[:,-1].isnull() )[0] - filled_rows = np.where( ~df_expand.iloc[:,-1].isnull() )[0] + nan_rows = np.where(df_expand.iloc[:, -1].isnull())[0] + filled_rows = np.where(~df_expand.iloc[:, -1].isnull())[0] for i in nan_rows: - candidates = np.where( (~df_expand.iloc[:,-1].isnull()) & (df_expand.CHR.values == df_expand.CHR.values[i]) )[0] - j = candidates[ np.argmin(np.abs(candidates - i)) ] + candidates = np.where( + (~df_expand.iloc[:, -1].isnull()) + & (df_expand.CHR.values == df_expand.CHR.values[i]) + )[0] + j = candidates[np.argmin(np.abs(candidates - i))] df_expand.iloc[i, 3:] = df_expand.iloc[j, 3:].values return df_expand @@ -1241,56 +1962,108 @@ def summary_events(cnv_segfile, rescombinefile, minlength=10): # read rescombine file res_combine = dict(np.load(rescombinefile, allow_pickle=True)) pred_cnv = res_combine["pred_cnv"] - logrdr_profile = np.vstack([ res_combine["new_log_mu"][pred_cnv[:,c], c] for c in range(pred_cnv.shape[1]) ]) - baf_profile = np.vstack([ res_combine["new_p_binom"][pred_cnv[:,c], c] for c in range(pred_cnv.shape[1]) ]) + logrdr_profile = np.vstack( + [res_combine["new_log_mu"][pred_cnv[:, c], c] for c in range(pred_cnv.shape[1])] + ) + baf_profile = np.vstack( + [ + res_combine["new_p_binom"][pred_cnv[:, c], c] + for c in range(pred_cnv.shape[1]) + 
] + ) # read CNV file - df_cnv = pd.read_csv(cnv_segfile, header=0, sep='\t') + df_cnv = pd.read_csv(cnv_segfile, header=0, sep="\t") # get clone names - calico_clones = np.array([ x.split(" ")[0][5:] for x in df_cnv.columns if x.endswith(" A") ]) + calico_clones = np.array( + [x.split(" ")[0][5:] for x in df_cnv.columns if x.endswith(" A")] + ) # retain only the clones that are not entirely diploid - calico_clones = [c for c in calico_clones if np.sum(np.abs(baf_profile[int(c),:] - 0.5) > EPS_BAF) > minlength ] + calico_clones = [ + c + for c in calico_clones + if np.sum(np.abs(baf_profile[int(c), :] - 0.5) > EPS_BAF) > minlength + ] # label CNV states per bin per clone into "neu", "del", "amp", "loh" states for c in calico_clones: - counts = df_cnv.END.values-df_cnv.START.values + counts = df_cnv.END.values - df_cnv.START.values counts = np.maximum(1, counts / 1e4).astype(int) - tmp = strict_convert_copy_to_states(df_cnv[f"clone{c} A"].values, df_cnv[f"clone{c} B"].values, counts=counts) + tmp = strict_convert_copy_to_states( + df_cnv[f"clone{c} A"].values, df_cnv[f"clone{c} B"].values, counts=counts + ) tmp[tmp == "bdel"] = "del" tmp[tmp == "bamp"] = "amp" df_cnv[f"srt_cnstate_clone{c}"] = tmp # partition the genome into segments such that the allele-specific CN across all clones are the same within each segment - segments, labs = get_intervals_nd(df_cnv[["CHR"] + [ f"clone{x} A" for x in calico_clones ] + [ f"clone{x} B" for x in calico_clones ]].values) + segments, labs = get_intervals_nd( + df_cnv[ + ["CHR"] + + [f"clone{x} A" for x in calico_clones] + + [f"clone{x} B" for x in calico_clones] + ].values + ) # collect event, that is labs and segments pair such that the cnstate is not normal events = [] for i, seg in enumerate(segments): if seg[1] - seg[0] < minlength: continue - if np.all(df_cnv[[ f"srt_cnstate_clone{x}" for x in calico_clones ]].iloc[seg[0],:].values == "neu"): + if np.all( + df_cnv[[f"srt_cnstate_clone{x}" for x in calico_clones]] + .iloc[seg[0], :] + .values + == "neu" + ): continue - acn_list = [ (df_cnv[f"srt_cnstate_clone{c}"].values[seg[0]], df_cnv[f"clone{c} A"].values[seg[0]], df_cnv[f"clone{c} B"].values[seg[0]]) for c in calico_clones ] + acn_list = [ + ( + df_cnv[f"srt_cnstate_clone{c}"].values[seg[0]], + df_cnv[f"clone{c} A"].values[seg[0]], + df_cnv[f"clone{c} B"].values[seg[0]], + ) + for c in calico_clones + ] acn_set = set(acn_list) for e in acn_set: if e[0] == "neu": continue - involved_clones = [calico_clones[i] for i in range(len(calico_clones)) if acn_list[i] == e] - events.append( pd.DataFrame({"CHR":df_cnv.CHR.values[seg[0]], "START":df_cnv.START.values[seg[0]], "END":df_cnv.END.values[seg[1]-1], "BinSTART":seg[0], "BinEND":seg[1]-1,\ - "CN":f"{e[1]}|{e[2]}", "Label":e[0], "involved_clones":",".join(involved_clones)}, index=[0]) ) + involved_clones = [ + calico_clones[i] for i in range(len(calico_clones)) if acn_list[i] == e + ] + events.append( + pd.DataFrame( + { + "CHR": df_cnv.CHR.values[seg[0]], + "START": df_cnv.START.values[seg[0]], + "END": df_cnv.END.values[seg[1] - 1], + "BinSTART": seg[0], + "BinEND": seg[1] - 1, + "CN": f"{e[1]}|{e[2]}", + "Label": e[0], + "involved_clones": ",".join(involved_clones), + }, + index=[0], + ) + ) df_events = pd.concat(events, ignore_index=True) - + # merge adjacent events if they have the same involved_clones and same CN unique_ic = np.unique(df_events.involved_clones.values) concise_events = [] - for ic in unique_ic: + for ic in unique_ic: tmpdf = df_events[df_events.involved_clones == ic] # merge 
adjacent rows in tmpdf if they have the same CN END of the previous row is the same as the START of the next row - concise_events.append( tmpdf.iloc[0:1,:] ) + concise_events.append(tmpdf.iloc[0:1, :]) for i in range(1, tmpdf.shape[0]): - if tmpdf.CN.values[i] == concise_events[-1].CN.values[0] and tmpdf.CHR.values[i] == concise_events[-1].CHR.values[0] and tmpdf.START.values[i] == concise_events[-1].END.values[0]: + if ( + tmpdf.CN.values[i] == concise_events[-1].CN.values[0] + and tmpdf.CHR.values[i] == concise_events[-1].CHR.values[0] + and tmpdf.START.values[i] == concise_events[-1].END.values[0] + ): concise_events[-1].END.values[0] = tmpdf.END.values[i] concise_events[-1].BinEND.values[0] = tmpdf.BinEND.values[i] else: - concise_events.append( tmpdf.iloc[i:(i+1),:] ) + concise_events.append(tmpdf.iloc[i : (i + 1), :]) df_concise_events = pd.concat(concise_events, ignore_index=True) # add the RDR abd BAF info @@ -1299,29 +2072,60 @@ def summary_events(cnv_segfile, rescombinefile, minlength=10): rdr_diff = np.nan * np.ones(df_concise_events.shape[0]) baf_diff = np.nan * np.ones(df_concise_events.shape[0]) for i in range(df_concise_events.shape[0]): - involved_clones = np.array([int(c) for c in df_concise_events.involved_clones.values[i].split(",")]) + involved_clones = np.array( + [int(c) for c in df_concise_events.involved_clones.values[i].split(",")] + ) bs = df_concise_events.BinSTART.values[i] be = df_concise_events.BinEND.values[i] # rdr[i] = np.exp(np.mean(res_combine["new_log_mu"][ (pred_cnv[bs:be,:][:,involved_clones].flatten(), np.tile(involved_clones, be-bs)) ])) # baf[i] = np.mean(res_combine["new_p_binom"][ (pred_cnv[bs:be,:][:,involved_clones].flatten(), np.tile(involved_clones, be-bs)) ]) - rdr[i] = np.exp(np.mean( np.concatenate([logrdr_profile[i, bs:be] for i in involved_clones ]) )) - baf[i] = np.mean( np.concatenate([baf_profile[i, bs:be] for i in involved_clones ]) ) + rdr[i] = np.exp( + np.mean(np.concatenate([logrdr_profile[i, bs:be] for i in involved_clones])) + ) + baf[i] = np.mean( + np.concatenate([baf_profile[i, bs:be] for i in involved_clones]) + ) # get the uninvolved clones - uninvolved_clones = np.array([int(c)for c in calico_clones if int(c) not in involved_clones]) + uninvolved_clones = np.array( + [int(c) for c in calico_clones if int(c) not in involved_clones] + ) if len(uninvolved_clones) > 0: # rdr_diff[i] = np.exp(np.mean(res_combine["new_log_mu"][ (pred_cnv[bs:be,:][:,uninvolved_clones].flatten(), np.tile(uninvolved_clones, be-bs)) ])) - rdr[i] # baf_diff[i] = np.mean(res_combine["new_p_binom"][ (pred_cnv[bs:be,:][:,uninvolved_clones].flatten(), np.tile(uninvolved_clones, be-bs)) ]) - baf[i] - rdr_diff[i] = rdr[i] - np.exp(np.mean( np.concatenate([logrdr_profile[i, bs:be] for i in uninvolved_clones ]) )) - baf_diff[i] = baf[i] - np.mean( np.concatenate([baf_profile[i, bs:be] for i in uninvolved_clones ]) ) + rdr_diff[i] = rdr[i] - np.exp( + np.mean( + np.concatenate( + [logrdr_profile[i, bs:be] for i in uninvolved_clones] + ) + ) + ) + baf_diff[i] = baf[i] - np.mean( + np.concatenate([baf_profile[i, bs:be] for i in uninvolved_clones]) + ) df_concise_events["RDR"] = rdr df_concise_events["BAF"] = baf df_concise_events["RDR_diff"] = rdr_diff df_concise_events["BAF_diff"] = baf_diff - return df_concise_events[["CHR", "START", "END", "BinSTART", "BinEND", "RDR", "BAF", "RDR_diff", "BAF_diff", "CN", "Label", "involved_clones"]] - - -def get_best_initialization(output_dir): + return df_concise_events[ + [ + "CHR", + "START", + "END", + "BinSTART", + 
"BinEND", + "RDR", + "BAF", + "RDR_diff", + "BAF_diff", + "CN", + "Label", + "involved_clones", + ] + ] + + +def get_best_initialization(output_dir): """ find the best HMRF initialization random seed """ @@ -1331,7 +2135,12 @@ def get_best_initialization(output_dir): for file in rdrbaf_files: outdir = file.parent res_combine = dict(np.load(str(file)), allow_pickle=True) - df.append( pd.DataFrame({'outdir':str(outdir), "log-likelihood":res_combine["total_llf"]}, index=[0]) ) + df.append( + pd.DataFrame( + {"outdir": str(outdir), "log-likelihood": res_combine["total_llf"]}, + index=[0], + ) + ) df = pd.concat(df, ignore_index=True) idx = np.argmax(df["log-likelihood"]) return df["outdir"].iloc[idx] diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 7e3cbbb..6f6ec02 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -27,8 +27,8 @@ def convert_params(mean, std): See https://mathworld.wolfram.com/NegativeBinomialDistribution.html """ - p = mean/std**2 - n = mean*p/(1.0 - p) + p = mean / std**2 + n = mean * p / (1.0 - p) return n, p @@ -51,11 +51,13 @@ class Weighted_NegativeBinomial(GenericLikelihoodModel): exposure : array, (n_samples,) Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ + def __init__(self, endog, exog, weights, exposure, seed=0, **kwds): super(Weighted_NegativeBinomial, self).__init__(endog, exog, **kwds) self.weights = weights self.exposure = exposure self.seed = seed + # def nloglikeobs(self, params): nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure @@ -64,18 +66,19 @@ def nloglikeobs(self, params): llf = scipy.stats.nbinom.logpmf(self.endog, n, p) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): - self.exog_names.append('alpha') + self.exog_names.append("alpha") if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - return super(Weighted_NegativeBinomial, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) + return super(Weighted_NegativeBinomial, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class Weighted_NegativeBinomial_mix(GenericLikelihoodModel): @@ -85,25 +88,29 @@ def __init__(self, endog, exog, weights, exposure, tumor_prop, seed=0, **kwds): self.exposure = exposure self.seed = seed self.tumor_prop = tumor_prop + # def nloglikeobs(self, params): - nb_mean = self.exposure * (self.tumor_prop * np.exp(self.exog @ params[:-1]) + 1 - self.tumor_prop) + nb_mean = self.exposure * ( + self.tumor_prop * np.exp(self.exog @ params[:-1]) + 1 - self.tumor_prop + ) nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) llf = scipy.stats.nbinom.logpmf(self.endog, n, p) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): - self.exog_names.append('alpha') + self.exog_names.append("alpha") if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - return super(Weighted_NegativeBinomial_mix, self).fit(start_params=start_params, - maxiter=maxiter, 
maxfun=maxfun, - **kwds) + return super(Weighted_NegativeBinomial_mix, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class Weighted_BetaBinom(GenericLikelihoodModel): @@ -125,10 +132,12 @@ class Weighted_BetaBinom(GenericLikelihoodModel): exposure : array, (n_samples,) Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. """ + def __init__(self, endog, exog, weights, exposure, **kwds): super(Weighted_BetaBinom, self).__init__(endog, exog, **kwds) self.weights = weights self.exposure = exposure + # def nloglikeobs(self, params): a = (self.exog @ params[:-1]) * params[-1] @@ -136,17 +145,20 @@ def nloglikeobs(self, params): llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): self.exog_names.append("tau") if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: - start_params = np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) - return super(Weighted_BetaBinom, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) + start_params = np.append( + 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 + ) + return super(Weighted_BetaBinom, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class Weighted_BetaBinom_mix(GenericLikelihoodModel): @@ -155,24 +167,33 @@ def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds): self.weights = weights self.exposure = exposure self.tumor_prop = tumor_prop + # def nloglikeobs(self, params): - a = (self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * params[-1] - b = ((1 - self.exog @ params[:-1]) * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * params[-1] + a = ( + self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop) + ) * params[-1] + b = ( + (1 - self.exog @ params[:-1]) * self.tumor_prop + + 0.5 * (1 - self.tumor_prop) + ) * params[-1] llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): self.exog_names.append("tau") if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: - start_params = np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) - return super(Weighted_BetaBinom_mix, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) + start_params = np.append( + 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 + ) + return super(Weighted_BetaBinom_mix, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): @@ -181,6 +202,7 @@ def __init__(self, endog, exog, tau, weights, exposure, **kwds): self.tau = tau self.weights = weights self.exposure = exposure + # def nloglikeobs(self, params): a = (self.exog @ params) * self.tau @@ -188,17 +210,18 @@ def nloglikeobs(self, params): llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): 
start_params = self.start_params else: start_params = 0.1 * np.ones(self.nparams) - - return super(Weighted_BetaBinom_fixdispersion, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) + + return super(Weighted_BetaBinom_fixdispersion, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class Weighted_BetaBinom_fixdispersion_mix(GenericLikelihoodModel): @@ -208,24 +231,30 @@ def __init__(self, endog, exog, tau, weights, exposure, tumor_prop, **kwds): self.weights = weights self.exposure = exposure self.tumor_prop = tumor_prop + # def nloglikeobs(self, params): - a = (self.exog @ params * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * self.tau - b = ((1 - self.exog @ params) * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * self.tau + a = ( + self.exog @ params * self.tumor_prop + 0.5 * (1 - self.tumor_prop) + ) * self.tau + b = ( + (1 - self.exog @ params) * self.tumor_prop + 0.5 * (1 - self.tumor_prop) + ) * self.tau llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: start_params = 0.1 * np.ones(self.nparams) - - return super(Weighted_BetaBinom_fixdispersion_mix, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) + + return super(Weighted_BetaBinom_fixdispersion_mix, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class BAF_Binom(GenericLikelihoodModel): @@ -247,12 +276,14 @@ class BAF_Binom(GenericLikelihoodModel): exposure : array, (n_samples,) Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
""" + def __init__(self, endog, exog, weights, exposure, offset, scaling, **kwds): super(BAF_Binom, self).__init__(endog, exog, **kwds) self.weights = weights self.exposure = exposure self.offset = offset self.scaling = scaling + # def nloglikeobs(self, params): linear_term = self.exog @ params @@ -260,13 +291,14 @@ def nloglikeobs(self, params): llf = scipy.stats.binom.logpmf(self.endog, self.exposure, p) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: - start_params = 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams) - return super(BAF_Binom, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) \ No newline at end of file + start_params = 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams) + return super(BAF_Binom, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 9c22aaf..2a22f4d 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -27,11 +27,11 @@ def np_max_ax_keep(arr, axis=0): assert arr.ndim == 2 assert axis in [0, 1] if axis == 0: - result = np.zeros( (1, arr.shape[1]) ) + result = np.zeros((1, arr.shape[1])) for i in range(result.shape[1]): result[:, i] = np.max(arr[:, i]) else: - result = np.zeros( (arr.shape[0], 1) ) + result = np.zeros((arr.shape[0], 1)) for i in range(result.shape[0]): result[i, :] = np.max(arr[i, :]) return result @@ -57,11 +57,11 @@ def np_sum_ax_keep(arr, axis=0): assert arr.ndim == 2 assert axis in [0, 1] if axis == 0: - result = np.zeros( (1, arr.shape[1]) ) + result = np.zeros((1, arr.shape[1])) for i in range(result.shape[1]): result[:, i] = np.sum(arr[:, i]) else: - result = np.zeros( (arr.shape[0], 1) ) + result = np.zeros((arr.shape[0], 1)) for i in range(result.shape[0]): result[i, :] = np.sum(arr[i, :]) return result @@ -81,26 +81,27 @@ def np_mean_ax_squeeze(arr, axis=0): result[i] = np.mean(arr[i, :]) return result + @njit def np_mean_ax_keep(arr, axis=0): assert arr.ndim == 2 assert axis in [0, 1] if axis == 0: - result = np.zeros( (1, arr.shape[1]) ) + result = np.zeros((1, arr.shape[1])) for i in range(result.shape[1]): result[:, i] = np.mean(arr[:, i]) else: - result = np.zeros( (arr.shape[0], 1) ) + result = np.zeros((arr.shape[0], 1)) for i in range(result.shape[0]): result[i, :] = np.mean(arr[i, :]) return result -@njit +@njit def mylogsumexp(a): # get max a_max = np.max(a) - if (np.isinf(a_max)): + if np.isinf(a_max): return a_max # exponential tmp = np.exp(a - a_max) @@ -110,7 +111,7 @@ def mylogsumexp(a): return s + a_max -@njit +@njit def mylogsumexp_ax_keep(a, axis): # get max a_max = np_max_ax_keep(a, axis=axis) @@ -132,7 +133,7 @@ def construct_unique_matrix(obs_count, total_count): ---------- allele_count : array, shape (n_observations, n_spots) Observed A allele counts per SNP per spot. - + total_bb_RD : array, shape (n_observations, n_spots) Total SNP-covering reads per SNP per spot. 
""" @@ -142,41 +143,69 @@ def construct_unique_matrix(obs_count, total_count): mapping_matrices = [] for s in range(n_spots): if total_count.dtype == int: - pairs = np.unique( np.vstack([obs_count[:,s], total_count[:,s]]).T, axis=0 ) + pairs = np.unique(np.vstack([obs_count[:, s], total_count[:, s]]).T, axis=0) else: - pairs = np.unique( np.vstack([obs_count[:,s], total_count[:,s]]).T.round(decimals=4), axis=0 ) - unique_values.append( pairs ) - pair_index = {(pairs[i,0], pairs[i,1]):i for i in range(pairs.shape[0])} + pairs = np.unique( + np.vstack([obs_count[:, s], total_count[:, s]]).T.round(decimals=4), + axis=0, + ) + unique_values.append(pairs) + pair_index = {(pairs[i, 0], pairs[i, 1]): i for i in range(pairs.shape[0])} # construct mapping matrix mat_row = np.arange(n_obs) mat_col = np.zeros(n_obs, dtype=int) for i in range(n_obs): if total_count.dtype == int: - tmpidx = pair_index[(obs_count[i,s], total_count[i,s])] + tmpidx = pair_index[(obs_count[i, s], total_count[i, s])] else: - tmpidx = pair_index[(obs_count[i,s], total_count[i,s].round(decimals=4))] + tmpidx = pair_index[ + (obs_count[i, s], total_count[i, s].round(decimals=4)) + ] mat_col[i] = tmpidx - mapping_matrices.append( scipy.sparse.csr_matrix((np.ones(len(mat_row)), (mat_row, mat_col) )) ) + mapping_matrices.append( + scipy.sparse.csr_matrix((np.ones(len(mat_row)), (mat_row, mat_col))) + ) return unique_values, mapping_matrices -def initialization_by_gmm(n_states, X, base_nb_mean, total_bb_RD, params, random_state=None, in_log_space=True, only_minor=True, min_binom_prob=0.1, max_binom_prob=0.9): +def initialization_by_gmm( + n_states, + X, + base_nb_mean, + total_bb_RD, + params, + random_state=None, + in_log_space=True, + only_minor=True, + min_binom_prob=0.1, + max_binom_prob=0.9, +): # prepare gmm input of RDR and BAF separately X_gmm_rdr = None X_gmm_baf = None if "m" in params: if in_log_space: - X_gmm_rdr = np.vstack([ np.log(X[:,0,s]/base_nb_mean[:,s]) for s in range(X.shape[2]) ]).T + X_gmm_rdr = np.vstack( + [np.log(X[:, 0, s] / base_nb_mean[:, s]) for s in range(X.shape[2])] + ).T offset = np.mean(X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))]) - normalizetomax1 = np.max(X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))]) - np.min(X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))]) + normalizetomax1 = np.max( + X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))] + ) - np.min(X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))]) X_gmm_rdr = (X_gmm_rdr - offset) / normalizetomax1 else: - X_gmm_rdr = np.vstack([ X[:,0,s]/base_nb_mean[:,s] for s in range(X.shape[2]) ]).T + X_gmm_rdr = np.vstack( + [X[:, 0, s] / base_nb_mean[:, s] for s in range(X.shape[2])] + ).T offset = 0 - normalizetomax1 = np.max(X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))]) + normalizetomax1 = np.max( + X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))] + ) X_gmm_rdr = (X_gmm_rdr - offset) / normalizetomax1 if "p" in params: - X_gmm_baf = np.vstack([ X[:,1,s] / total_bb_RD[:,s] for s in range(X.shape[2]) ]).T + X_gmm_baf = np.vstack( + [X[:, 1, s] / total_bb_RD[:, s] for s in range(X.shape[2])] + ).T X_gmm_baf[X_gmm_baf < min_binom_prob] = min_binom_prob X_gmm_baf[X_gmm_baf > max_binom_prob] = max_binom_prob # combine RDR and BAF @@ -203,21 +232,31 @@ def initialization_by_gmm(n_states, X, base_nb_mean, total_bb_RD, params, random if random_state is None: gmm = GaussianMixture(n_components=n_states, max_iter=1).fit(X_gmm) else: - gmm = GaussianMixture(n_components=n_states, 
max_iter=1, random_state=random_state).fit(X_gmm) + gmm = GaussianMixture( + n_components=n_states, max_iter=1, random_state=random_state + ).fit(X_gmm) # turn gmm fitted parameters to HMM log_mu and p_binom parameters if ("m" in params) and ("p" in params): - gmm_log_mu = gmm.means_[:,:X.shape[2]] * normalizetomax1 + offset if in_log_space else np.log(gmm.means_[:,:X.shape[2]] * normalizetomax1 + offset) - gmm_p_binom = gmm.means_[:, X.shape[2]:] + gmm_log_mu = ( + gmm.means_[:, : X.shape[2]] * normalizetomax1 + offset + if in_log_space + else np.log(gmm.means_[:, : X.shape[2]] * normalizetomax1 + offset) + ) + gmm_p_binom = gmm.means_[:, X.shape[2] :] if only_minor: - gmm_p_binom = np.where(gmm_p_binom > 0.5, 1-gmm_p_binom, gmm_p_binom) + gmm_p_binom = np.where(gmm_p_binom > 0.5, 1 - gmm_p_binom, gmm_p_binom) elif "m" in params: - gmm_log_mu = gmm.means_ * normalizetomax1 + offset if in_log_space else np.log(gmm.means_[:,:X.shape[2]] * normalizetomax1 + offset) + gmm_log_mu = ( + gmm.means_ * normalizetomax1 + offset + if in_log_space + else np.log(gmm.means_[:, : X.shape[2]] * normalizetomax1 + offset) + ) gmm_p_binom = None elif "p" in params: gmm_log_mu = None gmm_p_binom = gmm.means_ if only_minor: - gmm_p_binom = np.where(gmm_p_binom > 0.5, 1-gmm_p_binom, gmm_p_binom) + gmm_p_binom = np.where(gmm_p_binom > 0.5, 1 - gmm_p_binom, gmm_p_binom) return gmm_log_mu, gmm_p_binom @@ -225,14 +264,15 @@ def initialization_by_gmm(n_states, X, base_nb_mean, total_bb_RD, params, random # E step related ############################################################ + def compute_posterior_obs(log_alpha, log_beta): - ''' + """ Input log_alpha: output from forward_lattice_gaussian. size n_states * n_observations. alpha[j, t] = P(o_1, ... o_t, q_t = j | lambda). log_beta: output from backward_lattice_gaussian. size n_states * n_observations. beta[i, t] = P(o_{t+1}, ..., o_T | q_t = i, lambda). Output: log_gamma: size n_states * n_observations. gamma[i,t] = P(q_t = i | O, lambda). gamma[i, t] propto alpha[i,t] * beta[i,t] - ''' + """ n_states = log_alpha.shape[0] n_obs = log_alpha.shape[1] # initial log_gamma @@ -242,15 +282,17 @@ def compute_posterior_obs(log_alpha, log_beta): # for t in np.arange(n_obs): # log_gamma[j, t] = log_alpha[j, t] + log_beta[j, t] log_gamma = log_alpha + log_beta - if np.any( np.sum(log_gamma, axis=0) == 0 ): + if np.any(np.sum(log_gamma, axis=0) == 0): raise Exception("Sum of posterior probability is zero for some observations!") log_gamma -= scipy.special.logsumexp(log_gamma, axis=0) return log_gamma @njit -def compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log_emission): - ''' +def compute_posterior_transition_sitewise( + log_alpha, log_beta, log_transmat, log_emission +): + """ Input log_alpha: output from forward_lattice_gaussian. size n_states * n_observations. alpha[j, t] = P(o_1, ... o_t, q_t = j | lambda). log_beta: output from backward_lattice_gaussian. size n_states * n_observations. beta[i, t] = P(o_{t+1}, ..., o_T | q_t = i, lambda). @@ -258,27 +300,37 @@ def compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log log_emission: n_states * n_observations * n_spots. Log probability. Output: log_xi: size n_states * n_states * (n_observations-1). 
xi[i,j,t] = P(q_t=i, q_{t+1}=j | O, lambda) - ''' + """ n_states = int(log_alpha.shape[0] / 2) n_obs = log_alpha.shape[1] # initialize log_xi - log_xi = np.zeros((2*n_states, 2*n_states, n_obs-1)) + log_xi = np.zeros((2 * n_states, 2 * n_states, n_obs - 1)) # compute log_xi - for i in np.arange(2*n_states): - for j in np.arange(2*n_states): - for t in np.arange(n_obs-1): - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + for i in np.arange(2 * n_states): + for j in np.arange(2 * n_states): + for t in np.arange(n_obs - 1): + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. - log_xi[i, j, t] = log_alpha[i, t] + log_transmat[i - n_states * int(i/n_states), j - n_states * int(j/n_states)] + np.sum(log_emission[j, t+1, :]) + log_beta[j, t+1] + log_xi[i, j, t] = ( + log_alpha[i, t] + + log_transmat[ + i - n_states * int(i / n_states), + j - n_states * int(j / n_states), + ] + + np.sum(log_emission[j, t + 1, :]) + + log_beta[j, t + 1] + ) # normalize - for t in np.arange(n_obs-1): + for t in np.arange(n_obs - 1): log_xi[:, :, t] -= mylogsumexp(log_xi[:, :, t]) return log_xi @njit -def compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, log_emission): - ''' +def compute_posterior_transition_nophasing( + log_alpha, log_beta, log_transmat, log_emission +): + """ Input log_alpha: output from forward_lattice_gaussian. size n_states * n_observations. alpha[j, t] = P(o_1, ... o_t, q_t = j | lambda). log_beta: output from backward_lattice_gaussian. size n_states * n_observations. beta[i, t] = P(o_{t+1}, ..., o_T | q_t = i, lambda). @@ -286,20 +338,25 @@ def compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, lo log_emission: n_states * n_observations * n_spots. Log probability. Output: log_xi: size n_states * n_states * (n_observations-1). xi[i,j,t] = P(q_t=i, q_{t+1}=j | O, lambda) - ''' + """ n_states = int(log_alpha.shape[0] / 2) n_obs = log_alpha.shape[1] # initialize log_xi - log_xi = np.zeros((n_states, n_states, n_obs-1)) + log_xi = np.zeros((n_states, n_states, n_obs - 1)) # compute log_xi for i in np.arange(n_states): for j in np.arange(n_states): - for t in np.arange(n_obs-1): - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + for t in np.arange(n_obs - 1): + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. - log_xi[i, j, t] = log_alpha[i, t] + log_transmat[i, j] + np.sum(log_emission[j, t+1, :]) + log_beta[j, t+1] + log_xi[i, j, t] = ( + log_alpha[i, t] + + log_transmat[i, j] + + np.sum(log_emission[j, t + 1, :]) + + log_beta[j, t + 1] + ) # normalize - for t in np.arange(n_obs-1): + for t in np.arange(n_obs - 1): log_xi[:, :, t] -= mylogsumexp(log_xi[:, :, t]) return log_xi @@ -308,18 +365,21 @@ def compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, lo # M step related (HMM phasing) ############################################################ + @njit def update_startprob_sitewise(lengths, log_gamma): - ''' + """ Input lengths: sum of lengths = n_observations. log_gamma: size 2 * n_states * n_observations. gamma[i,t] = P(q_t = i | O, lambda). 
Output log_startprob: n_states. Start probability after loog transformation. - ''' + """ n_states = int(log_gamma.shape[0] / 2) n_obs = log_gamma.shape[1] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the second dimension of log_gamma!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the second dimension of log_gamma!" # indices of the start of sequences, given that the length of each sequence is in lengths cumlen = 0 indices_start = [] @@ -332,7 +392,7 @@ def update_startprob_sitewise(lengths, log_gamma): # compute log_startprob of 2 * n_states log_startprob = mylogsumexp_ax_keep(log_gamma[:, indices_start], axis=1) # merge (CNV state, phase A) and (CNV state, phase B) - log_startprob = log_startprob.flatten().reshape(2,-1) + log_startprob = log_startprob.flatten().reshape(2, -1) log_startprob = mylogsumexp_ax_keep(log_startprob, axis=0) # normalize such that startprob sums to 1 log_startprob -= mylogsumexp(log_startprob) @@ -340,20 +400,28 @@ def update_startprob_sitewise(lengths, log_gamma): def update_transition_sitewise(log_xi, is_diag=False): - ''' + """ Input log_xi: size (2*n_states) * (2*n_states) * n_observations. xi[i,j,t] = P(q_t=i, q_{t+1}=j | O, lambda) Output log_transmat: n_states * n_states. Transition probability after log transformation. - ''' + """ n_states = int(log_xi.shape[0] / 2) n_obs = log_xi.shape[2] # initialize log_transmat log_transmat = np.zeros((n_states, n_states)) for i in np.arange(n_states): for j in np.arange(n_states): - log_transmat[i, j] = scipy.special.logsumexp( np.concatenate([log_xi[i, j, :], log_xi[i+n_states, j, :], \ - log_xi[i, j+n_states, :], log_xi[i + n_states, j + n_states, :]]) ) + log_transmat[i, j] = scipy.special.logsumexp( + np.concatenate( + [ + log_xi[i, j, :], + log_xi[i + n_states, j, :], + log_xi[i, j + n_states, :], + log_xi[i + n_states, j + n_states, :], + ] + ) + ) # row normalize log_transmat if not is_diag: for i in np.arange(n_states): @@ -363,14 +431,25 @@ def update_transition_sitewise(log_xi, is_diag=False): diagsum = scipy.special.logsumexp(np.diag(log_transmat)) totalsum = scipy.special.logsumexp(log_transmat) t = diagsum - totalsum - rest = np.log( (1 - np.exp(t)) / (n_states-1) ) + rest = np.log((1 - np.exp(t)) / (n_states - 1)) log_transmat = np.ones(log_transmat.shape) * rest np.fill_diagonal(log_transmat, t) return log_transmat -def update_emission_params_nb_sitewise_uniqvalues(unique_values, mapping_matrices, log_gamma, base_nb_mean, alphas, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2, min_estep_weight=0.1): +def update_emission_params_nb_sitewise_uniqvalues( + unique_values, + mapping_matrices, + log_gamma, + base_nb_mean, + alphas, + start_log_mu=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + min_log_rdr=-2, + max_log_rdr=2, + min_estep_weight=0.1, +): """ Attributes ---------- @@ -387,41 +466,79 @@ def update_emission_params_nb_sitewise_uniqvalues(unique_values, mapping_matrice n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) # initialization - new_log_mu = copy.copy(start_log_mu) if not start_log_mu is None else np.zeros((n_states, n_spots)) + new_log_mu = ( + copy.copy(start_log_mu) + if not start_log_mu is None + else np.zeros((n_states, n_spots)) + ) new_alphas = copy.copy(alphas) # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ 
mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - model = sm.GLM(unique_values[s][idx_nonzero,0], np.ones(len(idx_nonzero)).reshape(-1,1), \ - family=sm.families.NegativeBinomial(alpha=alphas[i,s]), \ - exposure=unique_values[s][idx_nonzero,1], var_weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero]) + model = sm.GLM( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + family=sm.families.NegativeBinomial(alpha=alphas[i, s]), + exposure=unique_values[s][idx_nonzero, 1], + var_weights=tmp[i, idx_nonzero] + tmp[i + n_states, idx_nonzero], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array([start_log_mu[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if -model.loglike(res.params) < -model.loglike(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array([start_log_mu[i, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if -model.loglike(res.params) < -model.loglike(res2.params) + else res2.params[0] + ) else: if not shared_NB_dispersion: for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - model = Weighted_NegativeBinomial(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1], \ - penalty=0) + model = Weighted_NegativeBinomial( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=tmp[i, idx_nonzero] + tmp[i + n_states, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + penalty=0, + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] new_alphas[i, s] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_log_mu[i, s]], [alphas[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_alphas[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -429,50 +546,93 @@ def update_emission_params_nb_sitewise_uniqvalues(unique_values, mapping_matrice features = [] state_posweights = [] for s in range(n_spots): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile(unique_values[s][idx_nonzero,1], n_states) - this_y = np.tile(unique_values[s][idx_nonzero,0], n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) + this_y = np.tile(unique_values[s][idx_nonzero, 0], n_states) tmp = 
(scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_weights = np.concatenate([ tmp[i,idx_nonzero] + tmp[i+n_states,idx_nonzero] for i in range(n_states) ]) - this_features = np.zeros((n_states*len(idx_nonzero), n_states)) + this_weights = np.concatenate( + [ + tmp[i, idx_nonzero] + tmp[i + n_states, idx_nonzero] + for i in range(n_states) + ] + ) + this_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 + this_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= min_estep_weight ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) + idx_state_posweight = np.array( + [ + i + for i in range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) + >= min_estep_weight + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - model = Weighted_NegativeBinomial(y, features, weights=weights, exposure=exposure) + model = Weighted_NegativeBinomial( + y, features, weights=weights, exposure=exposure + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] + new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_log_mu[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * alphas[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in 
state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas -def update_emission_params_nb_sitewise_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, base_nb_mean, alphas, tumor_prop, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): +def update_emission_params_nb_sitewise_uniqvalues_mix( + unique_values, + mapping_matrices, + log_gamma, + base_nb_mean, + alphas, + tumor_prop, + start_log_mu=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + min_log_rdr=-2, + max_log_rdr=2, +): """ Attributes ---------- @@ -489,42 +649,85 @@ def update_emission_params_nb_sitewise_uniqvalues_mix(unique_values, mapping_mat n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) # initialization - new_log_mu = copy.copy(start_log_mu) if not start_log_mu is None else np.zeros((n_states, n_spots)) + new_log_mu = ( + copy.copy(start_log_mu) + if not start_log_mu is None + else np.zeros((n_states, n_spots)) + ) new_alphas = copy.copy(alphas) # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - model = sm.GLM(unique_values[s][idx_nonzero,0], np.ones(len(idx_nonzero)).reshape(-1,1), \ - family=sm.families.NegativeBinomial(alpha=alphas[i,s]), \ - exposure=unique_values[s][idx_nonzero,1], var_weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero]) + model = sm.GLM( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + family=sm.families.NegativeBinomial(alpha=alphas[i, s]), + exposure=unique_values[s][idx_nonzero, 1], + var_weights=tmp[i, idx_nonzero] + tmp[i + n_states, idx_nonzero], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array([start_log_mu[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if -model.loglike(res.params) < -model.loglike(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array([start_log_mu[i, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if -model.loglike(res.params) < -model.loglike(res2.params) + else res2.params[0] + ) else: if not shared_NB_dispersion: for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] - model = Weighted_NegativeBinomial_mix(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero], exposure=unique_values[s][idx_nonzero,1], \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s], penalty=0) + this_tp = (mapping_matrices[s].T @ tumor_prop[:, s])[ + idx_nonzero + ] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + 
idx_nonzero + ] + model = Weighted_NegativeBinomial_mix( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=tmp[i, idx_nonzero] + tmp[i + n_states, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + tumor_prop=this_tp, + ) + # tumor_prop=tumor_prop[s], penalty=0) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] new_alphas[i, s] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_log_mu[i, s]], [alphas[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_alphas[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -533,56 +736,108 @@ def update_emission_params_nb_sitewise_uniqvalues_mix(unique_values, mapping_mat state_posweights = [] tp = [] for s in range(n_spots): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile(unique_values[s][idx_nonzero,1], n_states) - this_y = np.tile(unique_values[s][idx_nonzero,0], n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) + this_y = np.tile(unique_values[s][idx_nonzero, 0], n_states) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_tp = np.tile( (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero], n_states) - assert np.all(this_tp < 1+1e-4) - this_weights = np.concatenate([ tmp[i,idx_nonzero] + tmp[i+n_states,idx_nonzero] for i in range(n_states) ]) - this_features = np.zeros((n_states*len(idx_nonzero), n_states)) + this_tp = np.tile( + (mapping_matrices[s].T @ tumor_prop[:, s])[idx_nonzero] + / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ], + n_states, + ) + assert np.all(this_tp < 1 + 1e-4) + this_weights = np.concatenate( + [ + tmp[i, idx_nonzero] + tmp[i + n_states, idx_nonzero] + for i in range(n_states) + ] + ) + this_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 + this_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) - tp.append( this_tp[idx_row_posweight] ) + idx_state_posweight = np.array( + [ + i + for i in range(this_features.shape[1]) + if 
np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) + tp.append(this_tp[idx_row_posweight]) # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_NegativeBinomial_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp, penalty=0) + model = Weighted_NegativeBinomial_mix( + y, + features, + weights=weights, + exposure=exposure, + tumor_prop=tp, + penalty=0, + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] + new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_log_mu[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * alphas[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas -def update_emission_params_bb_sitewise_uniqvalues(unique_values, mapping_matrices, log_gamma, total_bb_RD, taus, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): +def update_emission_params_bb_sitewise_uniqvalues( + unique_values, + mapping_matrices, + log_gamma, + total_bb_RD, + taus, + start_p_binom=None, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + percent_threshold=0.99, + min_binom_prob=0.01, + max_binom_prob=0.99, +): """ Attributes ---------- @@ -599,44 +854,106 @@ def update_emission_params_bb_sitewise_uniqvalues(unique_values, mapping_matrice n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) # initialization - new_p_binom = 
copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 + new_p_binom = ( + copy.copy(start_p_binom) + if not start_p_binom is None + else np.ones((n_states, n_spots)) * 0.5 + ) new_taus = copy.copy(taus) if fix_BB_dispersion: for s in np.arange(len(unique_values)): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) + np.sum(tmp[i+n_states,idx_nonzero]) >= 0.1: - model = Weighted_BetaBinom_fixdispersion(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ - np.ones(2*len(idx_nonzero)).reshape(-1,1), \ - taus[i,s], \ - weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ - exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]) ) + if ( + np.sum(tmp[i, idx_nonzero]) + np.sum(tmp[i + n_states, idx_nonzero]) + >= 0.1 + ): + model = Weighted_BetaBinom_fixdispersion( + np.append( + unique_values[s][idx_nonzero, 0], + unique_values[s][idx_nonzero, 1] + - unique_values[s][idx_nonzero, 0], + ), + np.ones(2 * len(idx_nonzero)).reshape(-1, 1), + taus[i, s], + weights=np.append( + tmp[i, idx_nonzero], tmp[i + n_states, idx_nonzero] + ), + exposure=np.append( + unique_values[s][idx_nonzero, 1], + unique_values[s][idx_nonzero, 1], + ), + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array(start_p_binom[i, s]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array(start_p_binom[i, s]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) else: if not shared_BB_dispersion: for s in np.arange(len(unique_values)): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) + np.sum(tmp[i+n_states,idx_nonzero]) >= 0.1: - model = Weighted_BetaBinom(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ - np.ones(2*len(idx_nonzero)).reshape(-1,1), \ - weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ - exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]) ) + if ( + np.sum(tmp[i, idx_nonzero]) + + np.sum(tmp[i + n_states, idx_nonzero]) + >= 0.1 + ): + model = Weighted_BetaBinom( + np.append( + unique_values[s][idx_nonzero, 0], + unique_values[s][idx_nonzero, 1] + - unique_values[s][idx_nonzero, 0], + ), + np.ones(2 * len(idx_nonzero)).reshape(-1, 1), + weights=np.append( + tmp[i, idx_nonzero], tmp[i + n_states, idx_nonzero] + ), + exposure=np.append( + unique_values[s][idx_nonzero, 1], + unique_values[s][idx_nonzero, 1], + ), + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = 
res.params[0] new_taus[i, s] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_p_binom[i, s]], [taus[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_taus[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -644,51 +961,104 @@ def update_emission_params_bb_sitewise_uniqvalues(unique_values, mapping_matrice features = [] state_posweights = [] for s in np.arange(len(unique_values)): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile( np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]), n_states) - this_y = np.tile( np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile( + np.append( + unique_values[s][idx_nonzero, 1], + unique_values[s][idx_nonzero, 1], + ), + n_states, + ) + this_y = np.tile( + np.append( + unique_values[s][idx_nonzero, 0], + unique_values[s][idx_nonzero, 1] + - unique_values[s][idx_nonzero, 0], + ), + n_states, + ) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_weights = np.concatenate([ np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]) for i in range(n_states) ]) - this_features = np.zeros((2*n_states*len(idx_nonzero), n_states)) + this_weights = np.concatenate( + [ + np.append(tmp[i, idx_nonzero], tmp[i + n_states, idx_nonzero]) + for i in range(n_states) + ] + ) + this_features = np.zeros((2 * n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*2*len(idx_nonzero)):((i+1)*2*len(idx_nonzero)), i] = 1 + this_features[ + (i * 2 * len(idx_nonzero)) : ((i + 1) * 2 * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) + idx_state_posweight = np.array( + [ + i + for i in range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) exposure = np.concatenate(exposure) y = np.concatenate(y) 
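# [editor's note] Illustrative sketch, not part of this patch: how the block-diagonal
# design matrix built just below (scipy.linalg.block_diag(*features)) lets a single
# Weighted_BetaBinom fit estimate one p parameter per (spot, state) column while
# sharing a single trailing dispersion tau. All demo_* names are hypothetical.
import numpy as np
import scipy.linalg

demo_n_states = 3
demo_rows_per_spot = [4, 5]                      # observations contributed by each spot
demo_features = []
for n_rows in demo_rows_per_spot:
    block = np.zeros((demo_n_states * n_rows, demo_n_states))
    for i in range(demo_n_states):
        block[i * n_rows:(i + 1) * n_rows, i] = 1.0   # indicator column for state i
    demo_features.append(block)
demo_design = scipy.linalg.block_diag(*demo_features)
print(demo_design.shape)                         # (27, 6): one column per (spot, state)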
weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] + new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_p_binom[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * taus[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob return new_p_binom, new_taus -def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, total_bb_RD, taus, tumor_prop, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): +def update_emission_params_bb_sitewise_uniqvalues_mix( + unique_values, + mapping_matrices, + log_gamma, + total_bb_RD, + taus, + tumor_prop, + start_p_binom=None, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + percent_threshold=0.99, + min_binom_prob=0.01, + max_binom_prob=0.99, +): """ Attributes ---------- @@ -705,52 +1075,122 @@ def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_mat n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) # initialization - new_p_binom = copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 + new_p_binom = ( + copy.copy(start_p_binom) + if not start_p_binom is None + else np.ones((n_states, n_spots)) * 0.5 + ) new_taus = copy.copy(taus) if fix_BB_dispersion: for s in np.arange(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) + np.sum(tmp[i+n_states,idx_nonzero]) >= 0.1: - this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ 
np.ones(tumor_prop.shape[0]))[idx_nonzero] - assert np.all(this_tp < 1+1e-4) - model = Weighted_BetaBinom_fixdispersion_mix(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ - np.ones(2*len(idx_nonzero)).reshape(-1,1), \ - taus[i,s], \ - weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ - exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]), \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s] ) + if ( + np.sum(tmp[i, idx_nonzero]) + np.sum(tmp[i + n_states, idx_nonzero]) + >= 0.1 + ): + this_tp = (mapping_matrices[s].T @ tumor_prop[:, s])[ + idx_nonzero + ] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ] + assert np.all(this_tp < 1 + 1e-4) + model = Weighted_BetaBinom_fixdispersion_mix( + np.append( + unique_values[s][idx_nonzero, 0], + unique_values[s][idx_nonzero, 1] + - unique_values[s][idx_nonzero, 0], + ), + np.ones(2 * len(idx_nonzero)).reshape(-1, 1), + taus[i, s], + weights=np.append( + tmp[i, idx_nonzero], tmp[i + n_states, idx_nonzero] + ), + exposure=np.append( + unique_values[s][idx_nonzero, 1], + unique_values[s][idx_nonzero, 1], + ), + tumor_prop=this_tp, + ) + # tumor_prop=tumor_prop[s] ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array(start_p_binom[i, s]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array(start_p_binom[i, s]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) else: if not shared_BB_dispersion: for s in np.arange(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) + np.sum(tmp[i+n_states,idx_nonzero]) >= 0.1: - this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] - assert np.all(this_tp < 1+1e-4) - model = Weighted_BetaBinom_mix(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ - np.ones(2*len(idx_nonzero)).reshape(-1,1), \ - weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ - exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]),\ - tumor_prop=this_tp) - # tumor_prop=tumor_prop ) + if ( + np.sum(tmp[i, idx_nonzero]) + + np.sum(tmp[i + n_states, idx_nonzero]) + >= 0.1 + ): + this_tp = (mapping_matrices[s].T @ tumor_prop[:, s])[ + idx_nonzero + ] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ] + assert np.all(this_tp < 1 + 1e-4) + model = Weighted_BetaBinom_mix( + np.append( + unique_values[s][idx_nonzero, 0], + unique_values[s][idx_nonzero, 1] + - unique_values[s][idx_nonzero, 0], + ), + np.ones(2 * len(idx_nonzero)).reshape(-1, 1), + weights=np.append( + tmp[i, idx_nonzero], tmp[i + n_states, idx_nonzero] + ), + exposure=np.append( + unique_values[s][idx_nonzero, 1], + unique_values[s][idx_nonzero, 
1], + ), + tumor_prop=this_tp, + ) + # tumor_prop=tumor_prop ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] new_taus[i, s] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_p_binom[i, s]], [taus[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_taus[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -759,48 +1199,98 @@ def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_mat state_posweights = [] tp = [] for s in np.arange(n_spots): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile( np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]), n_states) - this_y = np.tile( np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile( + np.append( + unique_values[s][idx_nonzero, 1], + unique_values[s][idx_nonzero, 1], + ), + n_states, + ) + this_y = np.tile( + np.append( + unique_values[s][idx_nonzero, 0], + unique_values[s][idx_nonzero, 1] + - unique_values[s][idx_nonzero, 0], + ), + n_states, + ) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_tp = np.tile( (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero], n_states) - assert np.all(this_tp < 1+1e-4) - this_weights = np.concatenate([ np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]) for i in range(n_states) ]) - this_features = np.zeros((2*n_states*len(idx_nonzero), n_states)) + this_tp = np.tile( + (mapping_matrices[s].T @ tumor_prop[:, s])[idx_nonzero] + / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ], + n_states, + ) + assert np.all(this_tp < 1 + 1e-4) + this_weights = np.concatenate( + [ + np.append(tmp[i, idx_nonzero], tmp[i + n_states, idx_nonzero]) + for i in range(n_states) + ] + ) + this_features = np.zeros((2 * n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*2*len(idx_nonzero)):((i+1)*2*len(idx_nonzero)), i] = 1 + this_features[ + (i * 2 * len(idx_nonzero)) : ((i + 1) * 2 * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) - tp.append( this_tp[idx_row_posweight] ) + idx_state_posweight = 
np.array( + [ + i + for i in range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) + tp.append(this_tp[idx_row_posweight]) # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_BetaBinom_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp) + model = Weighted_BetaBinom_mix( + y, features, weights=weights, exposure=exposure, tumor_prop=tp + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] + new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_p_binom[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * taus[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob return new_p_binom, new_taus @@ -811,16 +1301,18 @@ def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_mat ############################################################ @njit def update_startprob_nophasing(lengths, log_gamma): - ''' + """ Input lengths: sum of lengths = n_observations. log_gamma: size n_states * n_observations. gamma[i,t] = P(q_t = i | O, lambda). Output log_startprob: n_states. Start probability after loog transformation. - ''' + """ n_states = log_gamma.shape[0] n_obs = log_gamma.shape[1] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the second dimension of log_gamma!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the second dimension of log_gamma!" 
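# [editor's note] Illustrative sketch, not part of this patch: the start-probability
# M-step carried out below, in plain numpy/scipy. Posteriors (gamma) at the first
# observation of each sequence are summed in log space and renormalized.
# demo_* names are hypothetical.
import numpy as np
import scipy.special

def demo_update_startprob(lengths, log_gamma):
    # first index of each sequence, given per-sequence lengths
    starts = np.concatenate(([0], np.cumsum(lengths)[:-1]))
    log_startprob = scipy.special.logsumexp(log_gamma[:, starts], axis=1)
    # normalize so that start probabilities sum to 1
    return log_startprob - scipy.special.logsumexp(log_startprob)

demo_log_gamma = np.log(np.full((3, 6), 1.0 / 3))   # uniform posteriors, 2 sequences of length 3
print(np.exp(demo_update_startprob(np.array([3, 3]), demo_log_gamma)))   # ~[0.333 0.333 0.333]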
# indices of the start of sequences, given that the length of each sequence is in lengths cumlen = 0 indices_start = [] @@ -838,19 +1330,19 @@ def update_startprob_nophasing(lengths, log_gamma): def update_transition_nophasing(log_xi, is_diag=False): - ''' + """ Input log_xi: size (n_states) * (n_states) * n_observations. xi[i,j,t] = P(q_t=i, q_{t+1}=j | O, lambda) Output log_transmat: n_states * n_states. Transition probability after log transformation. - ''' + """ n_states = log_xi.shape[0] n_obs = log_xi.shape[2] # initialize log_transmat log_transmat = np.zeros((n_states, n_states)) for i in np.arange(n_states): for j in np.arange(n_states): - log_transmat[i, j] = scipy.special.logsumexp( log_xi[i, j, :] ) + log_transmat[i, j] = scipy.special.logsumexp(log_xi[i, j, :]) # row normalize log_transmat if not is_diag: for i in np.arange(n_states): @@ -860,14 +1352,23 @@ def update_transition_nophasing(log_xi, is_diag=False): diagsum = scipy.special.logsumexp(np.diag(log_transmat)) totalsum = scipy.special.logsumexp(log_transmat) t = diagsum - totalsum - rest = np.log( (1 - np.exp(t)) / (n_states-1) ) + rest = np.log((1 - np.exp(t)) / (n_states - 1)) log_transmat = np.ones(log_transmat.shape) * rest np.fill_diagonal(log_transmat, t) return log_transmat -def update_emission_params_nb_nophasing_uniqvalues(unique_values, mapping_matrices, log_gamma, alphas, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): +def update_emission_params_nb_nophasing_uniqvalues( + unique_values, + mapping_matrices, + log_gamma, + alphas, + start_log_mu=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + min_log_rdr=-2, + max_log_rdr=2, +): """ Attributes ---------- @@ -884,41 +1385,79 @@ def update_emission_params_nb_nophasing_uniqvalues(unique_values, mapping_matric n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) # initialization - new_log_mu = copy.copy(start_log_mu) if not start_log_mu is None else np.zeros((n_states, n_spots)) + new_log_mu = ( + copy.copy(start_log_mu) + if not start_log_mu is None + else np.zeros((n_states, n_spots)) + ) new_alphas = copy.copy(alphas) # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - model = sm.GLM(unique_values[s][idx_nonzero,0], np.ones(len(idx_nonzero)).reshape(-1,1), \ - family=sm.families.NegativeBinomial(alpha=alphas[i,s]), \ - exposure=unique_values[s][idx_nonzero,1], var_weights=tmp[i,idx_nonzero]) + model = sm.GLM( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + family=sm.families.NegativeBinomial(alpha=alphas[i, s]), + exposure=unique_values[s][idx_nonzero, 1], + var_weights=tmp[i, idx_nonzero], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array([start_log_mu[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if -model.loglike(res.params) < -model.loglike(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array([start_log_mu[i, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if -model.loglike(res.params) < -model.loglike(res2.params) + else res2.params[0] 
+ ) else: if not shared_NB_dispersion: for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - model = Weighted_NegativeBinomial(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1], \ - penalty=0) + model = Weighted_NegativeBinomial( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=tmp[i, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + penalty=0, + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] new_alphas[i, s] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_log_mu[i, s]], [alphas[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_alphas[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -926,50 +1465,88 @@ def update_emission_params_nb_nophasing_uniqvalues(unique_values, mapping_matric features = [] state_posweights = [] for s in range(n_spots): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile(unique_values[s][idx_nonzero,1], n_states) - this_y = np.tile(unique_values[s][idx_nonzero,0], n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) + this_y = np.tile(unique_values[s][idx_nonzero, 0], n_states) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_weights = np.concatenate([ tmp[i,idx_nonzero] for i in range(n_states) ]) - this_features = np.zeros((n_states*len(idx_nonzero), n_states)) + this_weights = np.concatenate( + [tmp[i, idx_nonzero] for i in range(n_states)] + ) + this_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 + this_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) + idx_state_posweight = np.array( + [ + i + for i in range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in 
idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - model = Weighted_NegativeBinomial(y, features, weights=weights, exposure=exposure) + model = Weighted_NegativeBinomial( + y, features, weights=weights, exposure=exposure + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] + new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_log_mu[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * alphas[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas -def update_emission_params_nb_nophasing_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, alphas, tumor_prop, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): +def update_emission_params_nb_nophasing_uniqvalues_mix( + unique_values, + mapping_matrices, + log_gamma, + alphas, + tumor_prop, + start_log_mu=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + min_log_rdr=-2, + max_log_rdr=2, +): """ Attributes ---------- @@ -986,42 +1563,85 @@ def update_emission_params_nb_nophasing_uniqvalues_mix(unique_values, mapping_ma n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) # initialization - new_log_mu = copy.copy(start_log_mu) if not start_log_mu is None else np.zeros((n_states, n_spots)) + new_log_mu = ( + copy.copy(start_log_mu) + if not start_log_mu is None + else np.zeros((n_states, n_spots)) + ) new_alphas = copy.copy(alphas) # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = 
np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - model = sm.GLM(unique_values[s][idx_nonzero,0], np.ones(len(idx_nonzero)).reshape(-1,1), \ - family=sm.families.NegativeBinomial(alpha=alphas[i,s]), \ - exposure=unique_values[s][idx_nonzero,1], var_weights=tmp[i,idx_nonzero]) + model = sm.GLM( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + family=sm.families.NegativeBinomial(alpha=alphas[i, s]), + exposure=unique_values[s][idx_nonzero, 1], + var_weights=tmp[i, idx_nonzero], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array([start_log_mu[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if -model.loglike(res.params) < -model.loglike(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array([start_log_mu[i, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if -model.loglike(res.params) < -model.loglike(res2.params) + else res2.params[0] + ) else: if not shared_NB_dispersion: for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] - model = Weighted_NegativeBinomial_mix(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=tmp[i,idx_nonzero], exposure=unique_values[s][idx_nonzero,1], \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s], penalty=0) + this_tp = (mapping_matrices[s].T @ tumor_prop[:, s])[ + idx_nonzero + ] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ] + model = Weighted_NegativeBinomial_mix( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=tmp[i, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + tumor_prop=this_tp, + ) + # tumor_prop=tumor_prop[s], penalty=0) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] new_alphas[i, s] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_log_mu[i, s]], [alphas[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_alphas[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -1030,56 +1650,104 @@ def update_emission_params_nb_nophasing_uniqvalues_mix(unique_values, mapping_ma state_posweights = [] tp = [] for s in range(n_spots): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile(unique_values[s][idx_nonzero,1], n_states) - this_y = 
np.tile(unique_values[s][idx_nonzero,0], n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) + this_y = np.tile(unique_values[s][idx_nonzero, 0], n_states) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_tp = np.tile( (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero], n_states) + this_tp = np.tile( + (mapping_matrices[s].T @ tumor_prop[:, s])[idx_nonzero] + / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ], + n_states, + ) assert np.all(this_tp < 1 + 1e-4) - this_weights = np.concatenate([ tmp[i,idx_nonzero] for i in range(n_states) ]) - this_features = np.zeros((n_states*len(idx_nonzero), n_states)) + this_weights = np.concatenate( + [tmp[i, idx_nonzero] for i in range(n_states)] + ) + this_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 + this_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) - tp.append( this_tp[idx_row_posweight] ) + idx_state_posweight = np.array( + [ + i + for i in range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) + tp.append(this_tp[idx_row_posweight]) # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_NegativeBinomial_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp, penalty=0) + model = Weighted_NegativeBinomial_mix( + y, + features, + weights=weights, + exposure=exposure, + tumor_prop=tp, + penalty=0, + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] + new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for 
s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_log_mu[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * alphas[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas -def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matrices, log_gamma, taus, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): +def update_emission_params_bb_nophasing_uniqvalues( + unique_values, + mapping_matrices, + log_gamma, + taus, + start_p_binom=None, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + percent_threshold=0.99, + min_binom_prob=0.01, + max_binom_prob=0.99, +): """ Attributes ---------- @@ -1096,44 +1764,81 @@ def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matric n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) # initialization - new_p_binom = copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 + new_p_binom = ( + copy.copy(start_p_binom) + if not start_p_binom is None + else np.ones((n_states, n_spots)) * 0.5 + ) new_taus = copy.copy(taus) if fix_BB_dispersion: for s in np.arange(len(unique_values)): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) >= 0.1: - model = Weighted_BetaBinom_fixdispersion(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - taus[i,s], \ - weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1] ) + if np.sum(tmp[i, idx_nonzero]) >= 0.1: + model = Weighted_BetaBinom_fixdispersion( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + taus[i, s], + weights=tmp[i, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array(start_p_binom[i, s]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array(start_p_binom[i, s]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) 
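# A standalone sketch of the weighted objective the Weighted_BetaBinom fits below are
# assumed to minimize: each row's Beta-Binomial is parameterized as
# (a, b) = (p * tau, (1 - p) * tau) with p = exog @ params[:-1] and tau = params[-1],
# and the posterior state probabilities act as observation weights. This is an
# illustration under those assumptions, not the library code itself.
#
#   import numpy as np
#   from scipy.stats import betabinom
#
#   def weighted_betabinom_nll(params, endog, exog, exposure, weights):
#       p = exog @ params[:-1]              # per-observation BAF
#       tau = params[-1]                    # dispersion
#       a, b = p * tau, (1.0 - p) * tau
#       return -np.sum(weights * betabinom.logpmf(endog, exposure, a, b))
#
# The *_mix variants in this patch additionally pass a per-observation tumor_prop;
# consistent with how tumor_prop is used here, the effective BAF is assumed to be
# p_eff = tumor_prop * p + 0.5 * (1 - tumor_prop), i.e. normal cells pull BAF toward 0.5.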
else: if not shared_BB_dispersion: for s in np.arange(len(unique_values)): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) >= 0.1: - model = Weighted_BetaBinom(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1] ) + if np.sum(tmp[i, idx_nonzero]) >= 0.1: + model = Weighted_BetaBinom( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=tmp[i, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] new_taus[i, s] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_p_binom[i, s]], [taus[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_taus[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -1141,52 +1846,88 @@ def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matric features = [] state_posweights = [] for s in np.arange(len(unique_values)): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile( unique_values[s][idx_nonzero,1], n_states) - this_y = np.tile( unique_values[s][idx_nonzero,0], n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) + this_y = np.tile(unique_values[s][idx_nonzero, 0], n_states) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_weights = np.concatenate([ tmp[i,idx_nonzero] for i in range(n_states) ]) - this_features = np.zeros((n_states*len(idx_nonzero), n_states)) + this_weights = np.concatenate( + [tmp[i, idx_nonzero] for i in range(n_states)] + ) + this_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 + this_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) + idx_state_posweight = np.array( + [ + i + for i in 
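# A toy sketch of the block-diagonal design built by the shared-dispersion branch
# below: observations are replicated once per state, an indicator feature matrix
# selects one p_binom coefficient per state, and a single trailing parameter (the
# shared tau) applies to every row. Shapes here are illustrative only.
#
#   import numpy as np
#   import scipy.linalg
#
#   n_states, n_snps = 3, 4
#   features = scipy.linalg.block_diag(*[np.ones((n_snps, 1)) for _ in range(n_states)])
#   features.shape   # (12, 3): row block i has a 1 in column i only, so
#                    # params[:-1] are per-state BAFs and params[-1] is the shared tau.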
range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] + new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_p_binom[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * taus[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob return new_p_binom, new_taus -def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, taus, tumor_prop, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): +def update_emission_params_bb_nophasing_uniqvalues_mix( + unique_values, + mapping_matrices, + log_gamma, + taus, + tumor_prop, + start_p_binom=None, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + percent_threshold=0.99, + min_binom_prob=0.01, + max_binom_prob=0.99, +): """ Attributes ---------- @@ -1203,52 +1944,97 @@ def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_ma n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) # initialization - new_p_binom = copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 + new_p_binom = ( + copy.copy(start_p_binom) + if not start_p_binom is None + else np.ones((n_states, n_spots)) * 0.5 + ) new_taus = copy.copy(taus) if fix_BB_dispersion: for s in 
np.arange(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) >= 0.1: - this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] + if np.sum(tmp[i, idx_nonzero]) >= 0.1: + this_tp = (mapping_matrices[s].T @ tumor_prop[:, s])[ + idx_nonzero + ] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ] assert np.all(this_tp < 1 + 1e-4) - model = Weighted_BetaBinom_fixdispersion_mix(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - taus[i,s], \ - weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1], \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s] ) + model = Weighted_BetaBinom_fixdispersion_mix( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + taus[i, s], + weights=tmp[i, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + tumor_prop=this_tp, + ) + # tumor_prop=tumor_prop[s] ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array(start_p_binom[i, s]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array(start_p_binom[i, s]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) else: if not shared_BB_dispersion: for s in np.arange(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) >= 0.1: - this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] + if np.sum(tmp[i, idx_nonzero]) >= 0.1: + this_tp = (mapping_matrices[s].T @ tumor_prop[:, s])[ + idx_nonzero + ] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ] assert np.all(this_tp < 1 + 1e-4) - model = Weighted_BetaBinom_mix(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1], \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s] ) + model = Weighted_BetaBinom_mix( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=tmp[i, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + tumor_prop=this_tp, + ) + # tumor_prop=tumor_prop[s] ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] new_taus[i, s] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - 
new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_p_binom[i, s]], [taus[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_taus[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -1257,49 +2043,82 @@ def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_ma state_posweights = [] tp = [] for s in np.arange(n_spots): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile( unique_values[s][idx_nonzero,1], n_states) - this_y = np.tile( unique_values[s][idx_nonzero,0], n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) + this_y = np.tile(unique_values[s][idx_nonzero, 0], n_states) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_tp = np.tile( (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero], n_states) + this_tp = np.tile( + (mapping_matrices[s].T @ tumor_prop[:, s])[idx_nonzero] + / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ], + n_states, + ) assert np.all(this_tp < 1 + 1e-4) - this_weights = np.concatenate([ tmp[i,idx_nonzero] for i in range(n_states) ]) - this_features = np.zeros((n_states*len(idx_nonzero), n_states)) + this_weights = np.concatenate( + [tmp[i, idx_nonzero] for i in range(n_states)] + ) + this_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 + this_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) - tp.append( this_tp[idx_row_posweight] ) + idx_state_posweight = np.array( + [ + i + for i in range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) + tp.append(this_tp[idx_row_posweight]) # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_BetaBinom_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp) + model = 
Weighted_BetaBinom_mix( + y, features, weights=weights, exposure=exposure, tumor_prop=tp + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] + new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_p_binom[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * taus[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob return new_p_binom, new_taus - diff --git a/src/calicost/utils_hmrf.py b/src/calicost/utils_hmrf.py index bee9f42..13c6830 100644 --- a/src/calicost/utils_hmrf.py +++ b/src/calicost/utils_hmrf.py @@ -13,13 +13,15 @@ def compute_adjacency_mat(coords, unit_xsquared=9, unit_ysquared=3): # pairwise distance - x_dist = coords[:,0][None,:] - coords[:,0][:,None] - y_dist = coords[:,1][None,:] - coords[:,1][:,None] + x_dist = coords[:, 0][None, :] - coords[:, 0][:, None] + y_dist = coords[:, 1][None, :] - coords[:, 1][:, None] pairwise_squared_dist = x_dist**2 * unit_xsquared + y_dist**2 * unit_ysquared # adjacency - A = np.zeros( (coords.shape[0], coords.shape[0]), dtype=np.int8 ) + A = np.zeros((coords.shape[0], coords.shape[0]), dtype=np.int8) for i in range(coords.shape[0]): - indexes = np.where(pairwise_squared_dist[i,:] <= unit_xsquared + unit_ysquared)[0] + indexes = np.where( + pairwise_squared_dist[i, :] <= unit_xsquared + unit_ysquared + )[0] indexes = np.array([j for j in indexes if j != i]) if len(indexes) > 0: A[i, indexes] = 1 @@ -29,13 +31,15 @@ def compute_adjacency_mat(coords, unit_xsquared=9, unit_ysquared=3): def compute_adjacency_mat_v2(coords, unit_xsquared=9, unit_ysquared=3, ratio=1): # pairwise distance - x_dist = coords[:,0][None,:] - coords[:,0][:,None] - y_dist = coords[:,1][None,:] - coords[:,1][:,None] + x_dist = coords[:, 0][None, :] - coords[:, 0][:, None] + y_dist = coords[:, 1][None, :] - coords[:, 1][:, None] pairwise_squared_dist = x_dist**2 * unit_xsquared + y_dist**2 * unit_ysquared # adjacency - A = np.zeros( (coords.shape[0], coords.shape[0]), dtype=np.int8 ) + A = np.zeros((coords.shape[0], coords.shape[0]), 
dtype=np.int8) for i in range(coords.shape[0]): - indexes = np.where(pairwise_squared_dist[i,:] <= ratio * (unit_xsquared + unit_ysquared))[0] + indexes = np.where( + pairwise_squared_dist[i, :] <= ratio * (unit_xsquared + unit_ysquared) + )[0] indexes = np.array([j for j in indexes if j != i]) if len(indexes) > 0: A[i, indexes] = 1 @@ -43,44 +47,60 @@ def compute_adjacency_mat_v2(coords, unit_xsquared=9, unit_ysquared=3, ratio=1): return A -def compute_weighted_adjacency(coords, unit_xsquared=9, unit_ysquared=3, bandwidth=12, decay=5): +def compute_weighted_adjacency( + coords, unit_xsquared=9, unit_ysquared=3, bandwidth=12, decay=5 +): # pairwise distance - x_dist = coords[:,0][None,:] - coords[:,0][:,None] - y_dist = coords[:,1][None,:] - coords[:,1][:,None] + x_dist = coords[:, 0][None, :] - coords[:, 0][:, None] + y_dist = coords[:, 1][None, :] - coords[:, 1][:, None] pairwise_squared_dist = x_dist**2 * unit_xsquared + y_dist**2 * unit_ysquared - kern = np.exp(-(pairwise_squared_dist / bandwidth)**decay) + kern = np.exp(-((pairwise_squared_dist / bandwidth) ** decay)) # adjacency - A = np.zeros( (coords.shape[0], coords.shape[0]) ) + A = np.zeros((coords.shape[0], coords.shape[0])) for i in range(coords.shape[0]): - indexes = np.where(kern[i,:] > 1e-4)[0] + indexes = np.where(kern[i, :] > 1e-4)[0] indexes = np.array([j for j in indexes if j != i]) if len(indexes) > 0: - A[i, indexes] = kern[i,indexes] + A[i, indexes] = kern[i, indexes] A = scipy.sparse.csr_matrix(A) return A -def choose_adjacency_by_readcounts(coords, single_total_bb_RD, maxspots_pooling=7, unit_xsquared=9, unit_ysquared=3): -# def choose_adjacency_by_readcounts(coords, single_total_bb_RD, count_threshold=4000, unit_xsquared=9, unit_ysquared=3): +def choose_adjacency_by_readcounts( + coords, single_total_bb_RD, maxspots_pooling=7, unit_xsquared=9, unit_ysquared=3 +): + # def choose_adjacency_by_readcounts(coords, single_total_bb_RD, count_threshold=4000, unit_xsquared=9, unit_ysquared=3): # XXX: change from count_threshold 500 to 3000 # pairwise distance - x_dist = coords[:,0][None,:] - coords[:,0][:,None] - y_dist = coords[:,1][None,:] - coords[:,1][:,None] + x_dist = coords[:, 0][None, :] - coords[:, 0][:, None] + y_dist = coords[:, 1][None, :] - coords[:, 1][:, None] tmp_pairwise_squared_dist = x_dist**2 * unit_xsquared + y_dist**2 * unit_ysquared np.fill_diagonal(tmp_pairwise_squared_dist, np.max(tmp_pairwise_squared_dist)) - base_ratio = np.median(np.min(tmp_pairwise_squared_dist, axis=0)) / (unit_xsquared + unit_ysquared) + base_ratio = np.median(np.min(tmp_pairwise_squared_dist, axis=0)) / ( + unit_xsquared + unit_ysquared + ) s_ratio = 0 for ratio in range(0, 10): - smooth_mat = compute_adjacency_mat_v2(coords, unit_xsquared, unit_ysquared, ratio * base_ratio) + smooth_mat = compute_adjacency_mat_v2( + coords, unit_xsquared, unit_ysquared, ratio * base_ratio + ) smooth_mat.setdiag(1) if np.median(np.sum(smooth_mat > 0, axis=0).A.flatten()) > maxspots_pooling: s_ratio = ratio - 1 break s_ratio = ratio - smooth_mat = compute_adjacency_mat_v2(coords, unit_xsquared, unit_ysquared, s_ratio * base_ratio) + smooth_mat = compute_adjacency_mat_v2( + coords, unit_xsquared, unit_ysquared, s_ratio * base_ratio + ) smooth_mat.setdiag(1) - for bandwidth in np.arange(unit_xsquared + unit_ysquared, 15*(unit_xsquared + unit_ysquared), unit_xsquared + unit_ysquared): - adjacency_mat = compute_weighted_adjacency(coords, unit_xsquared, unit_ysquared, bandwidth=bandwidth) + for bandwidth in np.arange( + unit_xsquared + 
unit_ysquared, + 15 * (unit_xsquared + unit_ysquared), + unit_xsquared + unit_ysquared, + ): + adjacency_mat = compute_weighted_adjacency( + coords, unit_xsquared, unit_ysquared, bandwidth=bandwidth + ) adjacency_mat.setdiag(1) adjacency_mat = adjacency_mat - smooth_mat adjacency_mat[adjacency_mat < 0] = 0 @@ -93,7 +113,7 @@ def choose_adjacency_by_readcounts(coords, single_total_bb_RD, maxspots_pooling= def choose_adjacency_by_KNN(coords, exp_counts=None, w=1, maxspots_pooling=7): """ Compute adjacency matrix for pooling and for HMRF by KNN of pairwise spatial distance + pairwise expression distance. - + Attributes ---------- coords : array, shape (n_spots, 2) @@ -111,32 +131,48 @@ def choose_adjacency_by_KNN(coords, exp_counts=None, w=1, maxspots_pooling=7): n_spots = coords.shape[0] # pairwise expression distance if exp_counts is not None - pair_exp_dist = scipy.sparse.csr_matrix( np.zeros((n_spots,n_spots)) ) + pair_exp_dist = scipy.sparse.csr_matrix(np.zeros((n_spots, n_spots))) scaling_factor = 1 if not exp_counts is None: - adata = anndata.AnnData( pd.DataFrame(exp_counts) ) - sc.pp.normalize_total(adata, target_sum=np.median(np.sum(exp_counts.values,axis=1)) ) + adata = anndata.AnnData(pd.DataFrame(exp_counts)) + sc.pp.normalize_total( + adata, target_sum=np.median(np.sum(exp_counts.values, axis=1)) + ) sc.pp.log1p(adata) sc.tl.pca(adata) - pair_exp_dist = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(adata.obsm["X_pca"])) + pair_exp_dist = scipy.spatial.distance.squareform( + scipy.spatial.distance.pdist(adata.obsm["X_pca"]) + ) # compute the scaling factor to normalize coords such that it has the same sum of variance as PCA var_coord = np.sum(np.var(coords, axis=0)) var_pca = np.sum(np.var(adata.obsm["X_pca"], axis=0)) EPS = 1e-4 - scaling_factor = np.sqrt(var_coord / var_pca) if var_coord > EPS and var_pca > EPS else 1 + scaling_factor = ( + np.sqrt(var_coord / var_pca) if var_coord > EPS and var_pca > EPS else 1 + ) # pairwise spatial distance - pair_spatial_dist = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(coords / scaling_factor)) + pair_spatial_dist = scipy.spatial.distance.squareform( + scipy.spatial.distance.pdist(coords / scaling_factor) + ) # adjacency for pooling - smooth_mat = NearestNeighbors(n_neighbors=maxspots_pooling, metric='precomputed').fit(w * pair_spatial_dist + (1-w) * pair_exp_dist).kneighbors_graph() - smooth_mat.setdiag(1) # include self adjacency + smooth_mat = ( + NearestNeighbors(n_neighbors=maxspots_pooling, metric="precomputed") + .fit(w * pair_spatial_dist + (1 - w) * pair_exp_dist) + .kneighbors_graph() + ) + smooth_mat.setdiag(1) # include self adjacency # adjacency for HMRF - adjacency_mat = NearestNeighbors(n_neighbors=maxspots_pooling + 6, metric='precomputed').fit(w * pair_spatial_dist + (1-w) * pair_exp_dist).kneighbors_graph() + adjacency_mat = ( + NearestNeighbors(n_neighbors=maxspots_pooling + 6, metric="precomputed") + .fit(w * pair_spatial_dist + (1 - w) * pair_exp_dist) + .kneighbors_graph() + ) adjacency_mat = adjacency_mat - smooth_mat adjacency_mat[adjacency_mat < 0] = 0 - adjacency_mat.setdiag(1) # include self adjacency + adjacency_mat.setdiag(1) # include self adjacency return smooth_mat, adjacency_mat @@ -150,34 +186,53 @@ def choose_adjacency_by_readcounts_slidedna(coords, maxspots_pooling=30): return smooth_mat, adjacency_mat -def multislice_adjacency(sample_ids, sample_list, coords, single_total_bb_RD, exp_counts, across_slice_adjacency_mat, construct_adjacency_method, maxspots_pooling, 
construct_adjacency_w): +def multislice_adjacency( + sample_ids, + sample_list, + coords, + single_total_bb_RD, + exp_counts, + across_slice_adjacency_mat, + construct_adjacency_method, + maxspots_pooling, + construct_adjacency_w, +): adjacency_mat = [] smooth_mat = [] - for i,sname in enumerate(sample_list): + for i, sname in enumerate(sample_list): index = np.where(sample_ids == i)[0] - this_coords = np.array(coords[index,:]) + this_coords = np.array(coords[index, :]) if construct_adjacency_method == "hexagon": - tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_readcounts(this_coords, single_total_bb_RD[:,index], maxspots_pooling=maxspots_pooling) + tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_readcounts( + this_coords, + single_total_bb_RD[:, index], + maxspots_pooling=maxspots_pooling, + ) elif construct_adjacency_method == "KNN": - tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_KNN(this_coords, exp_counts.iloc[index,:], w=construct_adjacency_w, maxspots_pooling=maxspots_pooling) + tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_KNN( + this_coords, + exp_counts.iloc[index, :], + w=construct_adjacency_w, + maxspots_pooling=maxspots_pooling, + ) else: - raise("Unknown adjacency construction method") + raise ("Unknown adjacency construction method") # tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_readcounts_slidedna(this_coords, maxspots_pooling=config["maxspots_pooling"]) - adjacency_mat.append( tmpadjacency_mat.A ) - smooth_mat.append( tmpsmooth_mat.A ) + adjacency_mat.append(tmpadjacency_mat.A) + smooth_mat.append(tmpsmooth_mat.A) adjacency_mat = scipy.linalg.block_diag(*adjacency_mat) - adjacency_mat = scipy.sparse.csr_matrix( adjacency_mat ) + adjacency_mat = scipy.sparse.csr_matrix(adjacency_mat) if not across_slice_adjacency_mat is None: adjacency_mat += across_slice_adjacency_mat smooth_mat = scipy.linalg.block_diag(*smooth_mat) - smooth_mat = scipy.sparse.csr_matrix( smooth_mat ) + smooth_mat = scipy.sparse.csr_matrix(smooth_mat) return adjacency_mat, smooth_mat def rectangle_initialize_initial_clone(coords, n_clones, random_state=0): """ Initialize clone assignment by partition space into p * p blocks (s.t. p * p >= n_clones), and assign each block a clone id. 
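    A hypothetical call under assumed shapes, returning one spot-index array per
    clone (the function name comes from this file; the coordinates are made up):

        >>> import numpy as np
        >>> coords = np.random.rand(100, 2) * 50
        >>> initial_clone_index = rectangle_initialize_initial_clone(coords, n_clones=3)
        >>> len(initial_clone_index)
        3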
- + Attributes ---------- coords : array, shape (n_spots, 2) @@ -194,18 +249,18 @@ def rectangle_initialize_initial_clone(coords, n_clones, random_state=0): np.random.seed(random_state) p = int(np.ceil(np.sqrt(n_clones))) # partition the range of x and y axes - px = np.random.dirichlet( np.ones(p) * 10 ) + px = np.random.dirichlet(np.ones(p) * 10) px[-1] += 1e-4 - xrange = [np.percentile(coords[:,0], 5), np.percentile(coords[:,0], 95)] + xrange = [np.percentile(coords[:, 0], 5), np.percentile(coords[:, 0], 95)] xboundary = xrange[0] + (xrange[1] - xrange[0]) * np.cumsum(px) - xboundary[-1] = np.max(coords[:,0]) + 1 - xdigit = np.digitize(coords[:,0], xboundary, right=True) - py = np.random.dirichlet( np.ones(p) * 10 ) + xboundary[-1] = np.max(coords[:, 0]) + 1 + xdigit = np.digitize(coords[:, 0], xboundary, right=True) + py = np.random.dirichlet(np.ones(p) * 10) py[-1] += 1e-4 - yrange = [np.percentile(coords[:,1], 5), np.percentile(coords[:,1], 95)] + yrange = [np.percentile(coords[:, 1], 5), np.percentile(coords[:, 1], 95)] yboundary = yrange[0] + (yrange[1] - yrange[0]) * np.cumsum(py) - yboundary[-1] = np.max(coords[:,1]) + 1 - ydigit = np.digitize(coords[:,1], yboundary, right=True) + yboundary[-1] = np.max(coords[:, 1]) + 1 + ydigit = np.digitize(coords[:, 1], yboundary, right=True) block_id = xdigit * p + ydigit # assigning blocks to clone (note that if sqrt(n_clone) is not an integer, multiple blocks can be assigneed to one clone) # block_clone_map = np.random.randint(low=0, high=n_clones, size=p**2) @@ -220,109 +275,137 @@ def rectangle_initialize_initial_clone(coords, n_clones, random_state=0): block_clone_map = np.random.randint(low=0, high=n_clones, size=p**2) while len(np.unique(block_clone_map)) < n_clones: bc = np.bincount(block_clone_map, minlength=n_clones) - assert np.any(bc==0) - block_clone_map[np.where(block_clone_map==np.argmax(bc))[0][0]] = np.where(bc==0)[0][0] - block_clone_map = {i:block_clone_map[i] for i in range(len(block_clone_map))} + assert np.any(bc == 0) + block_clone_map[np.where(block_clone_map == np.argmax(bc))[0][0]] = ( + np.where(bc == 0)[0][0] + ) + block_clone_map = {i: block_clone_map[i] for i in range(len(block_clone_map))} clone_id = np.array([block_clone_map[i] for i in block_id]) initial_clone_index = [np.where(clone_id == i)[0] for i in range(n_clones)] - if np.min([len(x) for x in initial_clone_index]) > 0.2 * coords.shape[0] / n_clones: + if ( + np.min([len(x) for x in initial_clone_index]) + > 0.2 * coords.shape[0] / n_clones + ): break return initial_clone_index def fixed_rectangle_initialization(coords, x_part, y_part): # - px = np.linspace(0, 1, x_part+1) + px = np.linspace(0, 1, x_part + 1) px[-1] += 0.01 px = px[1:] - xrange = [np.min(coords[:,0]), np.max(coords[:,0])] - xdigit = np.digitize(coords[:,0], xrange[0] + (xrange[1] - xrange[0]) * px, right=True) + xrange = [np.min(coords[:, 0]), np.max(coords[:, 0])] + xdigit = np.digitize( + coords[:, 0], xrange[0] + (xrange[1] - xrange[0]) * px, right=True + ) # - py = np.linspace(0, 1, y_part+1) + py = np.linspace(0, 1, y_part + 1) py[-1] += 0.01 py = py[1:] - yrange = [np.min(coords[:,1]), np.max(coords[:,1])] - ydigit = np.digitize(coords[:,1], yrange[0] + (yrange[1] - yrange[0]) * py, right=True) + yrange = [np.min(coords[:, 1]), np.max(coords[:, 1])] + ydigit = np.digitize( + coords[:, 1], yrange[0] + (yrange[1] - yrange[0]) * py, right=True + ) # initial_clone_index = [] for xid in range(x_part): for yid in range(y_part): - initial_clone_index.append( np.where((xdigit == xid) & 
(ydigit == yid))[0] ) + initial_clone_index.append(np.where((xdigit == xid) & (ydigit == yid))[0]) return initial_clone_index -def merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index): +def merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index +): n_obs = single_X.shape[0] n_spots = len(clone_index) X = np.zeros((n_obs, 2, n_spots)) base_nb_mean = np.zeros((n_obs, n_spots)) total_bb_RD = np.zeros((n_obs, n_spots)) - for k,idx in enumerate(clone_index): + for k, idx in enumerate(clone_index): if len(idx) == 0: continue - X[:,:, k] = np.sum(single_X[:,:,idx], axis=2) + X[:, :, k] = np.sum(single_X[:, :, idx], axis=2) base_nb_mean[:, k] = np.sum(single_base_nb_mean[:, idx], axis=1) total_bb_RD[:, k] = np.sum(single_total_bb_RD[:, idx], axis=1) return X, base_nb_mean, total_bb_RD -def rectangle_initialize_initial_clone_mix(coords, n_clones, single_tumor_prop, threshold=0.5, random_state=0, EPS=1e-8): +def rectangle_initialize_initial_clone_mix( + coords, n_clones, single_tumor_prop, threshold=0.5, random_state=0, EPS=1e-8 +): np.random.seed(random_state) p = int(np.ceil(np.sqrt(n_clones))) # partition the range of x and y axes based on tumor spots coordinates idx_tumor = np.where(single_tumor_prop > threshold)[0] - px = np.random.dirichlet( np.ones(p) * 10 ) + px = np.random.dirichlet(np.ones(p) * 10) px[-1] -= EPS - xboundary = np.percentile(coords[idx_tumor, 0], 100*np.cumsum(px)) - xboundary[-1] = np.max(coords[:,0]) + 1 - xdigit = np.digitize(coords[:,0], xboundary, right=True) + xboundary = np.percentile(coords[idx_tumor, 0], 100 * np.cumsum(px)) + xboundary[-1] = np.max(coords[:, 0]) + 1 + xdigit = np.digitize(coords[:, 0], xboundary, right=True) ydigit = np.zeros(coords.shape[0], dtype=int) for x in range(p): - idx_tumor = np.where((single_tumor_prop > threshold) & (xdigit==x))[0] + idx_tumor = np.where((single_tumor_prop > threshold) & (xdigit == x))[0] idx_both = np.where(xdigit == x)[0] - py = np.random.dirichlet( np.ones(p) * 10 ) + py = np.random.dirichlet(np.ones(p) * 10) py[-1] -= EPS - yboundary = np.percentile(coords[idx_tumor, 1], 100*np.cumsum(py)) - yboundary[-1] = np.max(coords[:,1]) + 1 - ydigit[idx_both] = np.digitize(coords[idx_both,1], yboundary, right=True) + yboundary = np.percentile(coords[idx_tumor, 1], 100 * np.cumsum(py)) + yboundary[-1] = np.max(coords[:, 1]) + 1 + ydigit[idx_both] = np.digitize(coords[idx_both, 1], yboundary, right=True) block_id = xdigit * p + ydigit # assigning blocks to clone (note that if sqrt(n_clone) is not an integer, multiple blocks can be assigneed to one clone) block_clone_map = np.random.randint(low=0, high=n_clones, size=p**2) while len(np.unique(block_clone_map)) < n_clones: bc = np.bincount(block_clone_map, minlength=n_clones) - assert np.any(bc==0) - block_clone_map[np.where(block_clone_map==np.argmax(bc))[0][0]] = np.where(bc==0)[0][0] - block_clone_map = {i:block_clone_map[i] for i in range(len(block_clone_map))} + assert np.any(bc == 0) + block_clone_map[np.where(block_clone_map == np.argmax(bc))[0][0]] = np.where( + bc == 0 + )[0][0] + block_clone_map = {i: block_clone_map[i] for i in range(len(block_clone_map))} clone_id = np.array([block_clone_map[i] for i in block_id]) initial_clone_index = [np.where(clone_id == i)[0] for i in range(n_clones)] return initial_clone_index -def fixed_rectangle_initialization_mix(coords, x_part, y_part, single_tumor_prop, threshold=0.5): +def fixed_rectangle_initialization_mix( + coords, x_part, y_part, 
single_tumor_prop, threshold=0.5 +): idx_tumor = np.where(single_tumor_prop > threshold)[0] # - px = np.linspace(0, 1, x_part+1) + px = np.linspace(0, 1, x_part + 1) px[-1] += 0.01 px = px[1:] - xrange = [np.min(coords[idx_tumor,0]), np.max(coords[idx_tumor,0])] - xdigit = np.digitize(coords[:,0], xrange[0] + (xrange[1] - xrange[0]) * px, right=True) + xrange = [np.min(coords[idx_tumor, 0]), np.max(coords[idx_tumor, 0])] + xdigit = np.digitize( + coords[:, 0], xrange[0] + (xrange[1] - xrange[0]) * px, right=True + ) # - py = np.linspace(0, 1, y_part+1) + py = np.linspace(0, 1, y_part + 1) py[-1] += 0.01 py = py[1:] - yrange = [np.min(coords[idx_tumor,1]), np.max(coords[idx_tumor,1])] - ydigit = np.digitize(coords[:,1], yrange[0] + (yrange[1] - yrange[0]) * py, right=True) + yrange = [np.min(coords[idx_tumor, 1]), np.max(coords[idx_tumor, 1])] + ydigit = np.digitize( + coords[:, 1], yrange[0] + (yrange[1] - yrange[0]) * py, right=True + ) # initial_clone_index = [] for xid in range(x_part): for yid in range(y_part): - initial_clone_index.append( np.where((xdigit == xid) & (ydigit == yid))[0] ) + initial_clone_index.append(np.where((xdigit == xid) & (ydigit == yid))[0]) return initial_clone_index -def merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, clone_index, single_tumor_prop, threshold=0.5): +def merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + threshold=0.5, +): n_obs = single_X.shape[0] n_spots = len(clone_index) X = np.zeros((n_obs, 2, n_spots)) @@ -330,11 +413,11 @@ def merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb total_bb_RD = np.zeros((n_obs, n_spots)) tumor_prop = np.zeros(n_spots) - for k,idx in enumerate(clone_index): + for k, idx in enumerate(clone_index): if len(idx) == 0: continue idx = idx[np.where(single_tumor_prop[idx] > threshold)[0]] - X[:,:, k] = np.sum(single_X[:,:,idx], axis=2) + X[:, :, k] = np.sum(single_X[:, :, idx], axis=2) base_nb_mean[:, k] = np.sum(single_base_nb_mean[:, idx], axis=1) total_bb_RD[:, k] = np.sum(single_total_bb_RD[:, idx], axis=1) tumor_prop[k] = np.mean(single_tumor_prop[idx]) if len(idx) > 0 else 0 @@ -352,13 +435,19 @@ def reorder_results(res_combine, posterior, single_tumor_prop): if single_tumor_prop is None: # select near-normal clone and set to clone 0 pred_cnv = res_combine["pred_cnv"] - baf_profiles = np.array([ res_combine["new_p_binom"][pred_cnv[:,c], c] for c in range(n_clones) ]) - cid_normal = np.argmin(np.sum( np.maximum(np.abs(baf_profiles - 0.5)-EPS_BAF, 0), axis=1)) + baf_profiles = np.array( + [res_combine["new_p_binom"][pred_cnv[:, c], c] for c in range(n_clones)] + ) + cid_normal = np.argmin( + np.sum(np.maximum(np.abs(baf_profiles - 0.5) - EPS_BAF, 0), axis=1) + ) cid_rest = np.array([c for c in range(n_clones) if c != cid_normal]).astype(int) reidx = np.append(cid_normal, cid_rest) - map_reidx = {cid:i for i,cid in enumerate(reidx)} + map_reidx = {cid: i for i, cid in enumerate(reidx)} # re-order entries in res_combine - new_res_combine["new_assignment"] = np.array([ map_reidx[c] for c in res_combine["new_assignment"] ]) + new_res_combine["new_assignment"] = np.array( + [map_reidx[c] for c in res_combine["new_assignment"]] + ) new_res_combine["new_log_mu"] = res_combine["new_log_mu"][:, reidx] new_res_combine["new_alphas"] = res_combine["new_alphas"][:, reidx] new_res_combine["new_p_binom"] = res_combine["new_p_binom"][:, reidx] @@ -369,59 +458,109 @@ def reorder_results(res_combine, 
posterior, single_tumor_prop): else: # add normal clone as clone 0 new_res_combine["new_assignment"] = new_res_combine["new_assignment"] + 1 - new_res_combine["new_log_mu"] = np.hstack([np.zeros((n_states,1)), res_combine["new_log_mu"]]) - new_res_combine["new_alphas"] = np.hstack([np.zeros((n_states,1)), res_combine["new_alphas"]]) - new_res_combine["new_p_binom"] = np.hstack([0.5 * np.ones((n_states,1)), res_combine["new_p_binom"]]) - new_res_combine["new_taus"] = np.hstack([np.zeros((n_states,1)), res_combine["new_taus"]]) - new_res_combine["log_gamma"] = np.dstack([np.zeros((n_states, n_obs, 1)), res_combine["log_gamma"]]) - new_res_combine["pred_cnv"] = np.hstack([np.zeros((n_obs,1), dtype=int), res_combine["pred_cnv"]]) - new_posterior = np.hstack([np.ones((n_spots,1)) * np.nan, posterior]) + new_res_combine["new_log_mu"] = np.hstack( + [np.zeros((n_states, 1)), res_combine["new_log_mu"]] + ) + new_res_combine["new_alphas"] = np.hstack( + [np.zeros((n_states, 1)), res_combine["new_alphas"]] + ) + new_res_combine["new_p_binom"] = np.hstack( + [0.5 * np.ones((n_states, 1)), res_combine["new_p_binom"]] + ) + new_res_combine["new_taus"] = np.hstack( + [np.zeros((n_states, 1)), res_combine["new_taus"]] + ) + new_res_combine["log_gamma"] = np.dstack( + [np.zeros((n_states, n_obs, 1)), res_combine["log_gamma"]] + ) + new_res_combine["pred_cnv"] = np.hstack( + [np.zeros((n_obs, 1), dtype=int), res_combine["pred_cnv"]] + ) + new_posterior = np.hstack([np.ones((n_spots, 1)) * np.nan, posterior]) return new_res_combine, new_posterior def reorder_results_merged(res, n_obs): n_clones = int(len(res["pred_cnv"]) / n_obs) EPS_BAF = 0.05 - pred_cnv = np.array([ res["pred_cnv"][(c*n_obs):(c*n_obs + n_obs)] for c in range(n_clones) ]).T - baf_profiles = np.array([ res["new_p_binom"][pred_cnv[:,c], 0] for c in range(n_clones) ]) - cid_normal = np.argmin(np.sum( np.maximum(np.abs(baf_profiles - 0.5)-EPS_BAF, 0), axis=1)) + pred_cnv = np.array( + [res["pred_cnv"][(c * n_obs) : (c * n_obs + n_obs)] for c in range(n_clones)] + ).T + baf_profiles = np.array( + [res["new_p_binom"][pred_cnv[:, c], 0] for c in range(n_clones)] + ) + cid_normal = np.argmin( + np.sum(np.maximum(np.abs(baf_profiles - 0.5) - EPS_BAF, 0), axis=1) + ) cid_rest = np.array([c for c in range(n_clones) if c != cid_normal]) reidx = np.append(cid_normal, cid_rest) - map_reidx = {cid:i for i,cid in enumerate(reidx)} + map_reidx = {cid: i for i, cid in enumerate(reidx)} # re-order entries in res new_res = copy.copy(res) - new_res["new_assignment"] = np.array([ map_reidx[c] for c in res["new_assignment"] ]) - new_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c*n_obs):(c*n_obs + n_obs)] for c in reidx ]) - new_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c*n_obs):(c*n_obs + n_obs)] for c in reidx ]) + new_res["new_assignment"] = np.array([map_reidx[c] for c in res["new_assignment"]]) + new_res["log_gamma"] = np.hstack( + [res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)] for c in reidx] + ) + new_res["pred_cnv"] = np.concatenate( + [res["pred_cnv"][(c * n_obs) : (c * n_obs + n_obs)] for c in reidx] + ) return new_res - + def load_hmrf_last_iteration(filename): - allres = dict( np.load(filename, allow_pickle=True) ) + allres = dict(np.load(filename, allow_pickle=True)) r = allres["num_iterations"] - 1 - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - 
"new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } if "barcodes" in allres.keys(): res["barcodes"] = allres["barcodes"] return res def load_hmrf_given_iteration(filename, r): - allres = dict( np.load(filename, allow_pickle=True) ) - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} + allres = dict(np.load(filename, allow_pickle=True)) + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } if "barcodes" in allres.keys(): res["barcodes"] = allres["barcodes"] return res -def identify_normal_spots(single_X, single_total_bb_RD, new_assignment, pred_cnv, p_binom, min_count, EPS_BAF=0.05, COUNT_QUANTILE=0.05, MIN_TOTAL=10): +def identify_normal_spots( + single_X, + single_total_bb_RD, + new_assignment, + pred_cnv, + p_binom, + min_count, + EPS_BAF=0.05, + COUNT_QUANTILE=0.05, + MIN_TOTAL=10, +): """ Attributes ---------- @@ -443,29 +582,47 @@ def identify_normal_spots(single_X, single_total_bb_RD, new_assignment, pred_cnv n_spots = single_X.shape[2] n_clones = int(len(pred_cnv) / n_obs) n_states = p_binom.shape[0] - reshaped_pred_cnv = pred_cnv.reshape((n_obs, n_clones), order='F') + reshaped_pred_cnv = pred_cnv.reshape((n_obs, n_clones), order="F") baf_profiles = p_binom[reshaped_pred_cnv, 0].T - id_nearnormal_clone = np.argmin(np.sum( np.maximum(np.abs(baf_profiles - 0.5)-EPS_BAF, 0), axis=1)) - umi_quantile = np.quantile(np.sum(single_X[:,0,:], axis=0), COUNT_QUANTILE) - + id_nearnormal_clone = np.argmin( + np.sum(np.maximum(np.abs(baf_profiles - 0.5) - EPS_BAF, 0), axis=1) + ) + umi_quantile = 
np.quantile(np.sum(single_X[:, 0, :], axis=0), COUNT_QUANTILE) + baf_deviations = np.ones(n_spots) for i in range(n_spots): - if new_assignment[i] == id_nearnormal_clone and np.sum(single_X[:,0,i]) >= umi_quantile: + if ( + new_assignment[i] == id_nearnormal_clone + and np.sum(single_X[:, 0, i]) >= umi_quantile + ): # enumerate the partition of all clones to aggregate counts, and list the BAF of each partition this_bafs = [] for c in range(n_clones): - agg_b_count = np.array([ np.sum(single_X[reshaped_pred_cnv[:,c]==s, 1, i]) for s in range(n_states) ]) - agg_t_count = np.array([ np.sum(single_total_bb_RD[reshaped_pred_cnv[:,c]==s, i]) for s in range(n_states) ]) - this_bafs.append( agg_b_count[agg_t_count>=MIN_TOTAL] / agg_t_count[agg_t_count>=MIN_TOTAL] ) + agg_b_count = np.array( + [ + np.sum(single_X[reshaped_pred_cnv[:, c] == s, 1, i]) + for s in range(n_states) + ] + ) + agg_t_count = np.array( + [ + np.sum(single_total_bb_RD[reshaped_pred_cnv[:, c] == s, i]) + for s in range(n_states) + ] + ) + this_bafs.append( + agg_b_count[agg_t_count >= MIN_TOTAL] + / agg_t_count[agg_t_count >= MIN_TOTAL] + ) this_bafs = np.concatenate(this_bafs) baf_deviations[i] = np.max(np.abs(this_bafs - 0.5)) sorted_idx = np.argsort(baf_deviations) - summed_counts = np.cumsum( np.sum(single_X[:,0,sorted_idx], axis=0) ) + summed_counts = np.cumsum(np.sum(single_X[:, 0, sorted_idx], axis=0)) n_normal = np.where(summed_counts >= min_count)[0][0] - return (baf_deviations <= baf_deviations[sorted_idx[n_normal]]) + return baf_deviations <= baf_deviations[sorted_idx[n_normal]] # def identify_loh_per_clone(single_X, new_assignment, pred_cnv, p_binom, normal_candidate, MIN_BAF_DEVIATION_RANGE=[0.25, 0.12], MIN_BINS_PER_STATE=10, MIN_BINS_ALL=50): @@ -477,7 +634,7 @@ def identify_normal_spots(single_X, single_total_bb_RD, new_assignment, pred_cnv # new_assignment : array, shape (n_spots,) # Clone assignment for each spot. - + # pred_cnv : array, shape (n_obs * n_clones) # Copy number states across bins for each clone. @@ -532,17 +689,29 @@ def identify_normal_spots(single_X, single_total_bb_RD, new_assignment, pred_cnv # Update ideas: why not finding high purity clone and loh states together by varying BAF deviation threshold? # Current we first identify high purity clone using BAF deviation threshold = 0.15, then identify loh states. # But we can vary BAF deviation threshold from the large to small, identify high purity clones and loh states based on the same threshold. -# At very large threshold value, there will be no high purity clone, which is unreasonable. +# At very large threshold value, there will be no high purity clone, which is unreasonable. # While lowering the threshold, purity clone(s) will appear, and we terminate once we are able to find one high purity clone. -# Another update idea: identification of loh states is unaware of RDR. +# Another update idea: identification of loh states is unaware of RDR. # We can first find low-copy-number loh states first by thresholding RDR. If we can't find any, increase RDR threshold. 
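# A minimal sketch of that joint search (illustrative only; the helper names below are
# hypothetical and not defined in this module):
# for threshold in np.arange(0.25, 0.11, -0.02):
#     clones_hightumor = find_high_purity_clones(k_baf_deviation, threshold)
#     loh_states = find_loh_states(p_binom, pred_cnv, rdr_values, threshold)
#     if len(clones_hightumor) > 0 and has_enough_loh_bins(loh_states, clones_hightumor):
#         break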
# """ # return loh_states, is_B_lost, rdr_values, clones_hightumor -def identify_loh_per_clone(single_X, new_assignment, pred_cnv, p_binom, normal_candidate, single_total_bb_RD, MIN_SNPUMI=10, MAX_RDR=1, MIN_BAF_DEVIATION_RANGE=[0.25, 0.12], MIN_BINS_PER_STATE=10, MIN_BINS_ALL=25): +def identify_loh_per_clone( + single_X, + new_assignment, + pred_cnv, + p_binom, + normal_candidate, + single_total_bb_RD, + MIN_SNPUMI=10, + MAX_RDR=1, + MIN_BAF_DEVIATION_RANGE=[0.25, 0.12], + MIN_BINS_PER_STATE=10, + MIN_BINS_ALL=25, +): """ Attributes ---------- @@ -551,7 +720,7 @@ def identify_loh_per_clone(single_X, new_assignment, pred_cnv, p_binom, normal_c new_assignment : array, shape (n_spots,) Clone assignment for each spot. - + pred_cnv : array, shape (n_obs * n_clones) Copy number states across bins for each clone. @@ -572,38 +741,66 @@ def identify_loh_per_clone(single_X, new_assignment, pred_cnv, p_binom, normal_c n_obs = single_X.shape[0] n_clones = int(len(pred_cnv) / n_obs) n_states = p_binom.shape[0] - reshaped_pred_cnv = pred_cnv.reshape((n_obs, n_clones), order='F') - + reshaped_pred_cnv = pred_cnv.reshape((n_obs, n_clones), order="F") + # per-state RDR values # first get the normal baseline expression per spot per bin - simple_rdr_normal = np.sum(single_X[:, 0, (normal_candidate==True)], axis=1) + simple_rdr_normal = np.sum(single_X[:, 0, (normal_candidate == True)], axis=1) simple_rdr_normal = simple_rdr_normal / np.sum(simple_rdr_normal) - simple_single_base_nb_mean = simple_rdr_normal.reshape(-1,1) @ np.sum(single_X[:,0,:], axis=0).reshape(1,-1) + simple_single_base_nb_mean = simple_rdr_normal.reshape(-1, 1) @ np.sum( + single_X[:, 0, :], axis=0 + ).reshape(1, -1) # then aggregate to clones clone_index = [np.where(new_assignment == c)[0] for c in range(n_clones)] - X, base_nb_mean, _ = merge_pseudobulk_by_index(single_X, simple_single_base_nb_mean, np.zeros(simple_single_base_nb_mean.shape), clone_index) + X, base_nb_mean, _ = merge_pseudobulk_by_index( + single_X, + simple_single_base_nb_mean, + np.zeros(simple_single_base_nb_mean.shape), + clone_index, + ) rdr_values = [] for s in np.arange(n_states): - rdr_values.append( np.sum(X[:,0,:][reshaped_pred_cnv==s]) / np.sum(base_nb_mean[reshaped_pred_cnv==s]) ) + rdr_values.append( + np.sum(X[:, 0, :][reshaped_pred_cnv == s]) + / np.sum(base_nb_mean[reshaped_pred_cnv == s]) + ) rdr_values = np.array(rdr_values) # SNP-covering UMI per clone - clone_snpumi = np.array([np.sum(single_total_bb_RD[:,new_assignment==c]) for c in range(n_clones)]) + clone_snpumi = np.array( + [np.sum(single_total_bb_RD[:, new_assignment == c]) for c in range(n_clones)] + ) # clones that have a decent tumor proportion # for each clone, if the clones_hightumor-th BAF deviation is large enough - k_baf_deviation = np.sort( np.abs(p_binom[reshaped_pred_cnv, 0]-0.5), axis=0)[-MIN_BINS_ALL,:] + k_baf_deviation = np.sort(np.abs(p_binom[reshaped_pred_cnv, 0] - 0.5), axis=0)[ + -MIN_BINS_ALL, : + ] # LOH states - for threshold in np.arange(MIN_BAF_DEVIATION_RANGE[0], MIN_BAF_DEVIATION_RANGE[1]-0.01, -0.02): - clones_hightumor = np.where( (k_baf_deviation >= threshold) & (clone_snpumi >= MIN_SNPUMI*n_obs) )[0] + for threshold in np.arange( + MIN_BAF_DEVIATION_RANGE[0], MIN_BAF_DEVIATION_RANGE[1] - 0.01, -0.02 + ): + clones_hightumor = np.where( + (k_baf_deviation >= threshold) & (clone_snpumi >= MIN_SNPUMI * n_obs) + )[0] if len(clones_hightumor) == 0: continue if len(clones_hightumor) == n_clones: clones_hightumor = np.argsort(k_baf_deviation)[1:] # LOH states - loh_states 
= np.where( (np.abs(p_binom[:,0] - 0.5) > threshold) & (np.bincount(pred_cnv, minlength=n_states) >= MIN_BINS_PER_STATE) & (rdr_values <= MAX_RDR) )[0] - is_B_lost = (p_binom[loh_states,0] < 0.5) - if np.all([ np.sum(pd.Series(reshaped_pred_cnv[:,c]).isin(loh_states)) >= MIN_BINS_ALL for c in clones_hightumor ]): + loh_states = np.where( + (np.abs(p_binom[:, 0] - 0.5) > threshold) + & (np.bincount(pred_cnv, minlength=n_states) >= MIN_BINS_PER_STATE) + & (rdr_values <= MAX_RDR) + )[0] + is_B_lost = p_binom[loh_states, 0] < 0.5 + if np.all( + [ + np.sum(pd.Series(reshaped_pred_cnv[:, c]).isin(loh_states)) + >= MIN_BINS_ALL + for c in clones_hightumor + ] + ): print(f"threshold = {threshold}") print(f"clones with high tumor proportion: {clones_hightumor}") print(f"BAF deviation threshold = {threshold}, LOH states: {loh_states}") @@ -623,7 +820,18 @@ def identify_loh_per_clone(single_X, new_assignment, pred_cnv, p_binom, normal_c return loh_states, is_B_lost, rdr_values[loh_states], clones_hightumor -def estimator_tumor_proportion(single_X, single_total_bb_RD, assignments, pred_cnv, loh_states, is_B_lost, rdr_values, clone_to_consider, smooth_mat=None, MIN_TOTAL=10): +def estimator_tumor_proportion( + single_X, + single_total_bb_RD, + assignments, + pred_cnv, + loh_states, + is_B_lost, + rdr_values, + clone_to_consider, + smooth_mat=None, + MIN_TOTAL=10, +): """ Attributes ---------- @@ -633,12 +841,12 @@ def estimator_tumor_proportion(single_X, single_total_bb_RD, assignments, pred_c single_total_bb_RD : array, shape (n_obs, n_spots) Total allele count per bin per spot. - assignments : pd.DataFrame of size n_spots with columns "coarse", "combined" + assignments : pd.DataFrame of size n_spots with columns "coarse", "combined" Clone assignment for each spot. pred_cnv : array, shape (n_obs * n_clones) Copy number states across bins for each clone. - + loh_states, is_B_lost, rdr_values: array Copy number states and RDR values corresponding to LOH. @@ -646,29 +854,40 @@ def estimator_tumor_proportion(single_X, single_total_bb_RD, assignments, pred_c ---------- 0.5 ( 1-theta ) / (theta * RDR + 1 - theta) = B_count / Total_count for each LOH state. 
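    For reference, rearranging the relation above for the tumor proportion theta gives

        theta = (Total_count / 2 - B_count) / (Total_count / 2 + (RDR - 1) * B_count),

    which is the closed-form target of the commented-out least-squares estimate_purity
    below; the active estimate_purity fits the same relation with the BAF_Binom model
    (exposure = Total_count, offset = log(RDR)) and recovers theta as
    1 / (1 + exp(fitted coefficient)).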
""" + # def estimate_purity(T_loh, B_loh, rdr_values): # features =(T_loh / 2.0 + rdr_values * B_loh - B_loh)[T_loh>0].reshape(-1,1) # y = (T_loh / 2.0 - B_loh)[T_loh>0] # return np.linalg.lstsq(features, y, rcond=None)[0] def estimate_purity(T_loh, B_loh, rdr_values): idx = np.where(T_loh > 0)[0] - model = BAF_Binom(endog=B_loh[idx], exog=np.ones((len(idx),1)), weights=np.ones(len(idx)), exposure=T_loh[idx], offset=np.log(rdr_values[idx]), scaling=0.5) + model = BAF_Binom( + endog=B_loh[idx], + exog=np.ones((len(idx), 1)), + weights=np.ones(len(idx)), + exposure=T_loh[idx], + offset=np.log(rdr_values[idx]), + scaling=0.5, + ) res = model.fit(disp=False) return 1.0 / (1.0 + np.exp(res.params)) + # n_obs = single_X.shape[0] n_spots = single_X.shape[2] n_clones = int(len(pred_cnv) / n_obs) - reshaped_pred_cnv = pred_cnv.reshape((n_obs, n_clones), order='F') + reshaped_pred_cnv = pred_cnv.reshape((n_obs, n_clones), order="F") - clone_mapping = assignments.groupby(['coarse', 'combined']).agg('first').reset_index() + clone_mapping = ( + assignments.groupby(["coarse", "combined"]).agg("first").reset_index() + ) tumor_proportion = np.zeros(n_spots) full_tumor_proportion = np.zeros((n_spots, n_clones)) for i in range(n_spots): # get adjacent spots for smoothing if smooth_mat is not None: - idx_adj = smooth_mat[i,:].nonzero()[1] + idx_adj = smooth_mat[i, :].nonzero()[1] else: idx_adj = np.array([i]) estimation_based_on_clones_single = np.ones(n_clones) * np.nan @@ -677,26 +896,74 @@ def estimate_purity(T_loh, B_loh, rdr_values): summed_T_smoothed = np.ones(n_clones) for c in clone_to_consider: # single - B_loh = np.array([ np.sum(single_X[:,1,i][reshaped_pred_cnv[:,c]==s]) if is_B_lost[j] else np.sum(single_total_bb_RD[:,i][reshaped_pred_cnv[:,c]==s]) - np.sum(single_X[:,1,i][reshaped_pred_cnv[:,c]==s]) for j,s in enumerate(loh_states)]) - T_loh = np.array([ np.sum(single_total_bb_RD[:,i][reshaped_pred_cnv[:,c]==s]) for s in loh_states]) + B_loh = np.array( + [ + ( + np.sum(single_X[:, 1, i][reshaped_pred_cnv[:, c] == s]) + if is_B_lost[j] + else np.sum( + single_total_bb_RD[:, i][reshaped_pred_cnv[:, c] == s] + ) + - np.sum(single_X[:, 1, i][reshaped_pred_cnv[:, c] == s]) + ) + for j, s in enumerate(loh_states) + ] + ) + T_loh = np.array( + [ + np.sum(single_total_bb_RD[:, i][reshaped_pred_cnv[:, c] == s]) + for s in loh_states + ] + ) if np.all(T_loh == 0): continue - estimation_based_on_clones_single[c] = estimate_purity(T_loh, B_loh, rdr_values) + estimation_based_on_clones_single[c] = estimate_purity( + T_loh, B_loh, rdr_values + ) summed_T_single[c] = np.sum(T_loh) # smoothed - B_loh = np.array([ np.sum(single_X[:,1,idx_adj][reshaped_pred_cnv[:,c]==s]) if is_B_lost[j] else np.sum(single_total_bb_RD[:,idx_adj][reshaped_pred_cnv[:,c]==s]) - np.sum(single_X[:,1,idx_adj][reshaped_pred_cnv[:,c]==s]) for j,s in enumerate(loh_states)]) - T_loh = np.array([ np.sum(single_total_bb_RD[:,idx_adj][reshaped_pred_cnv[:,c]==s]) for s in loh_states]) + B_loh = np.array( + [ + ( + np.sum(single_X[:, 1, idx_adj][reshaped_pred_cnv[:, c] == s]) + if is_B_lost[j] + else np.sum( + single_total_bb_RD[:, idx_adj][reshaped_pred_cnv[:, c] == s] + ) + - np.sum(single_X[:, 1, idx_adj][reshaped_pred_cnv[:, c] == s]) + ) + for j, s in enumerate(loh_states) + ] + ) + T_loh = np.array( + [ + np.sum(single_total_bb_RD[:, idx_adj][reshaped_pred_cnv[:, c] == s]) + for s in loh_states + ] + ) if np.all(T_loh == 0): continue - estimation_based_on_clones_smoothed[c] = estimate_purity(T_loh, B_loh, rdr_values) + 
estimation_based_on_clones_smoothed[c] = estimate_purity( + T_loh, B_loh, rdr_values + ) summed_T_smoothed[c] = np.sum(T_loh) - full_tumor_proportion[i,:] = estimation_based_on_clones_single - if (assignments.combined.values[i] in clone_to_consider) and summed_T_single[assignments.combined.values[i]] >= MIN_TOTAL: - tumor_proportion[i] = estimation_based_on_clones_single[ assignments.combined.values[i] ] - elif (assignments.combined.values[i] in clone_to_consider) and summed_T_smoothed[assignments.combined.values[i]] >= MIN_TOTAL: - tumor_proportion[i] = estimation_based_on_clones_smoothed[ assignments.combined.values[i] ] + full_tumor_proportion[i, :] = estimation_based_on_clones_single + if (assignments.combined.values[i] in clone_to_consider) and summed_T_single[ + assignments.combined.values[i] + ] >= MIN_TOTAL: + tumor_proportion[i] = estimation_based_on_clones_single[ + assignments.combined.values[i] + ] + elif ( + assignments.combined.values[i] in clone_to_consider + ) and summed_T_smoothed[assignments.combined.values[i]] >= MIN_TOTAL: + tumor_proportion[i] = estimation_based_on_clones_smoothed[ + assignments.combined.values[i] + ] elif not assignments.combined.values[i] in clone_to_consider: - tumor_proportion[i] = estimation_based_on_clones_single[np.argmax(summed_T_single)] + tumor_proportion[i] = estimation_based_on_clones_single[ + np.argmax(summed_T_single) + ] else: tumor_proportion[i] = np.nan diff --git a/src/calicost/utils_phase_switch.py b/src/calicost/utils_phase_switch.py index aed6e11..2b30fa3 100644 --- a/src/calicost/utils_phase_switch.py +++ b/src/calicost/utils_phase_switch.py @@ -15,7 +15,7 @@ def get_position_cM_table(chr_pos_vector, geneticmap_file): """ df = pd.read_csv(geneticmap_file, header=0, sep="\t") # remove chrX - df = df[df.chrom.isin( [f"chr{i}" for i in range(1,23)] )] + df = df[df.chrom.isin([f"chr{i}" for i in range(1, 23)])] # check the chromosome names if not ("chr" in str(chr_pos_vector[0][0])): df["chrom"] = [int(x[3:]) for x in df.chrom] @@ -28,22 +28,28 @@ def get_position_cM_table(chr_pos_vector, geneticmap_file): # find the centimorgan values (interpolate between (k-1)-th and k-th rows in centimorgan tables) position_cM = np.ones(len(chr_pos_vector)) * np.nan k = 0 - for i,x in enumerate(chr_pos_vector): + for i, x in enumerate(chr_pos_vector): chrname = x[0] pos = x[1] - while k < len(ref_chrom) and (ref_chrom[k] < chrname or (ref_chrom[k] == chrname and ref_pos[k] < pos)): + while k < len(ref_chrom) and ( + ref_chrom[k] < chrname or (ref_chrom[k] == chrname and ref_pos[k] < pos) + ): k += 1 if k < len(ref_chrom) and ref_chrom[k] == chrname and ref_pos[k] >= pos: - if k > 0 and ref_chrom[k-1] == chrname: - position_cM[i] = ref_cm[k-1] + (pos - ref_pos[k-1]) / (ref_pos[k] - ref_pos[k-1]) * (ref_cm[k] - ref_cm[k-1]) + if k > 0 and ref_chrom[k - 1] == chrname: + position_cM[i] = ref_cm[k - 1] + (pos - ref_pos[k - 1]) / ( + ref_pos[k] - ref_pos[k - 1] + ) * (ref_cm[k] - ref_cm[k - 1]) else: position_cM[i] = (pos - 0) / (ref_pos[k] - 0) * (ref_cm[k] - 0) else: - position_cM[i] = ref_cm[k-1] + position_cM[i] = ref_cm[k - 1] return position_cM -def compute_phase_switch_probability_position(position_cM, chr_pos_vector, nu = 1, min_prob=1e-20): +def compute_phase_switch_probability_position( + position_cM, chr_pos_vector, nu=1, min_prob=1e-20 +): """ Attributes ---------- @@ -54,9 +60,13 @@ def compute_phase_switch_probability_position(position_cM, chr_pos_vector, nu = list of (chr, pos) pairs of SNPs. It is used to identify start of a new chr. 
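    Returns
    ----------
    phase_switch_prob : array, shape (len(position_cM),)
        Probability of a phase switch between each SNP and the next, derived from the
        genetic distance d = cm_next - cm between them; entries remain at the small
        floor value when either SNP has no genetic-map position or the pair spans a
        chromosome boundary.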
""" phase_switch_prob = np.ones(len(position_cM)) * 1e-20 - for i,cm in enumerate(position_cM[:-1]): - cm_next = position_cM[i+1] - if np.isnan(cm) or np.isnan(cm_next) or chr_pos_vector[i][0] != chr_pos_vector[i+1][0]: + for i, cm in enumerate(position_cM[:-1]): + cm_next = position_cM[i + 1] + if ( + np.isnan(cm) + or np.isnan(cm_next) + or chr_pos_vector[i][0] != chr_pos_vector[i + 1][0] + ): continue assert cm <= cm_next d = cm_next - cm @@ -70,25 +80,42 @@ def duplicate_RD(chr_baf, pos_baf, chr_rd, start_rd, end_rd, tumor_rd, normal_rd normal_reads = np.ones(len(chr_baf)) * np.nan idx = 0 for i in range(len(chr_baf)): - while idx < len(chr_rd) and (chr_rd[idx] < chr_baf[i] or (chr_rd[idx] == chr_baf[i] and end_rd[idx] < pos_baf[i])): + while idx < len(chr_rd) and ( + chr_rd[idx] < chr_baf[i] + or (chr_rd[idx] == chr_baf[i] and end_rd[idx] < pos_baf[i]) + ): idx += 1 - if idx < len(chr_rd) and chr_rd[idx] == chr_baf[i] and end_rd[idx] >= pos_baf[i] and start_rd[idx] <= pos_baf[i]: + if ( + idx < len(chr_rd) + and chr_rd[idx] == chr_baf[i] + and end_rd[idx] >= pos_baf[i] + and start_rd[idx] <= pos_baf[i] + ): tumor_reads[i] = tumor_rd[idx] normal_reads[i] = normal_rd[idx] return tumor_reads, normal_reads -def generate_input_from_HATCHet(hatchetdir, output_picklefile, rdrfile="abin/bulk.bb", baffile="baf/bulk.1bed", phasefile="phase/phased.vcf.gz", with_chr_prefix=True): +def generate_input_from_HATCHet( + hatchetdir, + output_picklefile, + rdrfile="abin/bulk.bb", + baffile="baf/bulk.1bed", + phasefile="phase/phased.vcf.gz", + with_chr_prefix=True, +): if with_chr_prefix: unique_chrs = [f"chr{i}" for i in range(1, 23)] else: unique_chrs = np.arange(1, 23) - + ### load hatchet outputs ### if Path(output_picklefile).exists(): # RDR file df_all = pd.read_csv(f"{hatchetdir}/{rdrfile}", header=0, sep="\t") - df_all.iloc[:,0] = pd.Categorical(df_all.iloc[:,0], categories=unique_chrs, ordered=True) + df_all.iloc[:, 0] = pd.Categorical( + df_all.iloc[:, 0], categories=unique_chrs, ordered=True + ) df_all.sort_values(by=["#CHR", "START"], inplace=True) # samples unique_samples = np.unique(df_all["SAMPLE"]) @@ -97,29 +124,61 @@ def generate_input_from_HATCHet(hatchetdir, output_picklefile, rdrfile="abin/bul else: # RDR file df_all = pd.read_csv(f"{hatchetdir}/{rdrfile}", header=0, sep="\t") - df_all.iloc[:,0] = pd.Categorical(df_all.iloc[:,0], categories=unique_chrs, ordered=True) + df_all.iloc[:, 0] = pd.Categorical( + df_all.iloc[:, 0], categories=unique_chrs, ordered=True + ) df_all.sort_values(by=["#CHR", "START"], inplace=True) # samples unique_samples = np.unique(df_all["SAMPLE"]) + # allele counts for individual SNPs def load_shared_BAF(hatchetdir, baffile, unique_chrs, unique_samples): - tmpdf = pd.read_csv(f"{hatchetdir}/{baffile}", header=None, sep="\t", names=["CHR", "POS", "SAMPLE", "REF", "ALT"]) + tmpdf = pd.read_csv( + f"{hatchetdir}/{baffile}", + header=None, + sep="\t", + names=["CHR", "POS", "SAMPLE", "REF", "ALT"], + ) df_baf = [] for chrname in unique_chrs: tmp = tmpdf[tmpdf.CHR == chrname] - list_pos = [set(list(tmp[tmp["SAMPLE"] == s].POS)) for s in unique_samples] # SNP set of each individual sample - shared_pos = set.intersection(*list_pos) # SNPs that are shared across samples - index = np.array([i for i in range(tmp.shape[0]) if tmp.iloc[i,1] in shared_pos]) - tmp = tmp.iloc[index,:] + list_pos = [ + set(list(tmp[tmp["SAMPLE"] == s].POS)) for s in unique_samples + ] # SNP set of each individual sample + shared_pos = set.intersection( + *list_pos + ) # SNPs that are shared 
across samples + index = np.array( + [i for i in range(tmp.shape[0]) if tmp.iloc[i, 1] in shared_pos] + ) + tmp = tmp.iloc[index, :] tmp.sort_values(by=["POS", "SAMPLE"], inplace=True) - df_baf.append( tmp ) + df_baf.append(tmp) df_baf = pd.concat(df_baf, ignore_index=True) return df_baf + df_baf = load_shared_BAF(hatchetdir, baffile, unique_chrs, unique_samples) # reference-based phasing results - df_phase = pd.read_csv(f"{hatchetdir}/{phasefile}", comment="#", sep="\t", \ - names=["CHR", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLENAME"]) - df_phase = df_phase[(df_phase.SAMPLENAME=="0|1") | (df_phase.SAMPLENAME=="1|0")] + df_phase = pd.read_csv( + f"{hatchetdir}/{phasefile}", + comment="#", + sep="\t", + names=[ + "CHR", + "POS", + "ID", + "REF", + "ALT", + "QUAL", + "FILTER", + "INFO", + "FORMAT", + "SAMPLENAME", + ], + ) + df_phase = df_phase[ + (df_phase.SAMPLENAME == "0|1") | (df_phase.SAMPLENAME == "1|0") + ] print("HATCHet dataframes loaded.") ### gather phased BAF info ### @@ -127,13 +186,17 @@ def load_shared_BAF(hatchetdir, baffile, unique_chrs, unique_samples): for chrname in unique_chrs: tmpdf_baf = df_baf[df_baf.CHR == chrname] tmpdf_phase = df_phase[df_phase.CHR == chrname][["POS", "SAMPLENAME"]] - tmpdf_baf = tmpdf_baf.join( tmpdf_phase.set_index("POS"), on="POS") + tmpdf_baf = tmpdf_baf.join(tmpdf_phase.set_index("POS"), on="POS") tmpdf_baf = tmpdf_baf[~tmpdf_baf.SAMPLENAME.isnull()] - tmpdf_baf["B_count"] = np.where(tmpdf_baf.SAMPLENAME=="0|1", tmpdf_baf.REF, tmpdf_baf.ALT) + tmpdf_baf["B_count"] = np.where( + tmpdf_baf.SAMPLENAME == "0|1", tmpdf_baf.REF, tmpdf_baf.ALT + ) tmpdf_baf["DP"] = tmpdf_baf.REF + tmpdf_baf.ALT - df_combined_baf.append( tmpdf_baf ) + df_combined_baf.append(tmpdf_baf) df_combined_baf = pd.concat(df_combined_baf, ignore_index=True) - df_combined_baf.iloc[:,0] = pd.Categorical(df_combined_baf.CHR, categories=unique_chrs, ordered=True) + df_combined_baf.iloc[:, 0] = pd.Categorical( + df_combined_baf.CHR, categories=unique_chrs, ordered=True + ) df_combined_baf.sort_values(by=["CHR", "POS"], inplace=True) df_baf = df_combined_baf @@ -143,51 +206,81 @@ def load_shared_BAF(hatchetdir, baffile, unique_chrs, unique_samples): for s in unique_samples: index = np.where(df_baf["SAMPLE"] == s)[0] index_rd = np.where(df_all["SAMPLE"] == s)[0] - tumor_reads, normal_reads = duplicate_RD(np.array(df_baf.iloc[index,:].CHR.cat.codes), np.array(df_baf.iloc[index,:].POS), \ - np.array(df_all.iloc[index_rd,0].cat.codes), np.array(df_all.iloc[index_rd,:].START), np.array(df_all.iloc[index_rd,:].END), \ - np.array(df_all.iloc[index_rd,:].TOTAL_READS), np.array(df_all.iloc[index_rd,:].NORMAL_READS)) + tumor_reads, normal_reads = duplicate_RD( + np.array(df_baf.iloc[index, :].CHR.cat.codes), + np.array(df_baf.iloc[index, :].POS), + np.array(df_all.iloc[index_rd, 0].cat.codes), + np.array(df_all.iloc[index_rd, :].START), + np.array(df_all.iloc[index_rd, :].END), + np.array(df_all.iloc[index_rd, :].TOTAL_READS), + np.array(df_all.iloc[index_rd, :].NORMAL_READS), + ) df_baf.iloc[index, -2] = tumor_reads df_baf.iloc[index, -1] = normal_reads + # remove SNP positions with TOTAL_READS=NAN (if NAN occurs in one sample, remove the corresponding SNPs for the other samples too) def remove_nan_RD(df_baf): - idx_nan = np.where(np.logical_or( df_baf.TOTAL_READS.isnull(), df_baf.NORMAL_READS.isnull() ))[0] + idx_nan = np.where( + np.logical_or(df_baf.TOTAL_READS.isnull(), df_baf.NORMAL_READS.isnull()) + )[0] chr = np.array(df_baf.CHR) pos = np.array(df_baf.POS) 
chr_pos = np.array([f"{chr[i]}_{pos[i]}" for i in range(len(chr))]) nan_chr_pos = set(list(chr_pos[idx_nan])) - idx_remain = np.array([i for i,snpid in enumerate(chr_pos) if not (snpid in nan_chr_pos)]) + idx_remain = np.array( + [i for i, snpid in enumerate(chr_pos) if not (snpid in nan_chr_pos)] + ) df_baf = df_baf.iloc[idx_remain, :] return df_baf + df_baf = remove_nan_RD(df_baf) df_baf.to_pickle(output_picklefile) print("SNP-level BAF and bin-level RDR paired up.") ### from BAF, RDR table, generate HMM input ### - lengths = np.array([ np.sum(np.logical_and(df_baf["CHR"]==chrname, df_baf["SAMPLE"]==unique_samples[0])) for chrname in unique_chrs ]) + lengths = np.array( + [ + np.sum( + np.logical_and( + df_baf["CHR"] == chrname, df_baf["SAMPLE"] == unique_samples[0] + ) + ) + for chrname in unique_chrs + ] + ) - X = np.zeros(( np.sum(lengths), 2, len(unique_samples) )) - base_nb_mean = np.zeros((np.sum(lengths), len(unique_samples) )) - total_bb_RD = np.zeros((np.sum(lengths), len(unique_samples) )) + X = np.zeros((np.sum(lengths), 2, len(unique_samples))) + base_nb_mean = np.zeros((np.sum(lengths), len(unique_samples))) + total_bb_RD = np.zeros((np.sum(lengths), len(unique_samples))) - for k,s in enumerate(unique_samples): + for k, s in enumerate(unique_samples): df = df_baf[df_baf["SAMPLE"] == s] - X[:,0,k] = df.TOTAL_READS - X[:,1,k] = df.B_count + X[:, 0, k] = df.TOTAL_READS + X[:, 1, k] = df.B_count - total_bb_RD[:,k] = np.array(df.DP) + total_bb_RD[:, k] = np.array(df.DP) df2 = df_all[df_all["SAMPLE"] == s] - base_nb_mean[:,k] = np.array(df.NORMAL_READS / np.sum(df2.NORMAL_READS) * np.sum(df2.TOTAL_READS)) + base_nb_mean[:, k] = np.array( + df.NORMAL_READS / np.sum(df2.NORMAL_READS) * np.sum(df2.TOTAL_READS) + ) # site-wise transition matrix - chr_pos_vector = [(df_baf.CHR.iloc[i], df_baf.POS.iloc[i]) for i in np.where(df_baf["SAMPLE"]==unique_samples[0])[0]] + chr_pos_vector = [ + (df_baf.CHR.iloc[i], df_baf.POS.iloc[i]) + for i in np.where(df_baf["SAMPLE"] == unique_samples[0])[0] + ] position_cM = get_position_cM_table(chr_pos_vector) - phase_switch_prob = compute_phase_switch_probability_position(position_cM, chr_pos_vector) + phase_switch_prob = compute_phase_switch_probability_position( + position_cM, chr_pos_vector + ) log_sitewise_transmat = np.log(phase_switch_prob) return X, lengths, base_nb_mean, total_bb_RD, log_sitewise_transmat -def distance_between_p_binom(state_pred1, clone_pred1, p_binom1, state_pred2, clone_pred2, p_binom2): +def distance_between_p_binom( + state_pred1, clone_pred1, p_binom1, state_pred2, clone_pred2, p_binom2 +): import networkx as nx # matching predicted CNV states @@ -201,7 +294,22 @@ def distance_between_p_binom(state_pred1, clone_pred1, p_binom1, state_pred2, cl # tmp = nx.max_weight_matching(G) # state_matching = {x[0]:x[1] for x in tmp} # state_matching.update( {x[1]:x[0] for x in tmp} ) - G.add_weighted_edges_from( [(f"A{i}", f"B{j}", len(state_pred1) - np.sum(np.logical_and(state_pred1==uniq_pred1[i], state_pred2==uniq_pred2[j]))) for i in uniq_pred1 for j in uniq_pred2] ) + G.add_weighted_edges_from( + [ + ( + f"A{i}", + f"B{j}", + len(state_pred1) + - np.sum( + np.logical_and( + state_pred1 == uniq_pred1[i], state_pred2 == uniq_pred2[j] + ) + ), + ) + for i in uniq_pred1 + for j in uniq_pred2 + ] + ) state_matching = nx.bipartite.minimum_weight_full_matching(G) # matching predicted clones @@ -215,16 +323,38 @@ def distance_between_p_binom(state_pred1, clone_pred1, p_binom1, state_pred2, cl # tmp = nx.max_weight_matching(G) # 
clone_matching = {x[0]:x[1] for x in tmp} # clone_matching.update( {x[1]:x[0] for x in tmp} ) - G.add_weighted_edges_from( [(f"A{i}", f"B{j}", len(clone_pred1) - np.sum(np.logical_and(clone_pred1==uniq_pred1[i], clone_pred2==uniq_pred2[j]))) for i in uniq_pred1 for j in uniq_pred2] ) + G.add_weighted_edges_from( + [ + ( + f"A{i}", + f"B{j}", + len(clone_pred1) + - np.sum( + np.logical_and( + clone_pred1 == uniq_pred1[i], clone_pred2 == uniq_pred2[j] + ) + ), + ) + for i in uniq_pred1 + for j in uniq_pred2 + ] + ) clone_matching = nx.bipartite.minimum_weight_full_matching(G) # l2 distance between corresponding CNV at corresponding clone # reorder p_binom2 based on state_matching and clone_matching - reorder_p_binom2 = p_binom2[:, np.array([ int(clone_matching[f"A{i}"][1:]) for i in range(n_clones)])] - reorder_p_binom2 = reorder_p_binom2[np.array([ int(state_matching[f"A{i}"][1:]) for i in range(n_states) ]), :] + reorder_p_binom2 = p_binom2[ + :, np.array([int(clone_matching[f"A{i}"][1:]) for i in range(n_clones)]) + ] + reorder_p_binom2 = reorder_p_binom2[ + np.array([int(state_matching[f"A{i}"][1:]) for i in range(n_states)]), : + ] l2 = 0 for i in range(p_binom1.shape[0]): - l2 += min( np.sum(np.square(p_binom1[i,:] - reorder_p_binom2[i,:])), np.sum(np.square(p_binom1[i,:] - 1 + reorder_p_binom2[i,:])) ) + l2 += min( + np.sum(np.square(p_binom1[i, :] - reorder_p_binom2[i, :])), + np.sum(np.square(p_binom1[i, :] - 1 + reorder_p_binom2[i, :])), + ) return l2 @@ -235,14 +365,14 @@ def get_intervals(pred_cnv): while s < len(pred_cnv): t = np.where(pred_cnv[s:] != pred_cnv[s])[0] if len(t) == 0: - intervals.append( (s, len(pred_cnv)) ) - labs.append( pred_cnv[s] ) + intervals.append((s, len(pred_cnv))) + labs.append(pred_cnv[s]) s = len(pred_cnv) else: t = t[0] - intervals.append( (s,s+t) ) - labs.append( pred_cnv[s] ) - s = s+t + intervals.append((s, s + t)) + labs.append(pred_cnv[s]) + s = s + t return intervals, labs @@ -256,14 +386,14 @@ def get_intervals_nd(pred_cnv): while s < len(pred_cnv): t = np.where(np.any(pred_cnv[s:] != pred_cnv[s], axis=1))[0] if len(t) == 0: - intervals.append( (s, len(pred_cnv)) ) - labs.append( pred_cnv[s] ) + intervals.append((s, len(pred_cnv))) + labs.append(pred_cnv[s]) s = len(pred_cnv) else: t = t[0] - intervals.append( (s,s+t) ) - labs.append( pred_cnv[s] ) - s = s+t + intervals.append((s, s + t)) + labs.append(pred_cnv[s]) + s = s + t return intervals, labs @@ -276,14 +406,14 @@ def postbinning_forvisual(X, base_nb_mean, total_bb_RD, lengths, res, binsize=2) nextlen = lengths[chrname] s = 0 while s < X.shape[0]: - t = min(s+binsize, nextlen) - intervals.append( [s,t] ) + t = min(s + binsize, nextlen) + intervals.append([s, t]) s = t if s >= nextlen: if s < X.shape[0]: chrname += 1 nextlen += lengths[chrname] - bin_lengths.append( len(intervals) ) + bin_lengths.append(len(intervals)) bin_lengths = np.array(bin_lengths) bin_lengths[1:] = bin_lengths[1:] - bin_lengths[:-1] @@ -295,11 +425,21 @@ def postbinning_forvisual(X, base_nb_mean, total_bb_RD, lengths, res, binsize=2) bin_total_bb_RD = np.zeros((len(intervals), total_bb_RD.shape[1]), dtype=int) bin_pred_cnv = np.zeros(len(intervals), dtype=int) for i, intvl in enumerate(intervals): - s,t = intvl - bin_X[i,0,:] = np.sum(X[s:t, 0,:], axis=0) - bin_X[i,1,:] = np.sum( phase_prob[s:t].dot(X[s:t, 1,:]) + (1-phase_prob[s:t]).dot(total_bb_RD[s:t,:] - X[s:t,1,:]) ) - bin_base_nb_mean[i,:] = np.sum(base_nb_mean[s:t,:], axis=0) - bin_total_bb_RD[i,:] = np.sum(total_bb_RD[s:t,:], axis=0) + s, t = intvl + 
bin_X[i, 0, :] = np.sum(X[s:t, 0, :], axis=0) + bin_X[i, 1, :] = np.sum( + phase_prob[s:t].dot(X[s:t, 1, :]) + + (1 - phase_prob[s:t]).dot(total_bb_RD[s:t, :] - X[s:t, 1, :]) + ) + bin_base_nb_mean[i, :] = np.sum(base_nb_mean[s:t, :], axis=0) + bin_total_bb_RD[i, :] = np.sum(total_bb_RD[s:t, :], axis=0) bin_pred_cnv[i] = res["pred_cnv"][s] - - return bin_X, bin_base_nb_mean, bin_total_bb_RD, bin_pred_cnv, bin_lengths, intervals \ No newline at end of file + + return ( + bin_X, + bin_base_nb_mean, + bin_total_bb_RD, + bin_pred_cnv, + bin_lengths, + intervals, + ) diff --git a/src/calicost/utils_plotting.py b/src/calicost/utils_plotting.py index 079278a..4544e76 100644 --- a/src/calicost/utils_plotting.py +++ b/src/calicost/utils_plotting.py @@ -1,4 +1,3 @@ - import sys import argparse @@ -22,409 +21,1098 @@ def get_full_palette(): palette = {} - palette.update({(0, 0) : 'darkblue'}) - palette.update({(1, 0) : 'lightblue'}) - palette.update({(1, 1) : 'lightgray', (2, 0) : 'dimgray'}) - palette.update({(2, 1) : 'lightgoldenrodyellow', (3, 0) : 'gold'}) + palette.update({(0, 0): "darkblue"}) + palette.update({(1, 0): "lightblue"}) + palette.update({(1, 1): "lightgray", (2, 0): "dimgray"}) + palette.update({(2, 1): "lightgoldenrodyellow", (3, 0): "gold"}) # palette.update({(2, 1) : 'greenyellow', (3, 0) : 'darkseagreen'}) - palette.update({(2, 2) : 'navajowhite', (3, 1) : 'orange', (4, 0) : 'darkorange'}) - palette.update({(3, 2) : 'salmon', (4, 1) : 'red', (5, 0) : 'darkred'}) - palette.update({(3, 3) : 'plum', (4, 2) : 'orchid', (5, 1) : 'purple', (6, 0) : 'indigo'}) - ordered_acn = [(0, 0), (1, 0), (1, 1), (2, 0), (2, 1), (3, 0), \ - (2, 2), (3, 1), (4, 0), (3, 2), (4, 1), (5, 0), \ - (3, 3), (4, 2), (5, 1), (6, 0)] + palette.update({(2, 2): "navajowhite", (3, 1): "orange", (4, 0): "darkorange"}) + palette.update({(3, 2): "salmon", (4, 1): "red", (5, 0): "darkred"}) + palette.update( + {(3, 3): "plum", (4, 2): "orchid", (5, 1): "purple", (6, 0): "indigo"} + ) + ordered_acn = [ + (0, 0), + (1, 0), + (1, 1), + (2, 0), + (2, 1), + (3, 0), + (2, 2), + (3, 1), + (4, 0), + (3, 2), + (4, 1), + (5, 0), + (3, 3), + (4, 2), + (5, 1), + (6, 0), + ] return palette, ordered_acn -def plot_acn(cn_file, ax_handle, clone_ids=None, clone_names=None, add_chrbar=True, add_arrow=True, chrbar_thickness=0.1, add_legend=True, remove_xticks=True): +def plot_acn( + cn_file, + ax_handle, + clone_ids=None, + clone_names=None, + add_chrbar=True, + add_arrow=True, + chrbar_thickness=0.1, + add_legend=True, + remove_xticks=True, +): # full color palette - palette,_ = get_full_palette() + palette, _ = get_full_palette() # read CN profiles df_cnv = pd.read_csv(cn_file, header=0, sep="\t") - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) print(final_clone_ids) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) found = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) found += list(zip(major, minor)) found = list(set(found)) found.sort() # map CN to single 
digit number - map_cn = {x:i for i,x in enumerate(found)} + map_cn = {x: i for i, x in enumerate(found)} cnv_mapped = [] ploidy = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - cnv_mapped.append( [map_cn[(major[i], minor[i])] for i in range(len(major))] ) - ploidy.append( np.mean(major + minor) ) - cnv_mapped = pd.DataFrame( np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids]) - ploidy = pd.DataFrame(np.around(np.array(ploidy), decimals=2).reshape(-1,1), index=[f"clone {cid}" for cid in final_clone_ids]) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + cnv_mapped.append([map_cn[(major[i], minor[i])] for i in range(len(major))]) + ploidy.append(np.mean(major + minor)) + cnv_mapped = pd.DataFrame( + np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids] + ) + ploidy = pd.DataFrame( + np.around(np.array(ploidy), decimals=2).reshape(-1, 1), + index=[f"clone {cid}" for cid in final_clone_ids], + ) chr_ids = df_cnv.CHR colors = [palette[c] for c in found] if clone_ids is None: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in final_clone_ids] - rename_cnv_mapped = pd.DataFrame(cnv_mapped.values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(final_clone_ids)]) - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(final_clone_ids) + ], + ) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) else: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in clone_ids] if clone_names is None: - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ], + ) else: - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)]) - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[ + f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ], + ) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) # indicate allele switches if add_arrow: if clone_ids is None: # find regions where there exist both clones with A > B and clones with A < B - has_up = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values for cid in final_clone_ids]), axis=0) - 
has_down = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values for cid in final_clone_ids]), axis=0) - intervals, labs = get_intervals( (has_up & has_down) ) + has_up = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values + for cid in final_clone_ids + ] + ), + axis=0, + ) + has_down = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values + for cid in final_clone_ids + ] + ), + axis=0, + ) + intervals, labs = get_intervals((has_up & has_down)) # for each intervals, find the corresponding clones with A > B to plot up-arrow, and corresponding clones with A < B to plot down-arrow for i in range(len(intervals)): if not labs[i]: continue - for c,cid in enumerate(final_clone_ids): + for c, cid in enumerate(final_clone_ids): y1 = c - y2 = c+1 + y2 = c + 1 # up-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] > df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + > df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y2+0.1*y1, dx=0, dy=0.7*(y1-y2), head_width=0.3*(sub_int[1] - sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y2 + 0.1 * y1, + dx=0, + dy=0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) # down-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] < df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + < df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y1+0.1*y2, dx=0, dy=-0.7*(y1-y2), head_width=0.3*(sub_int[1]-sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y1 + 0.1 * y2, + dx=0, + dy=-0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) else: # find regions where there exist both clones with A > B and clones with A < B - has_up = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values for cid in clone_ids]), axis=0) - has_down = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values for cid in clone_ids]), axis=0) - intervals, labs = get_intervals( (has_up & has_down) ) + has_up = np.any( + 
np.vstack( + [ + df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values + for cid in clone_ids + ] + ), + axis=0, + ) + has_down = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values + for cid in clone_ids + ] + ), + axis=0, + ) + intervals, labs = get_intervals((has_up & has_down)) # for each intervals, find the corresponding clones with A > B to plot up-arrow, and corresponding clones with A < B to plot down-arrow for i in range(len(intervals)): if not labs[i]: continue - for c,cid in enumerate(clone_ids): + for c, cid in enumerate(clone_ids): y1 = c - y2 = c+1 + y2 = c + 1 # up-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] > df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + > df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y2+0.1*y1, dx=0, dy=0.7*(y1-y2), head_width=0.3*(sub_int[1] - sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y2 + 0.1 * y1, + dx=0, + dy=0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) # down-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] < df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + < df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y1+0.1*y2, dx=0, dy=-0.7*(y1-y2), head_width=0.3*(sub_int[1] - sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y1 + 0.1 * y2, + dx=0, + dy=-0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) if add_chrbar: # add chr color - chr_palette = cycle(['#525252', '#969696', '#cccccc']) - lut = {c:next(chr_palette) for c in np.unique(chr_ids.values)} + chr_palette = cycle(["#525252", "#969696", "#cccccc"]) + lut = {c: next(chr_palette) for c in np.unique(chr_ids.values)} col_colors = chr_ids.map(lut) for i, color in enumerate(col_colors): - ax_handle.add_patch(plt.Rectangle(xy=(i, 1.01), width=1, height=chrbar_thickness, color=color, lw=0, transform=ax_handle.get_xaxis_transform(), clip_on=False, rasterized=True)) + ax_handle.add_patch( + plt.Rectangle( + xy=(i, 1.01), + width=1, + height=chrbar_thickness, + color=color, + lw=0, + 
transform=ax_handle.get_xaxis_transform(), + clip_on=False, + rasterized=True, + ) + ) for c in np.unique(chr_ids.values): interval = np.where(chr_ids.values == c)[0] mid = np.percentile(interval, 45) - ax_handle.text(mid-10, 1.04, str(c), transform=ax_handle.get_xaxis_transform()) + ax_handle.text( + mid - 10, 1.04, str(c), transform=ax_handle.get_xaxis_transform() + ) ax_handle.set_yticklabels(ax_handle.get_yticklabels(), rotation=0) if remove_xticks: ax_handle.set_xticks([]) if add_legend: - a00 = plt.arrow(0,0, 0,0, color='darkblue') - a10 = plt.arrow(0,0, 0,0, color='lightblue') - a11 = plt.arrow(0,0, 0,0, color='lightgray') - a20 = plt.arrow(0,0, 0,0, color='dimgray') - a21 = plt.arrow(0,0, 0,0, color='lightgoldenrodyellow') - a30 = plt.arrow(0,0, 0,0, color='gold') - a22 = plt.arrow(0,0, 0,0, color='navajowhite') - a31 = plt.arrow(0,0, 0,0, color='orange') - a40 = plt.arrow(0,0, 0,0, color='darkorange') - a32 = plt.arrow(0,0, 0,0, color='salmon') - a41 = plt.arrow(0,0, 0,0, color='red') - a50 = plt.arrow(0,0, 0,0, color='darkred') - a33 = plt.arrow(0,0, 0,0, color='plum') - a42 = plt.arrow(0,0, 0,0, color='orchid') - a51 = plt.arrow(0,0, 0,0, color='purple') - a60 = plt.arrow(0,0, 0,0, color='indigo') - ax_handle.legend([a00, a10, a11, a20, a21, a30, a22, a31, a40, a32, a41, a50, a33, a42, a51, a60], \ - ['(0, 0)','(1, 0)','(1, 1)','(2, 0)', '(2, 1)','(3, 0)', '(2, 2)','(3, 1)','(4, 0)','(3, 2)', \ - '(4, 1)','(5, 0)', '(3, 3)','(4, 2)','(5, 1)','(6, 0)'], ncol=2, loc='upper left', bbox_to_anchor=(1,1)) + a00 = plt.arrow(0, 0, 0, 0, color="darkblue") + a10 = plt.arrow(0, 0, 0, 0, color="lightblue") + a11 = plt.arrow(0, 0, 0, 0, color="lightgray") + a20 = plt.arrow(0, 0, 0, 0, color="dimgray") + a21 = plt.arrow(0, 0, 0, 0, color="lightgoldenrodyellow") + a30 = plt.arrow(0, 0, 0, 0, color="gold") + a22 = plt.arrow(0, 0, 0, 0, color="navajowhite") + a31 = plt.arrow(0, 0, 0, 0, color="orange") + a40 = plt.arrow(0, 0, 0, 0, color="darkorange") + a32 = plt.arrow(0, 0, 0, 0, color="salmon") + a41 = plt.arrow(0, 0, 0, 0, color="red") + a50 = plt.arrow(0, 0, 0, 0, color="darkred") + a33 = plt.arrow(0, 0, 0, 0, color="plum") + a42 = plt.arrow(0, 0, 0, 0, color="orchid") + a51 = plt.arrow(0, 0, 0, 0, color="purple") + a60 = plt.arrow(0, 0, 0, 0, color="indigo") + ax_handle.legend( + [ + a00, + a10, + a11, + a20, + a21, + a30, + a22, + a31, + a40, + a32, + a41, + a50, + a33, + a42, + a51, + a60, + ], + [ + "(0, 0)", + "(1, 0)", + "(1, 1)", + "(2, 0)", + "(2, 1)", + "(3, 0)", + "(2, 2)", + "(3, 1)", + "(4, 0)", + "(3, 2)", + "(4, 1)", + "(5, 0)", + "(3, 3)", + "(4, 2)", + "(5, 1)", + "(6, 0)", + ], + ncol=2, + loc="upper left", + bbox_to_anchor=(1, 1), + ) return ax_handle -def plot_acn_from_df(df_cnv, ax_handle, clone_ids=None, clone_names=None, add_chrbar=True, add_arrow=True, chrbar_thickness=0.1, add_legend=True, remove_xticks=True, rasterized=True): +def plot_acn_from_df( + df_cnv, + ax_handle, + clone_ids=None, + clone_names=None, + add_chrbar=True, + add_arrow=True, + chrbar_thickness=0.1, + add_legend=True, + remove_xticks=True, + rasterized=True, +): # full color palette - palette,_ = get_full_palette() + palette, _ = get_full_palette() # read CN profiles - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) print(final_clone_ids) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + assert (clone_ids is None) or np.all( + [(cid in 
final_clone_ids) for cid in clone_ids] + ) found = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) found += list(zip(major, minor)) found = list(set(found)) found.sort() # map CN to single digit number - map_cn = {x:i for i,x in enumerate(found)} + map_cn = {x: i for i, x in enumerate(found)} cnv_mapped = [] ploidy = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - cnv_mapped.append( [map_cn[(major[i], minor[i])] for i in range(len(major))] ) - ploidy.append( np.mean(major + minor) ) - cnv_mapped = pd.DataFrame( np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids]) - ploidy = pd.DataFrame(np.around(np.array(ploidy), decimals=2).reshape(-1,1), index=[f"clone {cid}" for cid in final_clone_ids]) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + cnv_mapped.append([map_cn[(major[i], minor[i])] for i in range(len(major))]) + ploidy.append(np.mean(major + minor)) + cnv_mapped = pd.DataFrame( + np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids] + ) + ploidy = pd.DataFrame( + np.around(np.array(ploidy), decimals=2).reshape(-1, 1), + index=[f"clone {cid}" for cid in final_clone_ids], + ) chr_ids = df_cnv.CHR colors = [palette[c] for c in found] if clone_ids is None: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in final_clone_ids] - rename_cnv_mapped = pd.DataFrame(cnv_mapped.values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(final_clone_ids)]) - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=rasterized, ax=ax_handle) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(final_clone_ids) + ], + ) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=rasterized, + ax=ax_handle, + ) else: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in clone_ids] if clone_names is None: - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ], + ) else: - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)]) - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=rasterized, ax=ax_handle) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[ + 
f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ], + ) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=rasterized, + ax=ax_handle, + ) # indicate allele switches if add_arrow: if clone_ids is None: # find regions where there exist both clones with A > B and clones with A < B - has_up = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values for cid in final_clone_ids]), axis=0) - has_down = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values for cid in final_clone_ids]), axis=0) - intervals, labs = get_intervals( (has_up & has_down) ) + has_up = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values + for cid in final_clone_ids + ] + ), + axis=0, + ) + has_down = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values + for cid in final_clone_ids + ] + ), + axis=0, + ) + intervals, labs = get_intervals((has_up & has_down)) # for each intervals, find the corresponding clones with A > B to plot up-arrow, and corresponding clones with A < B to plot down-arrow for i in range(len(intervals)): if not labs[i]: continue - for c,cid in enumerate(final_clone_ids): + for c, cid in enumerate(final_clone_ids): y1 = c - y2 = c+1 + y2 = c + 1 # up-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] > df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + > df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y2+0.1*y1, dx=0, dy=0.7*(y1-y2), head_width=0.3*(sub_int[1] - sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y2 + 0.1 * y1, + dx=0, + dy=0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) # down-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] < df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + < df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y1+0.1*y2, dx=0, dy=-0.7*(y1-y2), head_width=0.3*(sub_int[1]-sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y1 + 0.1 * y2, 
+ dx=0, + dy=-0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) else: # find regions where there exist both clones with A > B and clones with A < B - has_up = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values for cid in clone_ids]), axis=0) - has_down = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values for cid in clone_ids]), axis=0) - intervals, labs = get_intervals( (has_up & has_down) ) + has_up = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values + for cid in clone_ids + ] + ), + axis=0, + ) + has_down = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values + for cid in clone_ids + ] + ), + axis=0, + ) + intervals, labs = get_intervals((has_up & has_down)) # for each intervals, find the corresponding clones with A > B to plot up-arrow, and corresponding clones with A < B to plot down-arrow for i in range(len(intervals)): if not labs[i]: continue - for c,cid in enumerate(clone_ids): + for c, cid in enumerate(clone_ids): y1 = c - y2 = c+1 + y2 = c + 1 # up-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] > df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + > df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y2+0.1*y1, dx=0, dy=0.7*(y1-y2), head_width=0.3*(sub_int[1] - sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y2 + 0.1 * y1, + dx=0, + dy=0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) # down-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] < df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + < df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y1+0.1*y2, dx=0, dy=-0.7*(y1-y2), head_width=0.3*(sub_int[1] - sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y1 + 0.1 * y2, + dx=0, + dy=-0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) if add_chrbar: # add chr color - chr_palette = cycle(['#525252', '#969696', '#cccccc']) - lut = 
{c:next(chr_palette) for c in np.unique(chr_ids.values)} + chr_palette = cycle(["#525252", "#969696", "#cccccc"]) + lut = {c: next(chr_palette) for c in np.unique(chr_ids.values)} col_colors = chr_ids.map(lut) for i, color in enumerate(col_colors): - ax_handle.add_patch(plt.Rectangle(xy=(i, 1 + 0.02*chrbar_thickness), width=1, height=chrbar_thickness, color=color, lw=0, transform=ax_handle.get_xaxis_transform(), clip_on=False, rasterized=rasterized)) + ax_handle.add_patch( + plt.Rectangle( + xy=(i, 1 + 0.02 * chrbar_thickness), + width=1, + height=chrbar_thickness, + color=color, + lw=0, + transform=ax_handle.get_xaxis_transform(), + clip_on=False, + rasterized=rasterized, + ) + ) for c in np.unique(chr_ids.values): interval = np.where(chr_ids.values == c)[0] mid = np.percentile(interval, 45) - ax_handle.text(mid-10, 1 + 0.2*chrbar_thickness, str(c), transform=ax_handle.get_xaxis_transform()) + ax_handle.text( + mid - 10, + 1 + 0.2 * chrbar_thickness, + str(c), + transform=ax_handle.get_xaxis_transform(), + ) ax_handle.set_yticklabels(ax_handle.get_yticklabels(), rotation=0) if remove_xticks: ax_handle.set_xticks([]) if add_legend: - a00 = plt.arrow(0,0, 0,0, color='darkblue') - a10 = plt.arrow(0,0, 0,0, color='lightblue') - a11 = plt.arrow(0,0, 0,0, color='lightgray') - a20 = plt.arrow(0,0, 0,0, color='dimgray') - a21 = plt.arrow(0,0, 0,0, color='lightgoldenrodyellow') - a30 = plt.arrow(0,0, 0,0, color='gold') - a22 = plt.arrow(0,0, 0,0, color='navajowhite') - a31 = plt.arrow(0,0, 0,0, color='orange') - a40 = plt.arrow(0,0, 0,0, color='darkorange') - a32 = plt.arrow(0,0, 0,0, color='salmon') - a41 = plt.arrow(0,0, 0,0, color='red') - a50 = plt.arrow(0,0, 0,0, color='darkred') - a33 = plt.arrow(0,0, 0,0, color='plum') - a42 = plt.arrow(0,0, 0,0, color='orchid') - a51 = plt.arrow(0,0, 0,0, color='purple') - a60 = plt.arrow(0,0, 0,0, color='indigo') - ax_handle.legend([a00, a10, a11, a20, a21, a30, a22, a31, a40, a32, a41, a50, a33, a42, a51, a60], \ - ['(0, 0)','(1, 0)','(1, 1)','(2, 0)', '(2, 1)','(3, 0)', '(2, 2)','(3, 1)','(4, 0)','(3, 2)', \ - '(4, 1)','(5, 0)', '(3, 3)','(4, 2)','(5, 1)','(6, 0)'], ncol=2, loc='upper left', bbox_to_anchor=(1,1)) + a00 = plt.arrow(0, 0, 0, 0, color="darkblue") + a10 = plt.arrow(0, 0, 0, 0, color="lightblue") + a11 = plt.arrow(0, 0, 0, 0, color="lightgray") + a20 = plt.arrow(0, 0, 0, 0, color="dimgray") + a21 = plt.arrow(0, 0, 0, 0, color="lightgoldenrodyellow") + a30 = plt.arrow(0, 0, 0, 0, color="gold") + a22 = plt.arrow(0, 0, 0, 0, color="navajowhite") + a31 = plt.arrow(0, 0, 0, 0, color="orange") + a40 = plt.arrow(0, 0, 0, 0, color="darkorange") + a32 = plt.arrow(0, 0, 0, 0, color="salmon") + a41 = plt.arrow(0, 0, 0, 0, color="red") + a50 = plt.arrow(0, 0, 0, 0, color="darkred") + a33 = plt.arrow(0, 0, 0, 0, color="plum") + a42 = plt.arrow(0, 0, 0, 0, color="orchid") + a51 = plt.arrow(0, 0, 0, 0, color="purple") + a60 = plt.arrow(0, 0, 0, 0, color="indigo") + ax_handle.legend( + [ + a00, + a10, + a11, + a20, + a21, + a30, + a22, + a31, + a40, + a32, + a41, + a50, + a33, + a42, + a51, + a60, + ], + [ + "(0, 0)", + "(1, 0)", + "(1, 1)", + "(2, 0)", + "(2, 1)", + "(3, 0)", + "(2, 2)", + "(3, 1)", + "(4, 0)", + "(3, 2)", + "(4, 1)", + "(5, 0)", + "(3, 3)", + "(4, 2)", + "(5, 1)", + "(6, 0)", + ], + ncol=2, + loc="upper left", + bbox_to_anchor=(1, 1), + ) return ax_handle -def plot_acn_from_df_anotherscheme(df_cnv, ax_handle, clone_ids=None, clone_names=None, clone_proportions=None, chrbar_pos=None, add_arrow=True, border_linewidth=1, 
chrbar_thickness=0.1, add_legend=True, remove_xticks=True, rasterized=True): +def plot_acn_from_df_anotherscheme( + df_cnv, + ax_handle, + clone_ids=None, + clone_names=None, + clone_proportions=None, + chrbar_pos=None, + add_arrow=True, + border_linewidth=1, + chrbar_thickness=0.1, + add_legend=True, + remove_xticks=True, + rasterized=True, +): # full color palette - palette,_ = get_full_palette() + palette, _ = get_full_palette() # read CN profiles - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) print(final_clone_ids) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) found = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) found += list(zip(major, minor)) found = list(set(found)) found.sort() # map CN to single digit number - map_cn = {x:i for i,x in enumerate(found)} + map_cn = {x: i for i, x in enumerate(found)} cnv_mapped = [] ploidy = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - cnv_mapped.append( [map_cn[(major[i], minor[i])] for i in range(len(major))] ) - ploidy.append( np.mean(major + minor) ) - cnv_mapped = pd.DataFrame( np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids]) - ploidy = pd.DataFrame(np.around(np.array(ploidy), decimals=2).reshape(-1,1), index=[f"clone {cid}" for cid in final_clone_ids]) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + cnv_mapped.append([map_cn[(major[i], minor[i])] for i in range(len(major))]) + ploidy.append(np.mean(major + minor)) + cnv_mapped = pd.DataFrame( + np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids] + ) + ploidy = pd.DataFrame( + np.around(np.array(ploidy), decimals=2).reshape(-1, 1), + index=[f"clone {cid}" for cid in final_clone_ids], + ) chr_ids = df_cnv.CHR colors = [palette[c] for c in found] if clone_ids is None: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in final_clone_ids] - rename_cnv_mapped = pd.DataFrame(cnv_mapped.values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(final_clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(final_clone_ids) + ], + ) if len(np.unique(rename_cnv_mapped.values)) == 1: colors = colors + colors - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=rasterized, ax=ax_handle) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=rasterized, + ax=ax_handle, + ) else: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in clone_ids] if clone_names is None: - 
index_str = [f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)] + index_str = [ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ] else: - index_str = [f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)] + index_str = [ + f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ] if not clone_proportions is None: - index_str = [f"{index_str[c]}\nu={clone_proportions[c]:.2f}" for c in range(len(clone_ids))] - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=index_str) + index_str = [ + f"{index_str[c]}\nu={clone_proportions[c]:.2f}" + for c in range(len(clone_ids)) + ] + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=index_str, + ) if len(np.unique(rename_cnv_mapped.values)) == 1: colors = colors + colors - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=rasterized, ax=ax_handle) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=rasterized, + ax=ax_handle, + ) # indicate allele switches if add_arrow: if clone_ids is None: # find regions where there exist both clones with A > B and clones with A < B - has_up = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values for cid in final_clone_ids]), axis=0) - has_down = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values for cid in final_clone_ids]), axis=0) - intervals, labs = get_intervals( (has_up & has_down) ) + has_up = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values + for cid in final_clone_ids + ] + ), + axis=0, + ) + has_down = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values + for cid in final_clone_ids + ] + ), + axis=0, + ) + intervals, labs = get_intervals((has_up & has_down)) # for each intervals, find the corresponding clones with A > B to plot up-arrow, and corresponding clones with A < B to plot down-arrow for i in range(len(intervals)): if not labs[i]: continue - for c,cid in enumerate(final_clone_ids): + for c, cid in enumerate(final_clone_ids): y1 = c - y2 = c+1 + y2 = c + 1 # up-arrow - y_diverge1 = 0.8*y2+0.2*y1 - y_diverge2 = 0.6*y2+0.4*y1 - y_merge = 0.7*y2+0.3*y1 - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] > df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + y_diverge1 = 0.8 * y2 + 0.2 * y1 + y_diverge2 = 0.6 * y2 + 0.4 * y1 + y_merge = 0.7 * y2 + 0.3 * y1 + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + > df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: # bounding box - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black", linewidth=border_linewidth) + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + linewidth=border_linewidth, + ) # arrow - ax_handle.fill_between( [intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]], [y_diverge1,y_merge], 
[y_diverge2,y_merge], color="black", edgecolor="black") + ax_handle.fill_between( + [ + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ], + [y_diverge1, y_merge], + [y_diverge2, y_merge], + color="black", + edgecolor="black", + ) # down-arrow - y_diverge1 = 0.2*y2+0.8*y1 - y_diverge2 = 0.4*y2+0.6*y1 - y_merge = 0.3*y2+0.7*y1 - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] < df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + y_diverge1 = 0.2 * y2 + 0.8 * y1 + y_diverge2 = 0.4 * y2 + 0.6 * y1 + y_merge = 0.3 * y2 + 0.7 * y1 + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + < df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: # bounding box - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black", linewidth=border_linewidth) + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + linewidth=border_linewidth, + ) # arrow - ax_handle.fill_between( [intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]], [y_merge,y_diverge1], [y_merge,y_diverge2], color="black", edgecolor="black") + ax_handle.fill_between( + [ + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ], + [y_merge, y_diverge1], + [y_merge, y_diverge2], + color="black", + edgecolor="black", + ) else: # find regions where there exist both clones with A > B and clones with A < B - has_up = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values for cid in clone_ids]), axis=0) - has_down = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values for cid in clone_ids]), axis=0) - intervals, labs = get_intervals( (has_up & has_down) ) + has_up = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values + for cid in clone_ids + ] + ), + axis=0, + ) + has_down = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values + for cid in clone_ids + ] + ), + axis=0, + ) + intervals, labs = get_intervals((has_up & has_down)) # for each intervals, find the corresponding clones with A > B to plot up-arrow, and corresponding clones with A < B to plot down-arrow for i in range(len(intervals)): if not labs[i]: continue - for c,cid in enumerate(clone_ids): + for c, cid in enumerate(clone_ids): y1 = c - y2 = c+1 + y2 = c + 1 # up-arrow - y_diverge1 = 0.8*y2+0.2*y1 - y_diverge2 = 0.6*y2+0.4*y1 - y_merge = 0.7*y2+0.3*y1 - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] > df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + y_diverge1 = 0.8 * y2 + 0.2 * y1 + y_diverge2 = 0.6 * y2 + 0.4 * y1 + y_merge = 0.7 * y2 + 0.3 * y1 + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + > df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: # bounding box - ax_handle.fill_between( [intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]], y1, y2, color="none", edgecolor="black", linewidth=border_linewidth) + ax_handle.fill_between( + [ + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ], + 
y1, + y2, + color="none", + edgecolor="black", + linewidth=border_linewidth, + ) # arrow - ax_handle.fill_between( [intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]], [y_diverge1,y_merge], [y_diverge2,y_merge], color="black", edgecolor="black") + ax_handle.fill_between( + [ + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ], + [y_diverge1, y_merge], + [y_diverge2, y_merge], + color="black", + edgecolor="black", + ) # down-arrow - y_diverge1 = 0.2*y2+0.8*y1 - y_diverge2 = 0.4*y2+0.6*y1 - y_merge = 0.3*y2+0.7*y1 - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] < df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + y_diverge1 = 0.2 * y2 + 0.8 * y1 + y_diverge2 = 0.4 * y2 + 0.6 * y1 + y_merge = 0.3 * y2 + 0.7 * y1 + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + < df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: # bounding box - ax_handle.fill_between( [intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]], y1, y2, color="none", edgecolor="black", linewidth=border_linewidth) + ax_handle.fill_between( + [ + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ], + y1, + y2, + color="none", + edgecolor="black", + linewidth=border_linewidth, + ) # arrow - ax_handle.fill_between( [intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]], [y_merge,y_diverge1], [y_merge,y_diverge2], color="black", edgecolor="black") + ax_handle.fill_between( + [ + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ], + [y_merge, y_diverge1], + [y_merge, y_diverge2], + color="black", + edgecolor="black", + ) # # horizontal separation between clones # for c,cid in enumerate(clone_ids[:-1]): @@ -435,224 +1123,506 @@ def plot_acn_from_df_anotherscheme(df_cnv, ax_handle, clone_ids=None, clone_name h = len(final_clone_ids) if clone_ids is None else len(clone_ids) # ax_handle.add_patch(plt.Rectangle(xy=(0, h + chrbar_thickness), width=df_cnv.shape[0], height=chrbar_thickness, color='white', lw=0, transform=ax_handle.transData, clip_on=False, rasterized=rasterized)) - for i,c in enumerate(np.unique(chr_ids.values)): + for i, c in enumerate(np.unique(chr_ids.values)): interval = np.where(chr_ids.values == c)[0] # add vertical separation between chromosomes if not np.max(interval) + 1 >= df_cnv.shape[0]: - ax_handle.axvline(x=np.max(interval), color='black', lw=0.5, ymin=-0.5/(h+1), clip_on = False) + ax_handle.axvline( + x=np.max(interval), + color="black", + lw=0.5, + ymin=-0.5 / (h + 1), + clip_on=False, + ) mid = np.percentile(interval, 45) if i % 2 == 0: - ax_handle.text(mid, h + chrbar_thickness, str(c), ha='center', transform=ax_handle.transData) + ax_handle.text( + mid, + h + chrbar_thickness, + str(c), + ha="center", + transform=ax_handle.transData, + ) else: - ax_handle.text(mid, h + 2*chrbar_thickness, str(c), ha='center', transform=ax_handle.transData) + ax_handle.text( + mid, + h + 2 * chrbar_thickness, + str(c), + ha="center", + transform=ax_handle.transData, + ) elif chrbar_pos == "top": chr_ids = df_cnv.CHR h = len(final_clone_ids) if clone_ids is None else len(clone_ids) # ax_handle.add_patch(plt.Rectangle(xy=(0, h + chrbar_thickness), width=df_cnv.shape[0], height=chrbar_thickness, color='white', lw=0, transform=ax_handle.transData, clip_on=False, rasterized=rasterized)) - for i,c in enumerate(np.unique(chr_ids.values)): + for i, c in 
enumerate(np.unique(chr_ids.values)): interval = np.where(chr_ids.values == c)[0] # add vertical separation between chromosomes if not np.max(interval) + 1 >= df_cnv.shape[0]: - ax_handle.axvline(x=np.max(interval), color='black', lw=0.5, ymax=1+0.5/(h+1), clip_on = False) + ax_handle.axvline( + x=np.max(interval), + color="black", + lw=0.5, + ymax=1 + 0.5 / (h + 1), + clip_on=False, + ) mid = np.percentile(interval, 45) if i % 2 == 0: - ax_handle.text(mid, -0.1*chrbar_thickness, str(c), ha='center', transform=ax_handle.transData) + ax_handle.text( + mid, + -0.1 * chrbar_thickness, + str(c), + ha="center", + transform=ax_handle.transData, + ) else: - ax_handle.text(mid, -0.8*chrbar_thickness, str(c), ha='center', transform=ax_handle.transData) + ax_handle.text( + mid, + -0.8 * chrbar_thickness, + str(c), + ha="center", + transform=ax_handle.transData, + ) ax_handle.set_yticklabels(ax_handle.get_yticklabels(), rotation=0) if remove_xticks: ax_handle.set_xticks([]) if add_legend: - a00 = plt.arrow(0,0, 0,0, color='darkblue') - a10 = plt.arrow(0,0, 0,0, color='lightblue') - a11 = plt.arrow(0,0, 0,0, color='lightgray') - a20 = plt.arrow(0,0, 0,0, color='dimgray') - a21 = plt.arrow(0,0, 0,0, color='lightgoldenrodyellow') - a30 = plt.arrow(0,0, 0,0, color='gold') - a22 = plt.arrow(0,0, 0,0, color='navajowhite') - a31 = plt.arrow(0,0, 0,0, color='orange') - a40 = plt.arrow(0,0, 0,0, color='darkorange') - a32 = plt.arrow(0,0, 0,0, color='salmon') - a41 = plt.arrow(0,0, 0,0, color='red') - a50 = plt.arrow(0,0, 0,0, color='darkred') - a33 = plt.arrow(0,0, 0,0, color='plum') - a42 = plt.arrow(0,0, 0,0, color='orchid') - a51 = plt.arrow(0,0, 0,0, color='purple') - a60 = plt.arrow(0,0, 0,0, color='indigo') - ax_handle.legend([a00, a10, a11, a20, a21, a30, a22, a31, a40, a32, a41, a50, a33, a42, a51, a60], \ - ['(0, 0)','(1, 0)','(1, 1)','(2, 0)', '(2, 1)','(3, 0)', '(2, 2)','(3, 1)','(4, 0)','(3, 2)', \ - '(4, 1)','(5, 0)', '(3, 3)','(4, 2)','(5, 1)','(6, 0)'], ncol=2, loc='upper left', bbox_to_anchor=(1,1)) + a00 = plt.arrow(0, 0, 0, 0, color="darkblue") + a10 = plt.arrow(0, 0, 0, 0, color="lightblue") + a11 = plt.arrow(0, 0, 0, 0, color="lightgray") + a20 = plt.arrow(0, 0, 0, 0, color="dimgray") + a21 = plt.arrow(0, 0, 0, 0, color="lightgoldenrodyellow") + a30 = plt.arrow(0, 0, 0, 0, color="gold") + a22 = plt.arrow(0, 0, 0, 0, color="navajowhite") + a31 = plt.arrow(0, 0, 0, 0, color="orange") + a40 = plt.arrow(0, 0, 0, 0, color="darkorange") + a32 = plt.arrow(0, 0, 0, 0, color="salmon") + a41 = plt.arrow(0, 0, 0, 0, color="red") + a50 = plt.arrow(0, 0, 0, 0, color="darkred") + a33 = plt.arrow(0, 0, 0, 0, color="plum") + a42 = plt.arrow(0, 0, 0, 0, color="orchid") + a51 = plt.arrow(0, 0, 0, 0, color="purple") + a60 = plt.arrow(0, 0, 0, 0, color="indigo") + ax_handle.legend( + [ + a00, + a10, + a11, + a20, + a21, + a30, + a22, + a31, + a40, + a32, + a41, + a50, + a33, + a42, + a51, + a60, + ], + [ + "(0, 0)", + "(1, 0)", + "(1, 1)", + "(2, 0)", + "(2, 1)", + "(3, 0)", + "(2, 2)", + "(3, 1)", + "(4, 0)", + "(3, 2)", + "(4, 1)", + "(5, 0)", + "(3, 3)", + "(4, 2)", + "(5, 1)", + "(6, 0)", + ], + ncol=2, + loc="upper left", + bbox_to_anchor=(1, 1), + ) return ax_handle - def plot_acn_legend(fig, shift_y=0.3): # full palette palette, ordered_acn = get_full_palette() - map_cn = {x:i for i,x in enumerate(ordered_acn)} + map_cn = {x: i for i, x in enumerate(ordered_acn)} colors = [palette[c] for c in ordered_acn] - cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)) + cmap = 
LinearSegmentedColormap.from_list("multi-level", colors, len(colors)) - n_total_cn = np.max([x[0]+x[1] for x in ordered_acn]) + 1 - gs = GridSpec(2*n_total_cn-1, 1, figure=fig) + n_total_cn = np.max([x[0] + x[1] for x in ordered_acn]) + 1 + gs = GridSpec(2 * n_total_cn - 1, 1, figure=fig) # total cn = 0 - ax = fig.add_subplot(gs[2*n_total_cn-2, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(0,0)]]).reshape((1,-1)), columns=["{0,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 2, :]) + seaborn.heatmap( + pd.DataFrame(np.array([map_cn[(0, 0)]]).reshape((1, -1)), columns=["{0,0}"]), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - ax.set_xticklabels(ax.get_xticklabels(), position=(0,shift_y)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, shift_y)) # total cn = 1 - ax = fig.add_subplot(gs[2*n_total_cn-4, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(1,0)]]).reshape((1,-1)), columns=["{1,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 4, :]) + seaborn.heatmap( + pd.DataFrame(np.array([map_cn[(1, 0)]]).reshape((1, -1)), columns=["{1,0}"]), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - ax.set_xticklabels(ax.get_xticklabels(), position=(0,shift_y)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, shift_y)) # total cn = 2 - ax = fig.add_subplot(gs[2*n_total_cn-6, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(1,1)], map_cn[(2,0)]]).reshape((1,-1)), columns=["{1,1}", "{2,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 6, :]) + seaborn.heatmap( + pd.DataFrame( + np.array([map_cn[(1, 1)], map_cn[(2, 0)]]).reshape((1, -1)), + columns=["{1,1}", "{2,0}"], + ), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - ax.set_xticklabels(ax.get_xticklabels(), position=(0,0.3)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, 0.3)) # total cn = 3 - ax = fig.add_subplot(gs[2*n_total_cn-8, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(2,1)], map_cn[(3,0)]]).reshape((1,-1)), columns=["{2,1}", "{3,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 8, :]) + seaborn.heatmap( + pd.DataFrame( + np.array([map_cn[(2, 1)], map_cn[(3, 0)]]).reshape((1, -1)), + columns=["{2,1}", "{3,0}"], + ), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - ax.set_xticklabels(ax.get_xticklabels(), position=(0,shift_y)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, shift_y)) # total cn = 4 - ax = fig.add_subplot(gs[2*n_total_cn-10, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(2,2)], map_cn[(3,1)], map_cn[(4,0)]]).reshape((1,-1)), columns=["{2,2}", "{3,1}", "{4,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 10, :]) + seaborn.heatmap( + pd.DataFrame( + np.array([map_cn[(2, 2)], map_cn[(3, 1)], map_cn[(4, 0)]]).reshape((1, -1)), + columns=["{2,2}", "{3,1}", "{4,0}"], + ), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - 
ax.set_xticklabels(ax.get_xticklabels(), position=(0,shift_y)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, shift_y)) # total cn = 5 - ax = fig.add_subplot(gs[2*n_total_cn-12, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(3,2)], map_cn[(4,1)], map_cn[(5,0)]]).reshape((1,-1)), columns=["{3,2}", "{4,1}", "{5,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 12, :]) + seaborn.heatmap( + pd.DataFrame( + np.array([map_cn[(3, 2)], map_cn[(4, 1)], map_cn[(5, 0)]]).reshape((1, -1)), + columns=["{3,2}", "{4,1}", "{5,0}"], + ), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - ax.set_xticklabels(ax.get_xticklabels(), position=(0,shift_y)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, shift_y)) # total cn = 6 - ax = fig.add_subplot(gs[2*n_total_cn-14, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(3,3)], map_cn[(4,2)], map_cn[(5,1)], map_cn[(6,0)]]).reshape((1,-1)), columns=["{3,3}", "{4,2}", "{5,1}", "{6,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 14, :]) + seaborn.heatmap( + pd.DataFrame( + np.array( + [map_cn[(3, 3)], map_cn[(4, 2)], map_cn[(5, 1)], map_cn[(6, 0)]] + ).reshape((1, -1)), + columns=["{3,3}", "{4,2}", "{5,1}", "{6,0}"], + ), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - ax.set_xticklabels(ax.get_xticklabels(), position=(0,shift_y)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, shift_y)) return fig -def plot_acn_withhighlight(cn_file, df_highlight_events, ax_handle, clone_ids=None, clone_names=None, add_chrbar=True, chrbar_thickness=0.1, add_legend=True, remove_xticks=True): +def plot_acn_withhighlight( + cn_file, + df_highlight_events, + ax_handle, + clone_ids=None, + clone_names=None, + add_chrbar=True, + chrbar_thickness=0.1, + add_legend=True, + remove_xticks=True, +): """ df_highlight_events: dataframe with columns: ["BinSTART", "BinEND", "involved_clones"] """ # full color palette - palette,_ = get_full_palette() + palette, _ = get_full_palette() # read CN profiles df_cnv = pd.read_csv(cn_file, header=0, sep="\t") - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) print(final_clone_ids) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) found = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) found += list(zip(major, minor)) found = list(set(found)) found.sort() # map CN to single digit number - map_cn = {x:i for i,x in enumerate(found)} + map_cn = {x: i for i, x in enumerate(found)} cnv_mapped = [] ploidy = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - cnv_mapped.append( [map_cn[(major[i], minor[i])] for 
i in range(len(major))] ) - ploidy.append( np.mean(major + minor) ) - cnv_mapped = pd.DataFrame( np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids]) - ploidy = pd.DataFrame(np.around(np.array(ploidy), decimals=2).reshape(-1,1), index=[f"clone {cid}" for cid in final_clone_ids]) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + cnv_mapped.append([map_cn[(major[i], minor[i])] for i in range(len(major))]) + ploidy.append(np.mean(major + minor)) + cnv_mapped = pd.DataFrame( + np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids] + ) + ploidy = pd.DataFrame( + np.around(np.array(ploidy), decimals=2).reshape(-1, 1), + index=[f"clone {cid}" for cid in final_clone_ids], + ) chr_ids = df_cnv.CHR colors = [palette[c] for c in found] if clone_ids is None: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in final_clone_ids] - rename_cnv_mapped = pd.DataFrame(cnv_mapped.values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(final_clone_ids)]) - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(final_clone_ids) + ], + ) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) else: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in clone_ids] if clone_names is None: - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ], + ) else: - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)]) - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[ + f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ], + ) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) for i in range(df_highlight_events.shape[0]): involved_clones = df_highlight_events.involved_clones.values[i] # interval start and end - interval = [df_highlight_events.BinSTART.values[i], df_highlight_events.BinEND.values[i]] + interval = [ + df_highlight_events.BinSTART.values[i], + df_highlight_events.BinEND.values[i], + ] if clone_ids is None: for c, cid in enumerate(final_clone_ids): if not cid in involved_clones: continue y1 = c - y2 = c+1 - ax_handle.fill_between( np.arange(interval[0], interval[1]), y1, y2, color="none", edgecolor="black", linewidth=2) + y2 = c + 1 + ax_handle.fill_between( + np.arange(interval[0], interval[1]), + y1, + y2, + 
color="none", + edgecolor="black", + linewidth=2, + ) else: for c, cid in enumerate(clone_ids): if not cid in involved_clones: continue y1 = c - y2 = c+1 - ax_handle.fill_between( np.arange(interval[0], interval[1]), y1, y2, color="none", edgecolor="black", linewidth=2) - + y2 = c + 1 + ax_handle.fill_between( + np.arange(interval[0], interval[1]), + y1, + y2, + color="none", + edgecolor="black", + linewidth=2, + ) + if add_chrbar: # add chr color - chr_palette = cycle(['#525252', '#969696', '#cccccc']) - lut = {c:next(chr_palette) for c in np.unique(chr_ids.values)} + chr_palette = cycle(["#525252", "#969696", "#cccccc"]) + lut = {c: next(chr_palette) for c in np.unique(chr_ids.values)} col_colors = chr_ids.map(lut) for i, color in enumerate(col_colors): - ax_handle.add_patch(plt.Rectangle(xy=(i, 1.01), width=1, height=chrbar_thickness, color=color, lw=0, transform=ax_handle.get_xaxis_transform(), clip_on=False, rasterized=True)) + ax_handle.add_patch( + plt.Rectangle( + xy=(i, 1.01), + width=1, + height=chrbar_thickness, + color=color, + lw=0, + transform=ax_handle.get_xaxis_transform(), + clip_on=False, + rasterized=True, + ) + ) for c in np.unique(chr_ids.values): interval = np.where(chr_ids.values == c)[0] mid = np.percentile(interval, 45) - ax_handle.text(mid-10, 1.04, str(c), transform=ax_handle.get_xaxis_transform()) + ax_handle.text( + mid - 10, 1.04, str(c), transform=ax_handle.get_xaxis_transform() + ) ax_handle.set_yticklabels(ax_handle.get_yticklabels(), rotation=0) if remove_xticks: ax_handle.set_xticks([]) if add_legend: - a00 = plt.arrow(0,0, 0,0, - color='darkblue') - a10 = plt.arrow(0,0, 0,0, color='lightblue') - a11 = plt.arrow(0,0, 0,0, color='lightgray') - a20 = plt.arrow(0,0, 0,0, color='dimgray') - a21 = plt.arrow(0,0, 0,0, color='lightgoldenrodyellow') - a30 = plt.arrow(0,0, 0,0, color='gold') - a22 = plt.arrow(0,0, 0,0, color='navajowhite') - a31 = plt.arrow(0,0, 0,0, color='orange') - a40 = plt.arrow(0,0, 0,0, color='darkorange') - a32 = plt.arrow(0,0, 0,0, color='salmon') - a41 = plt.arrow(0,0, 0,0, color='red') - a50 = plt.arrow(0,0, 0,0, color='darkred') - a33 = plt.arrow(0,0, 0,0, color='plum') - a42 = plt.arrow(0,0, 0,0, color='orchid') - a51 = plt.arrow(0,0, 0,0, color='purple') - a60 = plt.arrow(0,0, 0,0, color='indigo') - ax_handle.legend([a00, a10, a11, a20, a21, a30, a22, a31, a40, a32, a41, a50, a33, a42, a51, a60], \ - ['(0, 0)','(1, 0)','(1, 1)','(2, 0)', '(2, 1)','(3, 0)', '(2, 2)','(3, 1)','(4, 0)','(3, 2)', \ - '(4, 1)','(5, 0)', '(3, 3)','(4, 2)','(5, 1)','(6, 0)'], ncol=2, loc='upper left', bbox_to_anchor=(1,1 - 0.1 * min(0, rename_cnv_mapped.shape[0]-6))) + a00 = plt.arrow(0, 0, 0, 0, color="darkblue") + a10 = plt.arrow(0, 0, 0, 0, color="lightblue") + a11 = plt.arrow(0, 0, 0, 0, color="lightgray") + a20 = plt.arrow(0, 0, 0, 0, color="dimgray") + a21 = plt.arrow(0, 0, 0, 0, color="lightgoldenrodyellow") + a30 = plt.arrow(0, 0, 0, 0, color="gold") + a22 = plt.arrow(0, 0, 0, 0, color="navajowhite") + a31 = plt.arrow(0, 0, 0, 0, color="orange") + a40 = plt.arrow(0, 0, 0, 0, color="darkorange") + a32 = plt.arrow(0, 0, 0, 0, color="salmon") + a41 = plt.arrow(0, 0, 0, 0, color="red") + a50 = plt.arrow(0, 0, 0, 0, color="darkred") + a33 = plt.arrow(0, 0, 0, 0, color="plum") + a42 = plt.arrow(0, 0, 0, 0, color="orchid") + a51 = plt.arrow(0, 0, 0, 0, color="purple") + a60 = plt.arrow(0, 0, 0, 0, color="indigo") + ax_handle.legend( + [ + a00, + a10, + a11, + a20, + a21, + a30, + a22, + a31, + a40, + a32, + a41, + a50, + a33, + a42, + a51, + a60, + ], + [ 
+ "(0, 0)", + "(1, 0)", + "(1, 1)", + "(2, 0)", + "(2, 1)", + "(3, 0)", + "(2, 2)", + "(3, 1)", + "(4, 0)", + "(3, 2)", + "(4, 1)", + "(5, 0)", + "(3, 3)", + "(4, 2)", + "(5, 1)", + "(6, 0)", + ], + ncol=2, + loc="upper left", + bbox_to_anchor=(1, 1 - 0.1 * min(0, rename_cnv_mapped.shape[0] - 6)), + ) return ax_handle -def plot_total_cn(df_cnv, ax_handle, df_highlight_events=None, palette_mode=6, clone_ids=None, clone_names=None, add_chrbar=True, chrbar_thickness=0.1, add_legend=True, legend_position="upper left", remove_xticks=True): +def plot_total_cn( + df_cnv, + ax_handle, + df_highlight_events=None, + palette_mode=6, + clone_ids=None, + clone_names=None, + add_chrbar=True, + chrbar_thickness=0.1, + add_legend=True, + legend_position="upper left", + remove_xticks=True, +): """ df_cnv : pandas.DataFrame Each row is a genomic bin, containing columns "CHR", "clone {cid}" for each clone id. @@ -663,60 +1633,127 @@ def plot_total_cn(df_cnv, ax_handle, df_highlight_events=None, palette_mode=6, c # create a cmap that map "amp" to #B44F3D, "bamp" to #E18073, "bdel" to #A0CEEA, "del" to #4F69DF, "loh" to #738B2D if palette_mode == 6: - full_palette = {"amp":"#B44F3D", "bamp":"#E18073", "bdel":"#A0CEEA", "del":"#4F69DF", "loh":"#738B2D", "neu":"lightgrey"} + full_palette = { + "amp": "#B44F3D", + "bamp": "#E18073", + "bdel": "#A0CEEA", + "del": "#4F69DF", + "loh": "#738B2D", + "neu": "lightgrey", + } else: - full_palette = {"amp":"#B44F3D", "del":"#4F69DF", "neu":"lightgrey"} + full_palette = {"amp": "#B44F3D", "del": "#4F69DF", "neu": "lightgrey"} if clone_ids is None: - found = np.unique(df_cnv.iloc[:, df_cnv.columns.str.startswith("clone")].values.flatten()) - lut = {x:i for i,x in enumerate(found)} + found = np.unique( + df_cnv.iloc[:, df_cnv.columns.str.startswith("clone")].values.flatten() + ) + lut = {x: i for i, x in enumerate(found)} palette = matplotlib.colors.ListedColormap([full_palette[x] for x in found]) - df_cnv_mapped = df_cnv.iloc[:, df_cnv.columns.str.startswith("clone")].replace(lut) + df_cnv_mapped = df_cnv.iloc[:, df_cnv.columns.str.startswith("clone")].replace( + lut + ) df_cnv_mapped = df_cnv_mapped.T - seaborn.heatmap(df_cnv_mapped, cmap=palette, linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + seaborn.heatmap( + df_cnv_mapped, + cmap=palette, + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) else: - found = np.unique(df_cnv[[f"clone {cid}" for cid in clone_ids]].values.flatten()) - lut = {x:i for i,x in enumerate(found)} + found = np.unique( + df_cnv[[f"clone {cid}" for cid in clone_ids]].values.flatten() + ) + lut = {x: i for i, x in enumerate(found)} palette = matplotlib.colors.ListedColormap([full_palette[x] for x in found]) df_cnv_mapped = df_cnv[[f"clone {cid}" for cid in clone_ids]].replace(lut) df_cnv_mapped = df_cnv_mapped.T if not clone_names is None: - df_cnv_mapped.rename(index={f"clone {cid}":clone_names[i] for i,cid in enumerate(clone_ids)}, inplace=True) - seaborn.heatmap(df_cnv_mapped, cmap=palette, linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + df_cnv_mapped.rename( + index={ + f"clone {cid}": clone_names[i] for i, cid in enumerate(clone_ids) + }, + inplace=True, + ) + seaborn.heatmap( + df_cnv_mapped, + cmap=palette, + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) if not df_highlight_events is None: - final_clone_ids = [x.split(" ")[1] for x in df_cnv.columns if x.startswith("clone")] + final_clone_ids = [ + x.split(" ")[1] for x in df_cnv.columns if x.startswith("clone") + ] for i in 
range(df_highlight_events.shape[0]): involved_clones = df_highlight_events.involved_clones.values[i] # interval start and end - interval = [df_highlight_events.BinSTART.values[i], df_highlight_events.BinEND.values[i]] + interval = [ + df_highlight_events.BinSTART.values[i], + df_highlight_events.BinEND.values[i], + ] if clone_ids is None: for c, cid in enumerate(final_clone_ids): if not cid in involved_clones: continue y1 = c - y2 = c+1 - ax_handle.fill_between( np.arange(interval[0], interval[1]), y1, y2, color="none", edgecolor="black", linewidth=2) + y2 = c + 1 + ax_handle.fill_between( + np.arange(interval[0], interval[1]), + y1, + y2, + color="none", + edgecolor="black", + linewidth=2, + ) else: for c, cid in enumerate(clone_ids): if not cid in involved_clones: continue y1 = c - y2 = c+1 - ax_handle.fill_between( np.arange(interval[0], interval[1]), y1, y2, color="none", edgecolor="black", linewidth=2) + y2 = c + 1 + ax_handle.fill_between( + np.arange(interval[0], interval[1]), + y1, + y2, + color="none", + edgecolor="black", + linewidth=2, + ) if add_chrbar: # add chr color - chr_palette = cycle(['#525252', '#969696', '#cccccc']) - lut = {c:next(chr_palette) for c in np.unique(chr_ids.values)} + chr_palette = cycle(["#525252", "#969696", "#cccccc"]) + lut = {c: next(chr_palette) for c in np.unique(chr_ids.values)} col_colors = chr_ids.map(lut) for i, color in enumerate(col_colors): - ax_handle.add_patch(plt.Rectangle(xy=(i, 1 + 0.02*chrbar_thickness), width=1, height=chrbar_thickness, color=color, lw=0, transform=ax_handle.get_xaxis_transform(), clip_on=False, rasterized=True)) + ax_handle.add_patch( + plt.Rectangle( + xy=(i, 1 + 0.02 * chrbar_thickness), + width=1, + height=chrbar_thickness, + color=color, + lw=0, + transform=ax_handle.get_xaxis_transform(), + clip_on=False, + rasterized=True, + ) + ) for c in np.unique(chr_ids.values): interval = np.where(chr_ids.values == c)[0] mid = np.percentile(interval, 45) - ax_handle.text(mid-10, 1 + 0.2*chrbar_thickness, str(c), transform=ax_handle.get_xaxis_transform()) + ax_handle.text( + mid - 10, + 1 + 0.2 * chrbar_thickness, + str(c), + transform=ax_handle.get_xaxis_transform(), + ) ax_handle.set_yticklabels(ax_handle.get_yticklabels(), rotation=0) if remove_xticks: @@ -724,73 +1761,150 @@ def plot_total_cn(df_cnv, ax_handle, df_highlight_events=None, palette_mode=6, c if add_legend: if palette_mode == 6: - a0 = plt.arrow(0,0, 0,0, color='#B44F3D') - a1 = plt.arrow(0,0, 0,0, color='#E18073') - a2 = plt.arrow(0,0, 0,0, color='lightgrey') - a3 = plt.arrow(0,0, 0,0, color='#A0CEEA') - a4 = plt.arrow(0,0, 0,0, color='#4F69DF') - a5 = plt.arrow(0,0, 0,0, color='#738B2D') + a0 = plt.arrow(0, 0, 0, 0, color="#B44F3D") + a1 = plt.arrow(0, 0, 0, 0, color="#E18073") + a2 = plt.arrow(0, 0, 0, 0, color="lightgrey") + a3 = plt.arrow(0, 0, 0, 0, color="#A0CEEA") + a4 = plt.arrow(0, 0, 0, 0, color="#4F69DF") + a5 = plt.arrow(0, 0, 0, 0, color="#738B2D") if legend_position == "upper left": - ax_handle.legend([a0, a1, a2, a3, a4, a5], ["amp", "bamp", "neu", "bdel", "del", "loh"], loc='upper left', bbox_to_anchor=(1,1 - 0.1 * min(0, df_cnv_mapped.shape[0]-5))) + ax_handle.legend( + [a0, a1, a2, a3, a4, a5], + ["amp", "bamp", "neu", "bdel", "del", "loh"], + loc="upper left", + bbox_to_anchor=(1, 1 - 0.1 * min(0, df_cnv_mapped.shape[0] - 5)), + ) else: - ax_handle.legend([a0, a1, a2, a3, a4, a5], ["amp", "bamp", "neu", "bdel", "del", "loh"], loc='lower center', bbox_to_anchor=(0.5, -0.25), ncol=6) + ax_handle.legend( + [a0, a1, a2, a3, a4, a5], + ["amp", 
"bamp", "neu", "bdel", "del", "loh"], + loc="lower center", + bbox_to_anchor=(0.5, -0.25), + ncol=6, + ) else: - a0 = plt.arrow(0,0, 0,0, color='#B44F3D') - a1 = plt.arrow(0,0, 0,0, color='lightgrey') - a2 = plt.arrow(0,0, 0,0, color='#4F69DF') + a0 = plt.arrow(0, 0, 0, 0, color="#B44F3D") + a1 = plt.arrow(0, 0, 0, 0, color="lightgrey") + a2 = plt.arrow(0, 0, 0, 0, color="#4F69DF") if legend_position == "upper left": - ax_handle.legend([a0, a1, a2], ["amp", "neu", "del"], loc='upper left', bbox_to_anchor=(1,1 - 0.1 * min(0, df_cnv_mapped.shape[0]-2))) + ax_handle.legend( + [a0, a1, a2], + ["amp", "neu", "del"], + loc="upper left", + bbox_to_anchor=(1, 1 - 0.1 * min(0, df_cnv_mapped.shape[0] - 2)), + ) else: - ax_handle.legend([a0, a1, a2], ["amp", "neu", "del"], loc='lower center', bbox_to_anchor=(0.5, -0.25), ncol=3) + ax_handle.legend( + [a0, a1, a2], + ["amp", "neu", "del"], + loc="lower center", + bbox_to_anchor=(0.5, -0.25), + ncol=3, + ) return ax_handle -def plot_amp_del(cn_file, ax_handle, clone_ids=None, clone_names=None, add_chrbar=True, chrbar_thickness=0.1, add_legend=True, remove_xticks=True): +def plot_amp_del( + cn_file, + ax_handle, + clone_ids=None, + clone_names=None, + add_chrbar=True, + chrbar_thickness=0.1, + add_legend=True, + remove_xticks=True, +): # define color palette that maps 0 to lightgrey, -2 and -1 to blues with increasing intensity, and 1 and 2 to reds with increasing intensity - palette_map = {-2+i:x for i,x in enumerate(seaborn.color_palette("coolwarm", 5).as_hex())} - + palette_map = { + -2 + i: x for i, x in enumerate(seaborn.color_palette("coolwarm", 5).as_hex()) + } + # read CN profiles df_cnv = pd.read_csv(cn_file, header=0, sep="\t") - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) # compute the relative copy number with respect to the median copy number per clone df_cnv_rel = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) median_copy = np.median(major + minor) # clamp the relative copy number major + minor - median_copy to [-2,2] - df_cnv_rel.append( np.minimum(2, np.maximum(-2, major + minor - median_copy)) ) - df_cnv_rel = pd.DataFrame( np.array(df_cnv_rel), index=[f"clone {cid}" for cid in final_clone_ids]) + df_cnv_rel.append(np.minimum(2, np.maximum(-2, major + minor - median_copy))) + df_cnv_rel = pd.DataFrame( + np.array(df_cnv_rel), index=[f"clone {cid}" for cid in final_clone_ids] + ) # plot heatmap if clone_ids is None: - rename_cnv_mapped = pd.DataFrame(df_cnv_rel.values, index=[f"clone {cid}" for c,cid in enumerate(final_clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + df_cnv_rel.values, + index=[f"clone {cid}" for c, cid in enumerate(final_clone_ids)], + ) unique_cnv_values = np.unique(rename_cnv_mapped.values) - seaborn.heatmap(rename_cnv_mapped, cmap=ListedColormap([palette_map[x] for x in unique_cnv_values]), linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + seaborn.heatmap( + rename_cnv_mapped, + 
cmap=ListedColormap([palette_map[x] for x in unique_cnv_values]), + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) else: if clone_names is None: - rename_cnv_mapped = pd.DataFrame(df_cnv_rel.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"clone {cid}" for c,cid in enumerate(clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + df_cnv_rel.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[f"clone {cid}" for c, cid in enumerate(clone_ids)], + ) else: - rename_cnv_mapped = pd.DataFrame(df_cnv_rel.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"{clone_names[c]}" for c,cid in enumerate(clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + df_cnv_rel.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[f"{clone_names[c]}" for c, cid in enumerate(clone_ids)], + ) unique_cnv_values = np.unique(rename_cnv_mapped.values) - seaborn.heatmap(rename_cnv_mapped, cmap=ListedColormap([palette_map[x] for x in unique_cnv_values]), linewidths=0, cbar=False, rasterized=True, ax=ax_handle) - + seaborn.heatmap( + rename_cnv_mapped, + cmap=ListedColormap([palette_map[x] for x in unique_cnv_values]), + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) + if add_chrbar: chr_ids = df_cnv.CHR # add chr color - chr_palette = cycle(['#525252', '#969696', '#cccccc']) - lut = {c:next(chr_palette) for c in np.unique(chr_ids.values)} + chr_palette = cycle(["#525252", "#969696", "#cccccc"]) + lut = {c: next(chr_palette) for c in np.unique(chr_ids.values)} col_colors = chr_ids.map(lut) for i, color in enumerate(col_colors): - ax_handle.add_patch(plt.Rectangle(xy=(i, 1.01), width=1, height=chrbar_thickness, color=color, lw=0, transform=ax_handle.get_xaxis_transform(), clip_on=False, rasterized=True)) + ax_handle.add_patch( + plt.Rectangle( + xy=(i, 1.01), + width=1, + height=chrbar_thickness, + color=color, + lw=0, + transform=ax_handle.get_xaxis_transform(), + clip_on=False, + rasterized=True, + ) + ) for c in np.unique(chr_ids.values): interval = np.where(chr_ids.values == c)[0] mid = np.percentile(interval, 45) - ax_handle.text(mid-10, 1.04, str(c), transform=ax_handle.get_xaxis_transform()) + ax_handle.text( + mid - 10, 1.04, str(c), transform=ax_handle.get_xaxis_transform() + ) ax_handle.set_yticklabels(ax_handle.get_yticklabels(), rotation=0) if remove_xticks: @@ -798,34 +1912,54 @@ def plot_amp_del(cn_file, ax_handle, clone_ids=None, clone_names=None, add_chrba # add legend corresponding to palette if add_legend: - a0 = plt.arrow(0,0, 0,0, color=palette_map[-2]) - a1 = plt.arrow(0,0, 0,0, color=palette_map[-1]) - a2 = plt.arrow(0,0, 0,0, color=palette_map[0]) - a3 = plt.arrow(0,0, 0,0, color=palette_map[1]) - a4 = plt.arrow(0,0, 0,0, color=palette_map[2]) - ax_handle.legend([a0, a1, a2, a3, a4], ['-2 and below','-1','0','1', '2 and above'], ncol=1, loc='upper left', bbox_to_anchor=(1,1)) + a0 = plt.arrow(0, 0, 0, 0, color=palette_map[-2]) + a1 = plt.arrow(0, 0, 0, 0, color=palette_map[-1]) + a2 = plt.arrow(0, 0, 0, 0, color=palette_map[0]) + a3 = plt.arrow(0, 0, 0, 0, color=palette_map[1]) + a4 = plt.arrow(0, 0, 0, 0, color=palette_map[2]) + ax_handle.legend( + [a0, a1, a2, a3, a4], + ["-2 and below", "-1", "0", "1", "2 and above"], + ncol=1, + loc="upper left", + bbox_to_anchor=(1, 1), + ) return ax_handle - -def plot_rdr_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, clone_names=None, remove_xticks=True, rdr_ylim=5, chrtext_shift=-0.3, base_height=3.2, pointsize=15, linewidth=1, palette="chisel"): +def plot_rdr_baf( + 
configuration_file, + r_hmrf_initialization, + cn_file, + clone_ids=None, + clone_names=None, + remove_xticks=True, + rdr_ylim=5, + chrtext_shift=-0.3, + base_height=3.2, + pointsize=15, + linewidth=1, + palette="chisel", +): # full palette chisel_palette, ordered_acn = get_full_palette() - map_cn = {x:i for i,x in enumerate(ordered_acn)} + map_cn = {x: i for i, x in enumerate(ordered_acn)} colors = [chisel_palette[c] for c in ordered_acn] try: config = read_configuration_file(configuration_file) except: config = read_joint_configuration_file(configuration_file) - + # load allele specific integer copy numbers df_cnv = pd.read_csv(cn_file, header=0, sep="\t") - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) - if not '0' in final_clone_ids: - final_clone_ids = np.array(['0'] + list(final_clone_ids)) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) + if not "0" in final_clone_ids: + final_clone_ids = np.array(["0"] + list(final_clone_ids)) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) unique_chrs = np.unique(df_cnv.CHR.values) # load data @@ -836,144 +1970,362 @@ def plot_rdr_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=N single_base_nb_mean = dat["single_base_nb_mean"] single_total_bb_RD = dat["single_total_bb_RD"] single_tumor_prop = dat["single_tumor_prop"] - res_combine = dict( np.load(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True) ) + res_combine = dict( + np.load( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) n_states = res_combine["new_p_binom"].shape[0] assert single_X.shape[0] == df_cnv.shape[0] - clone_index = [np.where(res_combine["new_assignment"] == c)[0] for c,cid in enumerate(final_clone_ids)] + clone_index = [ + np.where(res_combine["new_assignment"] == c)[0] + for c, cid in enumerate(final_clone_ids) + ] if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, clone_index, single_tumor_prop) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + ) n_obs = X.shape[0] nonempty_clones = np.where(np.sum(total_bb_RD, axis=0) > 0)[0] # plotting all clones if clone_ids is None: - fig, axes = plt.subplots(2*len(nonempty_clones), 1, figsize=(20, base_height*len(nonempty_clones)), dpi=200, facecolor="white") - for s,c in enumerate(nonempty_clones): + fig, axes = plt.subplots( + 2 * len(nonempty_clones), + 1, + figsize=(20, base_height * len(nonempty_clones)), + dpi=200, + facecolor="white", + ) + for s, c in enumerate(nonempty_clones): cid = final_clone_ids[c] # major and minor allele copies give the hue - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + 
df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) # plot points - segments, labs = get_intervals(res_combine["pred_cnv"][:,c]) + segments, labs = get_intervals(res_combine["pred_cnv"][:, c]) if palette == "chisel": - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,0,c]/base_nb_mean[:,c], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", linewidth=linewidth, alpha=1, legend=False, ax=axes[2*s]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 0, c] / base_nb_mean[:, c], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=1, + legend=False, + ax=axes[2 * s], + ) else: - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,0,c]/base_nb_mean[:,c], \ - hue=pd.Categorical(res_combine["pred_cnv"][:,c], categories=np.arange(n_states), ordered=True), \ - palette=palette, s=pointsize, edgecolor="black", linewidth=linewidth, alpha=1, legend=False, ax=axes[2*s]) - axes[2*s].set_ylabel(f"clone {cid}\nRDR") - axes[2*s].set_yticks(np.arange(1, rdr_ylim, 1)) - axes[2*s].set_ylim([0,rdr_ylim]) - axes[2*s].set_xlim([0, n_obs]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 0, c] / base_nb_mean[:, c], + hue=pd.Categorical( + res_combine["pred_cnv"][:, c], + categories=np.arange(n_states), + ordered=True, + ), + palette=palette, + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=1, + legend=False, + ax=axes[2 * s], + ) + axes[2 * s].set_ylabel(f"clone {cid}\nRDR") + axes[2 * s].set_yticks(np.arange(1, rdr_ylim, 1)) + axes[2 * s].set_ylim([0, rdr_ylim]) + axes[2 * s].set_xlim([0, n_obs]) if remove_xticks: - axes[2*s].set_xticks([]) + axes[2 * s].set_xticks([]) if palette == "chisel": - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[2*s+1]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[2 * s + 1], + ) else: - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical(res_combine["pred_cnv"][:,c], categories=np.arange(n_states), ordered=True), \ - palette=palette, s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[2*s+1]) - axes[2*s+1].set_ylabel(f"clone {cid}\nphased AF") - axes[2*s+1].set_ylim([-0.1, 1.1]) - axes[2*s+1].set_yticks([0, 0.5, 1]) - axes[2*s+1].set_xlim([0, n_obs]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + res_combine["pred_cnv"][:, c], + categories=np.arange(n_states), + ordered=True, + ), + palette=palette, + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[2 * s + 1], + ) + axes[2 * s + 1].set_ylabel(f"clone 
{cid}\nphased AF") + axes[2 * s + 1].set_ylim([-0.1, 1.1]) + axes[2 * s + 1].set_yticks([0, 0.5, 1]) + axes[2 * s + 1].set_xlim([0, n_obs]) if remove_xticks: - axes[2*s+1].set_xticks([]) + axes[2 * s + 1].set_xticks([]) for i, seg in enumerate(segments): - axes[2*s].plot(seg, [np.exp(res_combine["new_log_mu"][labs[i],c]), np.exp(res_combine["new_log_mu"][labs[i],c])], c="black", linewidth=2) - axes[2*s+1].plot(seg, [res_combine["new_p_binom"][labs[i],c], res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) - axes[2*s+1].plot(seg, [1-res_combine["new_p_binom"][labs[i],c], 1-res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) + axes[2 * s].plot( + seg, + [ + np.exp(res_combine["new_log_mu"][labs[i], c]), + np.exp(res_combine["new_log_mu"][labs[i], c]), + ], + c="black", + linewidth=2, + ) + axes[2 * s + 1].plot( + seg, + [ + res_combine["new_p_binom"][labs[i], c], + res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) + axes[2 * s + 1].plot( + seg, + [ + 1 - res_combine["new_p_binom"][labs[i], c], + 1 - res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) for i in range(len(lengths)): - median_len = np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[:(i+1)]) * 0.45 - axes[-1].text(median_len-5, chrtext_shift, unique_chrs[i], transform=axes[-1].get_xaxis_transform()) - for k in range(2*len(nonempty_clones)): + median_len = ( + np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[: (i + 1)]) * 0.45 + ) + axes[-1].text( + median_len - 5, + chrtext_shift, + unique_chrs[i], + transform=axes[-1].get_xaxis_transform(), + ) + for k in range(2 * len(nonempty_clones)): axes[k].axvline(x=np.sum(lengths[:(i)]), c="grey", linewidth=1) fig.tight_layout() # plot a given clone else: - fig, axes = plt.subplots(2*len(clone_ids), 1, figsize=(20, base_height*len(clone_ids)), dpi=200, facecolor="white") - for s,cid in enumerate(clone_ids): + fig, axes = plt.subplots( + 2 * len(clone_ids), + 1, + figsize=(20, base_height * len(clone_ids)), + dpi=200, + facecolor="white", + ) + for s, cid in enumerate(clone_ids): c = np.where(final_clone_ids == cid)[0][0] # major and minor allele copies give the hue - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) # plot points - segments, labs = get_intervals(res_combine["pred_cnv"][:,c]) + segments, labs = get_intervals(res_combine["pred_cnv"][:, c]) if palette == "chisel": - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,0,c]/base_nb_mean[:,c], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[2*s]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 0, c] / base_nb_mean[:, c], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[2 * s], + ) else: - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,0,c]/base_nb_mean[:,c], \ - hue=pd.Categorical(res_combine["pred_cnv"][:,c], 
categories=np.arange(n_states), ordered=True), \ - palette=palette, s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[2*s]) - axes[2*s].set_ylabel(f"clone {cid}\nRDR" if clone_names is None else f"clone {clone_names[s]}\nRDR") - axes[2*s].set_yticks(np.arange(1, rdr_ylim, 1)) - axes[2*s].set_ylim([0,5]) - axes[2*s].set_xlim([0, n_obs]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 0, c] / base_nb_mean[:, c], + hue=pd.Categorical( + res_combine["pred_cnv"][:, c], + categories=np.arange(n_states), + ordered=True, + ), + palette=palette, + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[2 * s], + ) + axes[2 * s].set_ylabel( + f"clone {cid}\nRDR" + if clone_names is None + else f"clone {clone_names[s]}\nRDR" + ) + axes[2 * s].set_yticks(np.arange(1, rdr_ylim, 1)) + axes[2 * s].set_ylim([0, 5]) + axes[2 * s].set_xlim([0, n_obs]) if remove_xticks: - axes[2*s].set_xticks([]) + axes[2 * s].set_xticks([]) if palette == "chisel": - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[2*s+1]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[2 * s + 1], + ) else: - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical(res_combine["pred_cnv"][:,c], categories=np.arange(n_states), ordered=True), \ - palette=palette, s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[2*s+1]) - axes[2*s+1].set_ylabel(f"clone {cid}\nphased AF" if clone_names is None else f"clone {clone_names[s]}\nphased AF") - axes[2*s+1].set_ylim([-0.1, 1.1]) - axes[2*s+1].set_yticks([0, 0.5, 1]) - axes[2*s+1].set_xlim([0, n_obs]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + res_combine["pred_cnv"][:, c], + categories=np.arange(n_states), + ordered=True, + ), + palette=palette, + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[2 * s + 1], + ) + axes[2 * s + 1].set_ylabel( + f"clone {cid}\nphased AF" + if clone_names is None + else f"clone {clone_names[s]}\nphased AF" + ) + axes[2 * s + 1].set_ylim([-0.1, 1.1]) + axes[2 * s + 1].set_yticks([0, 0.5, 1]) + axes[2 * s + 1].set_xlim([0, n_obs]) if remove_xticks: - axes[2*s+1].set_xticks([]) + axes[2 * s + 1].set_xticks([]) for i, seg in enumerate(segments): - axes[2*s].plot(seg, [np.exp(res_combine["new_log_mu"][labs[i],c]), np.exp(res_combine["new_log_mu"][labs[i],c])], c="black", linewidth=2) - axes[2*s+1].plot(seg, [res_combine["new_p_binom"][labs[i],c], res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) - axes[2*s+1].plot(seg, [1-res_combine["new_p_binom"][labs[i],c], 1-res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) - + axes[2 * s].plot( + seg, + [ + np.exp(res_combine["new_log_mu"][labs[i], c]), + np.exp(res_combine["new_log_mu"][labs[i], c]), + ], + c="black", + linewidth=2, + ) + axes[2 * s + 1].plot( + seg, + [ + res_combine["new_p_binom"][labs[i], c], + 
res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) + axes[2 * s + 1].plot( + seg, + [ + 1 - res_combine["new_p_binom"][labs[i], c], + 1 - res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) + for i in range(len(lengths)): - median_len = np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[:(i+1)]) * 0.45 - axes[-1].text(median_len-5, chrtext_shift, unique_chrs[i], transform=axes[-1].get_xaxis_transform()) - for k in range(2*len(clone_ids)): + median_len = ( + np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[: (i + 1)]) * 0.45 + ) + axes[-1].text( + median_len - 5, + chrtext_shift, + unique_chrs[i], + transform=axes[-1].get_xaxis_transform(), + ) + for k in range(2 * len(clone_ids)): axes[k].axvline(x=np.sum(lengths[:(i)]), c="grey", linewidth=1) fig.tight_layout() return fig - -def plot_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, clone_names=None, remove_xticks=True, rdr_ylim=5, chrtext_shift=-0.3, base_height=3.2, pointsize=15, linewidth=1, palette="chisel"): +def plot_baf( + configuration_file, + r_hmrf_initialization, + cn_file, + clone_ids=None, + clone_names=None, + remove_xticks=True, + rdr_ylim=5, + chrtext_shift=-0.3, + base_height=3.2, + pointsize=15, + linewidth=1, + palette="chisel", +): # full palette chisel_palette, ordered_acn = get_full_palette() - map_cn = {x:i for i,x in enumerate(ordered_acn)} + map_cn = {x: i for i, x in enumerate(ordered_acn)} colors = [chisel_palette[c] for c in ordered_acn] try: config = read_configuration_file(configuration_file) except: config = read_joint_configuration_file(configuration_file) - + # load allele specific integer copy numbers df_cnv = pd.read_csv(cn_file, header=0, sep="\t") - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) - if not '0' in final_clone_ids: - final_clone_ids = np.array(['0'] + list(final_clone_ids)) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) + if not "0" in final_clone_ids: + final_clone_ids = np.array(["0"] + list(final_clone_ids)) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) unique_chrs = np.unique(df_cnv.CHR.values) # load data @@ -984,40 +2336,90 @@ def plot_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, single_base_nb_mean = dat["single_base_nb_mean"] single_total_bb_RD = dat["single_total_bb_RD"] single_tumor_prop = dat["single_tumor_prop"] - res_combine = dict( np.load(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True) ) + res_combine = dict( + np.load( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) n_states = res_combine["new_p_binom"].shape[0] assert single_X.shape[0] == df_cnv.shape[0] - clone_index = [np.where(res_combine["new_assignment"] == c)[0] for c,cid in enumerate(final_clone_ids)] + clone_index = [ + np.where(res_combine["new_assignment"] == c)[0] + for c, cid in enumerate(final_clone_ids) + ] if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, 
clone_index, single_tumor_prop) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + ) n_obs = X.shape[0] nonempty_clones = np.where(np.sum(total_bb_RD, axis=0) > 0)[0] # plotting all clones if clone_ids is None: - fig, axes = plt.subplots(len(nonempty_clones), 1, figsize=(20, base_height*len(nonempty_clones)), dpi=200, facecolor="white") - for s,c in enumerate(nonempty_clones): + fig, axes = plt.subplots( + len(nonempty_clones), + 1, + figsize=(20, base_height * len(nonempty_clones)), + dpi=200, + facecolor="white", + ) + for s, c in enumerate(nonempty_clones): cid = final_clone_ids[c] # major and minor allele copies give the hue - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) # plot points - segments, labs = get_intervals(res_combine["pred_cnv"][:,c]) + segments, labs = get_intervals(res_combine["pred_cnv"][:, c]) if palette == "chisel": - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[s]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[s], + ) else: - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical(res_combine["pred_cnv"][:,c], categories=np.arange(n_states), ordered=True), \ - palette=palette, s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[s]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + res_combine["pred_cnv"][:, c], + categories=np.arange(n_states), + ordered=True, + ), + palette=palette, + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[s], + ) axes[s].set_ylabel(f"clone {cid}\nphased AF") axes[s].set_ylim([-0.1, 1.1]) axes[s].set_yticks([0, 0.5, 1]) @@ -1025,56 +2427,153 @@ def plot_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, if remove_xticks: axes[s].set_xticks([]) for i, seg in enumerate(segments): - axes[s].plot(seg, [res_combine["new_p_binom"][labs[i],c], res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) - axes[s].plot(seg, [1-res_combine["new_p_binom"][labs[i],c], 1-res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) + axes[s].plot( + seg, + [ + res_combine["new_p_binom"][labs[i], c], + res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) + axes[s].plot( + seg, + [ + 1 - res_combine["new_p_binom"][labs[i], c], + 1 - res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) for i in range(len(lengths)): - median_len = np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[:(i+1)]) * 0.45 - axes[-1].text(median_len-5, chrtext_shift, 
unique_chrs[i], transform=axes[-1].get_xaxis_transform()) + median_len = ( + np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[: (i + 1)]) * 0.45 + ) + axes[-1].text( + median_len - 5, + chrtext_shift, + unique_chrs[i], + transform=axes[-1].get_xaxis_transform(), + ) for k in range(len(nonempty_clones)): axes[k].axvline(x=np.sum(lengths[:(i)]), c="grey", linewidth=1) fig.tight_layout() # plot a given clone else: - fig, axes = plt.subplots(2*len(clone_ids), 1, figsize=(20, base_height*len(clone_ids)), dpi=200, facecolor="white") - for s,cid in enumerate(clone_ids): + fig, axes = plt.subplots( + 2 * len(clone_ids), + 1, + figsize=(20, base_height * len(clone_ids)), + dpi=200, + facecolor="white", + ) + for s, cid in enumerate(clone_ids): c = np.where(final_clone_ids == cid)[0][0] # major and minor allele copies give the hue - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) # plot points - segments, labs = get_intervals(res_combine["pred_cnv"][:,c]) + segments, labs = get_intervals(res_combine["pred_cnv"][:, c]) if palette == "chisel": - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[s]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[s], + ) else: - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical(res_combine["pred_cnv"][:,c], categories=np.arange(n_states), ordered=True), \ - palette=palette, s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[s]) - axes[s].set_ylabel(f"clone {cid}\nphased AF" if clone_names is None else f"clone {clone_names[s]}\nphased AF") + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + res_combine["pred_cnv"][:, c], + categories=np.arange(n_states), + ordered=True, + ), + palette=palette, + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[s], + ) + axes[s].set_ylabel( + f"clone {cid}\nphased AF" + if clone_names is None + else f"clone {clone_names[s]}\nphased AF" + ) axes[s].set_ylim([-0.1, 1.1]) axes[s].set_yticks([0, 0.5, 1]) axes[s].set_xlim([0, n_obs]) if remove_xticks: axes[s].set_xticks([]) for i, seg in enumerate(segments): - axes[s].plot(seg, [res_combine["new_p_binom"][labs[i],c], res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) - axes[s].plot(seg, [1-res_combine["new_p_binom"][labs[i],c], 1-res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) - + axes[s].plot( + seg, + [ + res_combine["new_p_binom"][labs[i], c], + res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) + axes[s].plot( + seg, + [ + 1 - res_combine["new_p_binom"][labs[i], c], + 1 - res_combine["new_p_binom"][labs[i], c], 
+ ], + c="black", + linewidth=2, + ) + for i in range(len(lengths)): - median_len = np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[:(i+1)]) * 0.45 - axes[-1].text(median_len-5, chrtext_shift, unique_chrs[i], transform=axes[-1].get_xaxis_transform()) - for k in range(2*len(clone_ids)): + median_len = ( + np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[: (i + 1)]) * 0.45 + ) + axes[-1].text( + median_len - 5, + chrtext_shift, + unique_chrs[i], + transform=axes[-1].get_xaxis_transform(), + ) + for k in range(2 * len(clone_ids)): axes[k].axvline(x=np.sum(lengths[:(i)]), c="grey", linewidth=1) fig.tight_layout() return fig -def plot_rdr_baf_from_df(df, clone_ids=None, clone_names=None, base_height=3.2, rdr_ylim=3, baf_ylim=0.5, baf_yticks=None, linewidth=0, pointsize=30, chrtext_shift=-0.3, add_legend=False, remove_xticks=True): +def plot_rdr_baf_from_df( + df, + clone_ids=None, + clone_names=None, + base_height=3.2, + rdr_ylim=3, + baf_ylim=0.5, + baf_yticks=None, + linewidth=0, + pointsize=30, + chrtext_shift=-0.3, + add_legend=False, + remove_xticks=True, +): """ Attributes ---------- @@ -1083,127 +2582,256 @@ def plot_rdr_baf_from_df(df, clone_ids=None, clone_names=None, base_height=3.2, """ # full palette chisel_palette, ordered_acn = get_full_palette() - map_cn = {x:i for i,x in enumerate(ordered_acn)} + map_cn = {x: i for i, x in enumerate(ordered_acn)} colors = [chisel_palette[c] for c in ordered_acn] - + # load allele specific integer copy numbers - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df.columns if "RD" in x ]) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df.columns if "RD" in x]) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) unique_chrs = np.unique(df.CHR.values) if clone_ids is None: - fig, axes = plt.subplots(2*len(final_clone_ids), 1, figsize=(20, base_height*len(final_clone_ids)), dpi=200, facecolor="white") - for s,cid in enumerate(final_clone_ids): + fig, axes = plt.subplots( + 2 * len(final_clone_ids), + 1, + figsize=(20, base_height * len(final_clone_ids)), + dpi=200, + facecolor="white", + ) + for s, cid in enumerate(final_clone_ids): # major and minor allele copies give the hue major = np.maximum(df[f"clone{cid} A"].values, df[f"clone{cid} B"].values) minor = np.minimum(df[f"clone{cid} A"].values, df[f"clone{cid} B"].values) - - seaborn.scatterplot(x=np.arange(df.shape[0]), y=df[f'clone{cid} RD'].values, \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", linewidth=linewidth, alpha=0.8, legend=False, ax=axes[2*s]) - axes[2*s].set_ylabel(f"clone {cid}\nRDR") - axes[2*s].set_yticks(np.arange(1, rdr_ylim, 1)) - axes[2*s].set_ylim([0,rdr_ylim]) - axes[2*s].set_xlim([0, df.shape[0]]) + + seaborn.scatterplot( + x=np.arange(df.shape[0]), + y=df[f"clone{cid} RD"].values, + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=0.8, + legend=False, + ax=axes[2 * s], + ) + axes[2 * s].set_ylabel(f"clone {cid}\nRDR") + axes[2 * s].set_yticks(np.arange(1, rdr_ylim, 1)) + axes[2 * s].set_ylim([0, rdr_ylim]) + axes[2 * s].set_xlim([0, df.shape[0]]) if remove_xticks: - 
axes[2*s].set_xticks([]) - seaborn.scatterplot(x=np.arange(df.shape[0]), y=df[f"clone{cid} BAF"].values, \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", linewidth=linewidth, alpha=0.8, legend=False, ax=axes[2*s+1]) - axes[2*s+1].set_ylabel(f"clone {cid}\nphased AF") - axes[2*s+1].set_ylim([-0.1, baf_ylim]) + axes[2 * s].set_xticks([]) + seaborn.scatterplot( + x=np.arange(df.shape[0]), + y=df[f"clone{cid} BAF"].values, + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=0.8, + legend=False, + ax=axes[2 * s + 1], + ) + axes[2 * s + 1].set_ylabel(f"clone {cid}\nphased AF") + axes[2 * s + 1].set_ylim([-0.1, baf_ylim]) if baf_yticks is None: - axes[2*s+1].set_yticks(np.arange(0, baf_ylim, 0.1)) + axes[2 * s + 1].set_yticks(np.arange(0, baf_ylim, 0.1)) else: - axes[2*s+1].set_yticks(baf_yticks) - axes[2*s+1].set_xlim([0, df.shape[0]]) + axes[2 * s + 1].set_yticks(baf_yticks) + axes[2 * s + 1].set_xlim([0, df.shape[0]]) if remove_xticks: - axes[2*s+1].set_xticks([]) + axes[2 * s + 1].set_xticks([]) for i in unique_chrs: median_len = np.percentile(np.where(df.CHR.values == i)[0], 45) max_len = np.max(np.where(df.CHR.values == i)[0]) - axes[-1].text(median_len-5, chrtext_shift, i, transform=axes[-1].get_xaxis_transform()) + axes[-1].text( + median_len - 5, + chrtext_shift, + i, + transform=axes[-1].get_xaxis_transform(), + ) if max_len + 1 < df.shape[0]: - for k in range(2*len(final_clone_ids)): + for k in range(2 * len(final_clone_ids)): axes[k].axvline(x=max_len, c="grey", linewidth=1) # plot a given clone else: - fig, axes = plt.subplots(2*len(clone_ids), 1, figsize=(20, base_height*len(clone_ids)), dpi=200, facecolor="white") - for s,cid in enumerate(clone_ids): + fig, axes = plt.subplots( + 2 * len(clone_ids), + 1, + figsize=(20, base_height * len(clone_ids)), + dpi=200, + facecolor="white", + ) + for s, cid in enumerate(clone_ids): # major and minor allele copies give the hue major = np.maximum(df[f"clone{cid} A"].values, df[f"clone{cid} B"].values) minor = np.minimum(df[f"clone{cid} A"].values, df[f"clone{cid} B"].values) # plot points - seaborn.scatterplot(x=np.arange(df.shape[0]), y=df[f'clone{cid} RD'].values, \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", linewidth=linewidth, alpha=0.8, legend=False, ax=axes[2*s]) - axes[2*s].set_ylabel(f"clone {cid}\nRDR" if clone_names is None else f"clone {clone_names[s]}\nRDR") - axes[2*s].set_yticks(np.arange(1, rdr_ylim, 1)) - axes[2*s].set_ylim([0,rdr_ylim]) - axes[2*s].set_xlim([0, df.shape[0]]) + seaborn.scatterplot( + x=np.arange(df.shape[0]), + y=df[f"clone{cid} RD"].values, + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=0.8, + legend=False, + ax=axes[2 * s], + ) + axes[2 * s].set_ylabel( + f"clone {cid}\nRDR" + if clone_names is None + else f"clone {clone_names[s]}\nRDR" + ) + axes[2 * s].set_yticks(np.arange(1, 
rdr_ylim, 1)) + axes[2 * s].set_ylim([0, rdr_ylim]) + axes[2 * s].set_xlim([0, df.shape[0]]) if remove_xticks: - axes[2*s].set_xticks([]) - seaborn.scatterplot(x=np.arange(df.shape[0]), y=df[f'clone{cid} BAF'].values, \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", linewidth=linewidth, alpha=0.8, legend=False, ax=axes[2*s+1]) - axes[2*s+1].set_ylabel(f"clone {cid}\nphased AF" if clone_names is None else f"clone {clone_names[s]}\nphased AF") - axes[2*s+1].set_ylim([-0.1, baf_ylim]) + axes[2 * s].set_xticks([]) + seaborn.scatterplot( + x=np.arange(df.shape[0]), + y=df[f"clone{cid} BAF"].values, + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=0.8, + legend=False, + ax=axes[2 * s + 1], + ) + axes[2 * s + 1].set_ylabel( + f"clone {cid}\nphased AF" + if clone_names is None + else f"clone {clone_names[s]}\nphased AF" + ) + axes[2 * s + 1].set_ylim([-0.1, baf_ylim]) if baf_yticks is None: - axes[2*s+1].set_yticks(np.arange(0, baf_ylim, 0.1)) + axes[2 * s + 1].set_yticks(np.arange(0, baf_ylim, 0.1)) else: - axes[2*s+1].set_yticks(baf_yticks) - axes[2*s+1].set_xlim([0, df.shape[0]]) + axes[2 * s + 1].set_yticks(baf_yticks) + axes[2 * s + 1].set_xlim([0, df.shape[0]]) if remove_xticks: - axes[2*s+1].set_xticks([]) - + axes[2 * s + 1].set_xticks([]) + for i in unique_chrs: median_len = np.percentile(np.where(df.CHR.values == i)[0], 45) max_len = np.max(np.where(df.CHR.values == i)[0]) - axes[-1].text(median_len-5, chrtext_shift, i, transform=axes[-1].get_xaxis_transform()) + axes[-1].text( + median_len - 5, + chrtext_shift, + i, + transform=axes[-1].get_xaxis_transform(), + ) if max_len + 1 < df.shape[0]: - for k in range(2*len(clone_ids)): + for k in range(2 * len(clone_ids)): axes[k].axvline(x=max_len, c="grey", linewidth=1) if add_legend: - a00 = plt.arrow(0,0, 0,0, - color='darkblue') - a10 = plt.arrow(0,0, 0,0, color='lightblue') - a11 = plt.arrow(0,0, 0,0, color='lightgray') - a20 = plt.arrow(0,0, 0,0, color='dimgray') - a21 = plt.arrow(0,0, 0,0, color='lightgoldenrodyellow') - a30 = plt.arrow(0,0, 0,0, color='gold') - a22 = plt.arrow(0,0, 0,0, color='navajowhite') - a31 = plt.arrow(0,0, 0,0, color='orange') - a40 = plt.arrow(0,0, 0,0, color='darkorange') - a32 = plt.arrow(0,0, 0,0, color='salmon') - a41 = plt.arrow(0,0, 0,0, color='red') - a50 = plt.arrow(0,0, 0,0, color='darkred') - a33 = plt.arrow(0,0, 0,0, color='plum') - a42 = plt.arrow(0,0, 0,0, color='orchid') - a51 = plt.arrow(0,0, 0,0, color='purple') - a60 = plt.arrow(0,0, 0,0, color='indigo') - axes[0].legend([a00, a10, a11, a20, a21, a30, a22, a31, a40, a32, a41, a50, a33, a42, a51, a60], \ - ['(0, 0)','(1, 0)','(1, 1)','(2, 0)', '(2, 1)','(3, 0)', '(2, 2)','(3, 1)','(4, 0)','(3, 2)', \ - '(4, 1)','(5, 0)', '(3, 3)','(4, 2)','(5, 1)','(6, 0)'], ncol=2, loc='upper left', bbox_to_anchor=(1,1)) + a00 = plt.arrow(0, 0, 0, 0, color="darkblue") + a10 = plt.arrow(0, 0, 0, 0, color="lightblue") + a11 = plt.arrow(0, 0, 0, 0, color="lightgray") + a20 = plt.arrow(0, 0, 0, 0, color="dimgray") + a21 = plt.arrow(0, 0, 0, 0, color="lightgoldenrodyellow") + a30 = plt.arrow(0, 0, 0, 0, color="gold") + a22 = plt.arrow(0, 0, 0, 0, color="navajowhite") + a31 = plt.arrow(0, 0, 0, 0, 
color="orange") + a40 = plt.arrow(0, 0, 0, 0, color="darkorange") + a32 = plt.arrow(0, 0, 0, 0, color="salmon") + a41 = plt.arrow(0, 0, 0, 0, color="red") + a50 = plt.arrow(0, 0, 0, 0, color="darkred") + a33 = plt.arrow(0, 0, 0, 0, color="plum") + a42 = plt.arrow(0, 0, 0, 0, color="orchid") + a51 = plt.arrow(0, 0, 0, 0, color="purple") + a60 = plt.arrow(0, 0, 0, 0, color="indigo") + axes[0].legend( + [ + a00, + a10, + a11, + a20, + a21, + a30, + a22, + a31, + a40, + a32, + a41, + a50, + a33, + a42, + a51, + a60, + ], + [ + "(0, 0)", + "(1, 0)", + "(1, 1)", + "(2, 0)", + "(2, 1)", + "(3, 0)", + "(2, 2)", + "(3, 1)", + "(4, 0)", + "(3, 2)", + "(4, 1)", + "(5, 0)", + "(3, 3)", + "(4, 2)", + "(5, 1)", + "(6, 0)", + ], + ncol=2, + loc="upper left", + bbox_to_anchor=(1, 1), + ) fig.tight_layout() fig.subplots_adjust(hspace=0.1) return fig, axes -def plot_2dscatter_rdrbaf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, rdr_ylim=5, base_width=3.2, pointsize=15): +def plot_2dscatter_rdrbaf( + configuration_file, + r_hmrf_initialization, + cn_file, + clone_ids=None, + rdr_ylim=5, + base_width=3.2, + pointsize=15, +): # full palette palette, ordered_acn = get_full_palette() - map_cn = {x:i for i,x in enumerate(ordered_acn)} + map_cn = {x: i for i, x in enumerate(ordered_acn)} colors = [palette[c] for c in ordered_acn] try: config = read_configuration_file(configuration_file) except: config = read_joint_configuration_file(configuration_file) - + # load allele specific integer copy numbers df_cnv = pd.read_csv(cn_file, header=0, sep="\t") n_final_clones = int(df_cnv.columns[-1].split(" ")[0][5:]) + 1 @@ -1218,61 +2846,131 @@ def plot_2dscatter_rdrbaf(configuration_file, r_hmrf_initialization, cn_file, cl single_base_nb_mean = dat["single_base_nb_mean"] single_total_bb_RD = dat["single_total_bb_RD"] single_tumor_prop = dat["single_tumor_prop"] - res_combine = dict( np.load(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True) ) + res_combine = dict( + np.load( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) assert single_X.shape[0] == df_cnv.shape[0] - clone_index = [np.where(res_combine["new_assignment"] == c)[0] for c in range(len( np.unique(res_combine["new_assignment"]) ))] + clone_index = [ + np.where(res_combine["new_assignment"] == c)[0] + for c in range(len(np.unique(res_combine["new_assignment"]))) + ] if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, clone_index, single_tumor_prop) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + ) n_obs = X.shape[0] # plotting all clones if clone_ids is None: - fig, axes = plt.subplots(1, X.shape[2], figsize=(base_width*X.shape[2], base_width), dpi=200, facecolor="white") + fig, axes = plt.subplots( + 1, + X.shape[2], + figsize=(base_width * X.shape[2], base_width), + dpi=200, + facecolor="white", + ) for s in range(X.shape[2]): # major and minor allele copies give the hue - major = np.maximum(df_cnv[f"clone{s} A"].values, df_cnv[f"clone{s} B"].values) - minor = 
np.minimum(df_cnv[f"clone{s} A"].values, df_cnv[f"clone{s} B"].values) + major = np.maximum( + df_cnv[f"clone{s} A"].values, df_cnv[f"clone{s} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{s} A"].values, df_cnv[f"clone{s} B"].values + ) # plot points - seaborn.scatterplot(x=X[:,1,s]/total_bb_RD[:,s], y=X[:,0,s]/base_nb_mean[:,s], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[s]) + seaborn.scatterplot( + x=X[:, 1, s] / total_bb_RD[:, s], + y=X[:, 0, s] / base_nb_mean[:, s], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[s], + ) axes[s].set_xlabel(f"clone {s}\nphased AF") axes[s].set_xlim([-0.1, 1.1]) axes[s].set_xticks([0, 0.5, 1]) axes[s].set_ylabel(f"clone {s}\nRDR") axes[s].set_yticks(np.arange(1, rdr_ylim, 1)) - axes[s].set_ylim([0,5]) + axes[s].set_ylim([0, 5]) fig.tight_layout() # plot a given clone else: - fig, axes = plt.subplots(1, len(clone_ids), figsize=(base_width*len(clone_ids), base_width), dpi=200, facecolor="white") - for s,cid in enumerate(clone_ids): + fig, axes = plt.subplots( + 1, + len(clone_ids), + figsize=(base_width * len(clone_ids), base_width), + dpi=200, + facecolor="white", + ) + for s, cid in enumerate(clone_ids): # major and minor allele copies give the hue - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) # plot points - seaborn.scatterplot(x=X[:,1,cid]/total_bb_RD[:,cid], y=X[:,0,cid]/base_nb_mean[:,cid], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[s]) + seaborn.scatterplot( + x=X[:, 1, cid] / total_bb_RD[:, cid], + y=X[:, 0, cid] / base_nb_mean[:, cid], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[s], + ) axes[s].set_xlabel(f"clone {cid}\nphased AF") axes[s].set_xlim([-0.1, 1.1]) axes[s].set_xticks([0, 0.5, 1]) axes[s].set_ylabel(f"clone {cid}\nRDR") axes[s].set_yticks(np.arange(1, rdr_ylim, 1)) - axes[s].set_ylim([0,5]) + axes[s].set_ylim([0, 5]) fig.tight_layout() return fig -def plot_2dscatter_rdrbaf_from_df(df, axes, cid, cname=None, baf_xlim=0.51, rdr_ylim=3, pointsize=15, linewidth=1, add_legend=False): +def plot_2dscatter_rdrbaf_from_df( + df, + axes, + cid, + cname=None, + baf_xlim=0.51, + rdr_ylim=3, + pointsize=15, + linewidth=1, + add_legend=False, +): """ Attributes ---------- @@ -1281,10 +2979,10 @@ def plot_2dscatter_rdrbaf_from_df(df, axes, cid, cname=None, baf_xlim=0.51, rdr_ """ # full palette palette, ordered_acn = get_full_palette() - map_cn = {x:i for i,x in enumerate(ordered_acn)} + map_cn = {x: i for i, x in 
enumerate(ordered_acn)} colors = [palette[c] for c in ordered_acn] - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df.columns if "RD" in x ]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df.columns if "RD" in x]) assert cid in final_clone_ids unique_chrs = np.unique(df.CHR.values) @@ -1293,90 +2991,211 @@ def plot_2dscatter_rdrbaf_from_df(df, axes, cid, cname=None, baf_xlim=0.51, rdr_ minor = np.minimum(df[f"clone{cid} A"].values, df[f"clone{cid} B"].values) # plot points - seaborn.scatterplot(x=df[f'clone{cid} BAF'].values, y=df[f'clone{cid} RD'].values, \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", linewidth=linewidth, alpha=0.8, legend=False, ax=axes) - axes.set_xlabel(f"clone {cid}\nphased AF" if cname is None else f"{cname}\nphased AF") + seaborn.scatterplot( + x=df[f"clone{cid} BAF"].values, + y=df[f"clone{cid} RD"].values, + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=0.8, + legend=False, + ax=axes, + ) + axes.set_xlabel( + f"clone {cid}\nphased AF" if cname is None else f"{cname}\nphased AF" + ) axes.set_xlim([-0.02, baf_xlim]) axes.set_xticks(np.arange(0, baf_xlim, 0.1)) axes.set_ylabel(f"clone {cid}\nRDR" if cname is None else f"{cname}\nRDR") axes.set_yticks(np.arange(1, rdr_ylim, 1)) - axes.set_ylim([0,rdr_ylim]) + axes.set_ylim([0, rdr_ylim]) if add_legend: - a00 = plt.arrow(0,0, 0,0, - color='darkblue') - a10 = plt.arrow(0,0, 0,0, color='lightblue') - a11 = plt.arrow(0,0, 0,0, color='lightgray') - a20 = plt.arrow(0,0, 0,0, color='dimgray') - a21 = plt.arrow(0,0, 0,0, color='lightgoldenrodyellow') - a30 = plt.arrow(0,0, 0,0, color='gold') - a22 = plt.arrow(0,0, 0,0, color='navajowhite') - a31 = plt.arrow(0,0, 0,0, color='orange') - a40 = plt.arrow(0,0, 0,0, color='darkorange') - a32 = plt.arrow(0,0, 0,0, color='salmon') - a41 = plt.arrow(0,0, 0,0, color='red') - a50 = plt.arrow(0,0, 0,0, color='darkred') - a33 = plt.arrow(0,0, 0,0, color='plum') - a42 = plt.arrow(0,0, 0,0, color='orchid') - a51 = plt.arrow(0,0, 0,0, color='purple') - a60 = plt.arrow(0,0, 0,0, color='indigo') - axes.legend([a00, a10, a11, a20, a21, a30, a22, a31, a40, a32, a41, a50, a33, a42, a51, a60], \ - ['(0, 0)','(1, 0)','(1, 1)','(2, 0)', '(2, 1)','(3, 0)', '(2, 2)','(3, 1)','(4, 0)','(3, 2)', \ - '(4, 1)','(5, 0)', '(3, 3)','(4, 2)','(5, 1)','(6, 0)'], ncol=2, loc='upper left', bbox_to_anchor=(1,1)) - - - -def plot_clones_in_space(coords, assignment, sample_list=None, sample_ids=None, palette="Set2", labels=None, label_coords=None, label_sample_ids=None): + a00 = plt.arrow(0, 0, 0, 0, color="darkblue") + a10 = plt.arrow(0, 0, 0, 0, color="lightblue") + a11 = plt.arrow(0, 0, 0, 0, color="lightgray") + a20 = plt.arrow(0, 0, 0, 0, color="dimgray") + a21 = plt.arrow(0, 0, 0, 0, color="lightgoldenrodyellow") + a30 = plt.arrow(0, 0, 0, 0, color="gold") + a22 = plt.arrow(0, 0, 0, 0, color="navajowhite") + a31 = plt.arrow(0, 0, 0, 0, color="orange") + a40 = plt.arrow(0, 0, 0, 0, color="darkorange") + a32 = plt.arrow(0, 0, 0, 0, color="salmon") + a41 = plt.arrow(0, 0, 0, 0, color="red") + a50 = plt.arrow(0, 0, 0, 0, color="darkred") + a33 = plt.arrow(0, 0, 0, 0, color="plum") + a42 = plt.arrow(0, 0, 0, 0, 
color="orchid") + a51 = plt.arrow(0, 0, 0, 0, color="purple") + a60 = plt.arrow(0, 0, 0, 0, color="indigo") + axes.legend( + [ + a00, + a10, + a11, + a20, + a21, + a30, + a22, + a31, + a40, + a32, + a41, + a50, + a33, + a42, + a51, + a60, + ], + [ + "(0, 0)", + "(1, 0)", + "(1, 1)", + "(2, 0)", + "(2, 1)", + "(3, 0)", + "(2, 2)", + "(3, 1)", + "(4, 0)", + "(3, 2)", + "(4, 1)", + "(5, 0)", + "(3, 3)", + "(4, 2)", + "(5, 1)", + "(6, 0)", + ], + ncol=2, + loc="upper left", + bbox_to_anchor=(1, 1), + ) + + +def plot_clones_in_space( + coords, + assignment, + sample_list=None, + sample_ids=None, + palette="Set2", + labels=None, + label_coords=None, + label_sample_ids=None, +): if (sample_list is None) or (len(sample_list) == 1): - fig, axes = plt.subplots(1, 1, figsize=(5.5,4), dpi=200, facecolor="white") - seaborn.scatterplot(x=coords[:,0], y=-coords[:,1], color="lightgrey", alpha=0.5, linewidth=0, s=15, ax=axes) - seaborn.scatterplot(x=coords[~assignment.isnull(),0], y=-coords[~assignment.isnull(),1], \ - hue=assignment[~assignment.isnull()], palette=palette, linewidth=0, s=15, ax=axes) - h,l = axes.get_legend_handles_labels() - axes.legend(h, l, loc="upper left", bbox_to_anchor=(1,1)) + fig, axes = plt.subplots(1, 1, figsize=(5.5, 4), dpi=200, facecolor="white") + seaborn.scatterplot( + x=coords[:, 0], + y=-coords[:, 1], + color="lightgrey", + alpha=0.5, + linewidth=0, + s=15, + ax=axes, + ) + seaborn.scatterplot( + x=coords[~assignment.isnull(), 0], + y=-coords[~assignment.isnull(), 1], + hue=assignment[~assignment.isnull()], + palette=palette, + linewidth=0, + s=15, + ax=axes, + ) + h, l = axes.get_legend_handles_labels() + axes.legend(h, l, loc="upper left", bbox_to_anchor=(1, 1)) if not labels is None: assert len(labels) == len(label_coords) - for i,c in enumerate(labels): - axes.text(label_coords[i][0]-4, -label_coords[i][1], c) + for i, c in enumerate(labels): + axes.text(label_coords[i][0] - 4, -label_coords[i][1], c) else: unique_assignments = np.sort(np.unique(assignment[~assignment.isnull()].values)) - fig, axes = plt.subplots(1, len(sample_list), figsize=(5*len(sample_list)+0.5,4), dpi=200, facecolor="white") + fig, axes = plt.subplots( + 1, + len(sample_list), + figsize=(5 * len(sample_list) + 0.5, 4), + dpi=200, + facecolor="white", + ) for s, sname in enumerate(sample_list): indexes = np.where(sample_ids == s)[0] - seaborn.scatterplot(x=coords[indexes,0], y=-coords[indexes,1], color="lightgrey", alpha=0.5, linewidth=0, s=15, ax=axes[s]) + seaborn.scatterplot( + x=coords[indexes, 0], + y=-coords[indexes, 1], + color="lightgrey", + alpha=0.5, + linewidth=0, + s=15, + ax=axes[s], + ) if s + 1 != len(sample_list): - seaborn.scatterplot(x=coords[indexes,0][~assignment.iloc[indexes].isnull()], y=-coords[indexes,1][~assignment.iloc[indexes].isnull()], \ - hue=pd.Categorical(assignment.iloc[indexes][~assignment.iloc[indexes].isnull()], categories=unique_assignments, ordered=True), \ - palette=palette, linewidth=0, s=15, legend=False, ax=axes[s]) + seaborn.scatterplot( + x=coords[indexes, 0][~assignment.iloc[indexes].isnull()], + y=-coords[indexes, 1][~assignment.iloc[indexes].isnull()], + hue=pd.Categorical( + assignment.iloc[indexes][~assignment.iloc[indexes].isnull()], + categories=unique_assignments, + ordered=True, + ), + palette=palette, + linewidth=0, + s=15, + legend=False, + ax=axes[s], + ) else: - seaborn.scatterplot(x=coords[indexes,0][~assignment.iloc[indexes].isnull()], y=-coords[indexes,1][~assignment.iloc[indexes].isnull()], \ - 
hue=pd.Categorical(assignment.iloc[indexes][~assignment.iloc[indexes].isnull()], categories=unique_assignments, ordered=True), \ - palette=palette, linewidth=0, s=15, ax=axes[s]) - h,l = axes[s].get_legend_handles_labels() - axes[s].legend(h, l, loc="upper left", bbox_to_anchor=(1,1)) + seaborn.scatterplot( + x=coords[indexes, 0][~assignment.iloc[indexes].isnull()], + y=-coords[indexes, 1][~assignment.iloc[indexes].isnull()], + hue=pd.Categorical( + assignment.iloc[indexes][~assignment.iloc[indexes].isnull()], + categories=unique_assignments, + ordered=True, + ), + palette=palette, + linewidth=0, + s=15, + ax=axes[s], + ) + h, l = axes[s].get_legend_handles_labels() + axes[s].legend(h, l, loc="upper left", bbox_to_anchor=(1, 1)) if not labels is None: - assert len(labels) == len(label_coords) and len(labels) == len(label_sample_ids) - for i,c in enumerate(labels): + assert len(labels) == len(label_coords) and len(labels) == len( + label_sample_ids + ) + for i, c in enumerate(labels): s = label_sample_ids[i] - axes[s].text(label_coords[i][0]-4, -label_coords[i][1], c) + axes[s].text(label_coords[i][0] - 4, -label_coords[i][1], c) fig.tight_layout() return fig -def plot_individual_spots_in_space(coords, assignment, single_tumor_prop=None, sample_list=None, sample_ids=None, base_width=4, base_height=3, palette="Set2"): +def plot_individual_spots_in_space( + coords, + assignment, + single_tumor_prop=None, + sample_list=None, + sample_ids=None, + base_width=4, + base_height=3, + palette="Set2", +): # combine coordinates across samples shifted_coords = copy.copy(coords) if not (sample_ids is None): x_offset = 0 - for s,sname in enumerate(sample_list): + for s, sname in enumerate(sample_list): index = np.where(sample_ids == s)[0] - shifted_coords[index,0] = shifted_coords[index,0] + x_offset - x_offset += np.max(coords[index,0]) + 10 + shifted_coords[index, 0] = shifted_coords[index, 0] + x_offset + x_offset += np.max(coords[index, 0]) + 10 # number of clones and samples final_clone_ids = np.unique(assignment[~assignment.isnull()].values) @@ -1387,27 +3206,80 @@ def plot_individual_spots_in_space(coords, assignment, single_tumor_prop=None, s if not single_tumor_prop is None: copy_single_tumor_prop = copy.copy(single_tumor_prop) copy_single_tumor_prop[np.isnan(copy_single_tumor_prop)] = 0.5 - - fig, axes = plt.subplots(1, 1, figsize=(base_width*n_samples, base_height), dpi=200, facecolor="white") + + fig, axes = plt.subplots( + 1, 1, figsize=(base_width * n_samples, base_height), dpi=200, facecolor="white" + ) if "clone 0" in final_clone_ids: - colorlist = ['lightgrey'] + seaborn.color_palette("Set2", n_final_clones-1).as_hex() + colorlist = ["lightgrey"] + seaborn.color_palette( + "Set2", n_final_clones - 1 + ).as_hex() else: colorlist = seaborn.color_palette("Set2", n_final_clones).as_hex() - for c,cid in enumerate(final_clone_ids): - idx = np.where( (assignment.values==cid) )[0] + for c, cid in enumerate(final_clone_ids): + idx = np.where((assignment.values == cid))[0] if single_tumor_prop is None: - seaborn.scatterplot(x=shifted_coords[idx,0], y=-shifted_coords[idx,1], s=10, color=colorlist[c], linewidth=0, legend=None, ax=axes) + seaborn.scatterplot( + x=shifted_coords[idx, 0], + y=-shifted_coords[idx, 1], + s=10, + color=colorlist[c], + linewidth=0, + legend=None, + ax=axes, + ) else: # cmap - this_full_cmap = seaborn.color_palette(f"blend:lightgrey,{colorlist[c]}", as_cmap=True) - quantile_colors = this_full_cmap(np.array([0, np.min(copy_single_tumor_prop[idx]), 
np.max(copy_single_tumor_prop[idx]), 1])) - quantile_colors = [matplotlib.colors.rgb2hex(x) for x in quantile_colors[1:-1]] - this_cmap = seaborn.color_palette(f"blend:{quantile_colors[0]},{quantile_colors[-1]}", as_cmap=True) - seaborn.scatterplot(x=shifted_coords[idx,0], y=-shifted_coords[idx,1], s=10, hue=copy_single_tumor_prop[idx], palette=this_cmap, linewidth=0, legend=None, ax=axes) - - legend_elements = [Line2D([0], [0], marker='o', color="w", markerfacecolor=colorlist[c], label=cid, markersize=10) for c,cid in enumerate(final_clone_ids)] - axes.legend(legend_elements, final_clone_ids, handlelength=0.1, loc="upper left", bbox_to_anchor=(1,1)) + this_full_cmap = seaborn.color_palette( + f"blend:lightgrey,{colorlist[c]}", as_cmap=True + ) + quantile_colors = this_full_cmap( + np.array( + [ + 0, + np.min(copy_single_tumor_prop[idx]), + np.max(copy_single_tumor_prop[idx]), + 1, + ] + ) + ) + quantile_colors = [ + matplotlib.colors.rgb2hex(x) for x in quantile_colors[1:-1] + ] + this_cmap = seaborn.color_palette( + f"blend:{quantile_colors[0]},{quantile_colors[-1]}", as_cmap=True + ) + seaborn.scatterplot( + x=shifted_coords[idx, 0], + y=-shifted_coords[idx, 1], + s=10, + hue=copy_single_tumor_prop[idx], + palette=this_cmap, + linewidth=0, + legend=None, + ax=axes, + ) + + legend_elements = [ + Line2D( + [0], + [0], + marker="o", + color="w", + markerfacecolor=colorlist[c], + label=cid, + markersize=10, + ) + for c, cid in enumerate(final_clone_ids) + ] + axes.legend( + legend_elements, + final_clone_ids, + handlelength=0.1, + loc="upper left", + bbox_to_anchor=(1, 1), + ) axes.axis("off") fig.tight_layout() From 4901b562d1290ddbfa3f380922d1593d56a87270 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 11:04:09 -0400 Subject: [PATCH 005/125] add logging and tidy for calicost_main --- src/calicost/calicost_main.py | 383 +++++++++++++++++++++++----------- 1 file changed, 264 insertions(+), 119 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index d64c102..b6985f4 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -1,50 +1,91 @@ +import copy +import functools +import logging +import subprocess import sys +import datetime +from pathlib import Path + +import anndata import numpy as np -import scipy import pandas as pd -from pathlib import Path -from sklearn.metrics import adjusted_rand_score -from sklearn.cluster import KMeans import scanpy as sc -import anndata -import logging +import scipy +from sklearn.cluster import KMeans +from sklearn.metrics import adjusted_rand_score -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", -) -logger = logging.getLogger() -import copy -from pathlib import Path -import functools -import subprocess from calicost.arg_parse import * +from calicost.find_integer_copynumber import * from calicost.hmm_NB_BB_phaseswitch import * -from calicost.utils_distribution_fitting import * -from calicost.utils_hmrf import * from calicost.hmrf import * +from calicost.parse_input import * from calicost.phasing import * +from calicost.utils_distribution_fitting import * +from calicost.utils_hmrf import * from calicost.utils_IO import * -from calicost.find_integer_copynumber import * -from calicost.parse_input import * from calicost.utils_plotting import * +""" +from calicost.hmm_NB_BB_nophasing_v2 import hmm_nophasing_v2 +from calicost.arg_parse import run_parse_n_load, genesnp_to_bininfo +from 
calicost.find_integer_copynumber import (hill_climbing_integer_copynumber_fixdiploid, + hill_climbing_integer_copynumber_oneclone) +from calicost.hmm_NB_BB_phaseswitch import (combine_similar_states_across_clones, + similarity_components_rdrbaf_neymanpearson) +from calicost.hmrf import (aggr_hmrf_reassignment, aggr_hmrfmix_reassignment, + hmrf_concatenate_pipeline, hmrf_reassignment_posterior, + hmrfmix_concatenate_pipeline, hmrfmix_reassignment_posterior, + merge_by_minspots) +from calicost.phasing import pipeline_baum_welch +from calicost.utils_hmrf import (load_hmrf_last_iteration, rectangle_initialize_initial_clone, + rectangle_initialize_initial_clone_mix, reorder_results) +from calicost.utils_IO import bin_selection_basedon_normal, expand_df_cnv, filter_de_genes_tri +from calicost.utils_plotting import (argparse, merge_pseudobulk_by_index, + merge_pseudobulk_by_index_mix, plot_acn_from_df, + plot_acn_from_df_anotherscheme, plot_clones_in_space, + plot_individual_spots_in_space, plot_rdr_baf, plt, + read_configuration_file, read_joint_configuration_file) +""" + +logger = logging.getLogger("calicost") +logger.setLevel(logging.INFO) + +handler = logging.StreamHandler(sys.stdout) +fhandler = logging.FileHandler('calicost.log', mode="w") + +formatter = logging.Formatter("%(asctime)s - %(process)d - %(levelname)s - %(name)s:%(lineno)d - %(message)s") + +handler.setFormatter(formatter) +fhandler.setFormatter(formatter) + +logger.addHandler(handler) +logger.addHandler(fhandler) def main(configuration_file): + start = datetime.datetime.now() + try: config = read_configuration_file(configuration_file) except: config = read_joint_configuration_file(configuration_file) - print("Configurations:") + + logger.info("Configuration settings:") + for k in sorted(list(config.keys())): - print(f"\t{k} : {config[k]}") + logger.info(f"\t{k} : {config[k]}") + + # NB assuming the B-allele counts are calculated by the cellsnp-lite & Eagle pipeline. If assuming each spot contains + # a mixture of normal/tumor cells, the tumor proportion path should be provided in the config file. + # + # NB load data: + # - If the data is loaded for the first time: infer phasing using phase-switch HMM + # (hmm_NB_BB_phaseswitch.py & phasing.py) with output initial_phase.npz, matrices + # in /parsed_inputs + # + # - If the data is already loaded: load the matrices from parsed_inputs folder + + logger.info(f"Running parse and load.") - # Assuming the B counts are calculated by the cellsnp-lite and Eagle pipeline - # If assuming each spot contains a mixture of normal/tumor cells, the tumor proportion should be provided in the config file. - # load data - ## If the data is loaded for the first time: infer phasing using phase-switch HMM (hmm_NB_BB_phaseswitch.py and phasing.py) -> output initial_phase.npz, matrices in parsed_inputs folder - ## If the data is already loaded: load the matrices from parsed_inputs folder ( lengths, single_X, @@ -63,25 +104,32 @@ def main(configuration_file): exp_counts, ) = run_parse_n_load(config) - """ - Initial clustering spots using only BAF values. - """ - # setting transcript count to 0, and baseline so that emission probability calculation will ignore them. + logger.info(f"**** Estimating initial clones using BAF only ****") + + # NB setting transcript & baseline count to 0 so the emission probability will be ignored. 
copy_single_X_rdr = copy.copy(single_X[:, 0, :]) copy_single_base_nb_mean = copy.copy(single_base_nb_mean) + single_X[:, 0, :] = 0 single_base_nb_mean[:, :] = 0 - # run HMRF for r_hmrf_initialization in range( config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"] ): + logger.info(f"Processing HMRF random realization {num_hmrf_initialization_start:d}") + outdir = f"{config['output_dir']}/clone{config['n_clones']}_rectangle{r_hmrf_initialization}_w{config['spatial_weight']:.1f}" + outdir = Path(outdir) + if config["tumorprop_file"] is None: + logger.info(f"Initializing clones ignoring tumor proportion.") + initial_clone_index = rectangle_initialize_initial_clone( coords, config["n_clones"], random_state=r_hmrf_initialization ) else: + logger.info(f"Initializing clones based on tumor proportion: {config["tumorprop_file"]}") + initial_clone_index = rectangle_initialize_initial_clone_mix( coords, config["n_clones"], @@ -90,27 +138,30 @@ def main(configuration_file): random_state=r_hmrf_initialization, ) - # create directory - p = subprocess.Popen( - f"mkdir -p {outdir}", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True, - ) - out, err = p.communicate() - # save clone initialization into npz file - prefix = "allspots" - if not Path(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz").exists(): + # NB save clone initialization to npz file + file_name = Path(f"allspots_nstates{config['n_states']}_sp.npz") + file_path = outdir / file_name + + if not file_path.exists(): + logger.info(f"Creating output dir: {str(outdir)}") + + # TODO exist_ok + outdir.mkdir(parents=True, exist_ok=True) + initial_assignment = np.zeros(single_X.shape[2], dtype=int) + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"num_iterations": 0, "round-1_assignment": initial_assignment} - np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz", **allres) - # run HMRF + HMM - # store the results of each iteration of HMRF in a npz file outdir/prefix_nstates{config['n_states']}_sp.npz - # if a specific iteration is computed, hmrf will directly load the results from the file + np.savez(str(file_path), **{"num_iterations": 0, "round-1_assignment": initial_assignment}) + + # ---- HMRF + HMM ---- + # + # NB stores the results of each HMRF iteration in a .npz @ ./outdir/prefix_nstates{config['n_states']}_sp.npz + # if a specific iteration is already computed, hmrf will load the results directly from the file. 
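+        # NB the branch below chooses between the two HMRF pipelines: without a
+        #    tumorprop_file every spot is treated as belonging to a single clone
+        #    (hmrf_concatenate_pipeline); with one, hmrfmix_concatenate_pipeline also
+        #    receives single_tumor_prop and config["tumorprop_threshold"] so that each
+        #    spot is modelled as a tumor/normal mixture.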
if config["tumorprop_file"] is None: + logger.info("Solving HMRF concatenate pipeline without tumor proportion.") + hmrf_concatenate_pipeline( outdir, prefix, @@ -140,6 +191,8 @@ def main(configuration_file): spatial_weight=config["spatial_weight"], ) else: + logger.info("Solving HMRF concatenate pipeline with tumor proportion.") + hmrfmix_concatenate_pipeline( outdir, prefix, @@ -171,11 +224,13 @@ def main(configuration_file): tumorprop_threshold=config["tumorprop_threshold"], ) - # merge by thresholding BAF profile similarity + logger.info("Loading last HMRF iteration & merging clones based on BAF profile similarity threshold.") + + n_obs = single_X.shape[0] res = load_hmrf_last_iteration( f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz" ) - n_obs = single_X.shape[0] + if config["tumorprop_file"] is None: X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, @@ -200,8 +255,12 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], ) tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) - # merge "similar" clones from the initial number of clones. - # "similar" defined by Neyman Pearson statistics/ Likelihood ratios P(clone A counts | BAF parameters for clone A) / P(clone A counts | BAF parameters for clone B) + + logger.info("Merged pseudo-bulk based on clone index.") + + # NB ratio == P(clone A counts | BAF parameters for clone A) / P(clone A counts | BAF parameters for clone B) + logger.info("Merging similar initial clones based on Neyman-Pearson Likelihood ratio.") + merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson( X, base_nb_mean, @@ -213,8 +272,10 @@ def main(configuration_file): tumor_prop=tumor_prop, hmmclass=hmm_nophasing_v2, ) - print(f"BAF clone merging after comparing similarity: {merging_groups}") - # + + logger.info(f"BAF clone merging after comparing similarity: {merging_groups}") + logger.info(f"Merging similar initial clones based on min. spot threshold of {config["min_spots_per_clone"]}.") + if config["tumorprop_file"] is None: merging_groups, merged_res = merge_by_minspots( merged_res["new_assignment"], @@ -233,13 +294,20 @@ def main(configuration_file): single_tumor_prop=single_tumor_prop, threshold=config["tumorprop_threshold"], ) - print(f"BAF clone merging after requiring minimum # spots: {merging_groups}") + + logger.info(f"BAF clone merging after requiring minimum # spots: {merging_groups}") + n_baf_clones = len(merging_groups) + + file_path = f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz" + + logger.info(f"Writing merged initial clones to {file_path}") + np.savez( - f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", **merged_res + file_path, **merged_res ) - # load merged results + # NB load merged results n_obs = single_X.shape[0] merged_res = dict( np.load( @@ -247,12 +315,16 @@ def main(configuration_file): allow_pickle=True, ) ) + merged_baf_assignment = copy.copy(merged_res["new_assignment"]) n_baf_clones = len(np.unique(merged_baf_assignment)) + + # TODO comment. pred = np.argmax(merged_res["log_gamma"], axis=0) pred = np.array( [pred[(c * n_obs) : (c * n_obs + n_obs)] for c in range(n_baf_clones)] ) + merged_baf_profiles = np.array( [ np.where( @@ -264,18 +336,23 @@ def main(configuration_file): ] ) - """ - Refined clustering using BAF and RDR values. 
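The Neyman-Pearson merging above asks whether clone A's allele counts are explained about as well by clone B's BAF parameters as by its own. A toy illustration of that ratio, using a plain binomial in place of the beta-binomial emissions the pipeline actually fits:

    import numpy as np
    from scipy.stats import binom

    def np_log_ratio(b_counts, totals, baf_a, baf_b):
        # log P(clone A counts | BAF_A) - log P(clone A counts | BAF_B)
        return binom.logpmf(b_counts, totals, baf_a).sum() - binom.logpmf(b_counts, totals, baf_b).sum()

    rng = np.random.default_rng(0)
    totals = rng.integers(20, 60, size=500)
    b_counts = rng.binomial(totals, 0.32)                 # clone A generated with BAF ~ 0.32
    print(np_log_ratio(b_counts, totals, 0.32, 0.35))     # small ratio: candidates to merge
    print(np_log_ratio(b_counts, totals, 0.32, 0.50))     # large ratio: keep clones separate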
- """ - # adding RDR information + logger.info("Preparing refinement of initial, merged clones using BAF & RDR ****") + if not config["bafonly"]: - # Only used when assuming each spot is pure normal or tumor and if we don't know which spots are normal spots. - # select normal spots + # NB this block only used when assuming each spot is pure normal or pure tumor, + # and if we don't know which spots are normal spots. + # + # NB select normal spots + + logger.info("Identifying normal spots.") + if (config["normalidx_file"] is None) and ( config["tumorprop_file"] is None ): + # TODO hardcode EPS_BAF = 0.05 PERCENT_NORMAL = 40 + vec_stds = np.std(np.log1p(copy_single_X_rdr @ smooth_mat), axis=0) id_nearnormal_clone = np.argmin( np.sum( @@ -283,6 +360,7 @@ def main(configuration_file): axis=1, ) ) + while True: stdthreshold = np.percentile( vec_stds[merged_res["new_assignment"] == id_nearnormal_clone], @@ -298,18 +376,19 @@ def main(configuration_file): ): break PERCENT_NORMAL += 10 + pd.Series(barcodes[normal_candidate == True].index).to_csv( f"{outdir}/normal_candidate_barcodes.txt", header=False, index=False ) elif not config["normalidx_file"] is None: - # single_base_nb_mean has already been added in loading data step. + # NB single_base_nb_mean has been initialized in loading data step (run_parse_n_load - TBC). if not config["tumorprop_file"] is None: logger.warning( - f"Mixed sources of information for normal spots! Using {config['normalidx_file']}" + f"Mixed sources of information for normal spots! Using {config['normalidx_file']}" ) - # If tumor purity is provided, we can use it to select normal spots. + # NB if tumor purity is provided, we can use it to select normal spots. else: for prop_threshold in np.arange(0.05, 0.6, 0.05): normal_candidate = single_tumor_prop < prop_threshold @@ -318,8 +397,13 @@ def main(configuration_file): > single_X.shape[0] * 200 ): break - # To avoid allele-specific expression that are not relevant to CNA, filter bins where normal pseudobulk has large |BAF - 0.5| + + # NB avoid allele-specific expression that is not relevant to CNA by filtering bins where normal + # pseudobulk has large |BAF - 0.5| index_normal = np.where(normal_candidate)[0] + + logger.info("Filtering genomic bins for allele-specific expression based on normal spots.") + ( lengths, single_X, @@ -337,13 +421,18 @@ def main(configuration_file): index_normal, config["geneticmap_file"], ) + assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0] + df_bininfo = genesnp_to_bininfo(df_gene_snp) copy_single_X_rdr = copy.copy(single_X[:, 0, :]) - # If a gene has way higher expression than adjacent genes, its transcript count will dominate RDR values - # To avoid the domination, filter out high-UMI DE genes, which may bias RDR estimates - # Assume the remaining genes will still carry the CNA info. + # NB if a gene has much higher expression than adjacent genes, its transcripts will dominate RDR. + # To avoid this, filter out (high-UMI) DE genes, which may bias estimates, assuming the remaining + # genes will still carry the CNA info. 
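A compact sketch of the adaptive cutoff scan in the tumor-proportion branch above (all values here are made up): the proportion threshold is raised in 0.05 steps until the candidate normal spots contribute enough counts, roughly 200 per genomic bin.

    import numpy as np

    rng = np.random.default_rng(1)
    single_tumor_prop = rng.uniform(0.0, 1.0, size=1000)  # per-spot tumor proportion
    umi_per_spot = rng.poisson(3000, size=1000)           # per-spot transcript counts
    n_bins = 500
    for prop_threshold in np.arange(0.05, 0.6, 0.05):
        normal_candidate = single_tumor_prop < prop_threshold
        if umi_per_spot[normal_candidate].sum() > n_bins * 200:
            break
    print(prop_threshold, int(normal_candidate.sum()))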
+ + logger.info("Filtering genes with expression outliers.") + copy_single_X_rdr, _ = filter_de_genes_tri( exp_counts, df_bininfo, @@ -351,7 +440,10 @@ def main(configuration_file): sample_list=sample_list, sample_ids=sample_ids, ) + + # TODO hardcode MIN_NORMAL_COUNT_PERBIN = 20 + bidx_inconfident = np.where( np.sum(copy_single_X_rdr[:, (normal_candidate == True)], axis=1) < MIN_NORMAL_COUNT_PERBIN @@ -361,19 +453,21 @@ def main(configuration_file): ) rdr_normal[bidx_inconfident] = 0 rdr_normal = rdr_normal / np.sum(rdr_normal) - copy_single_X_rdr[bidx_inconfident, :] = ( - 0 # avoid ill-defined distributions if normal has 0 count in that bin. - ) + + # NB avoid ill-defined distributions if normal has 0 counts in bin. + copy_single_X_rdr[bidx_inconfident, :] = 0 + copy_single_base_nb_mean = rdr_normal.reshape(-1, 1) @ np.sum( copy_single_X_rdr, axis=0 ).reshape(1, -1) - # adding back RDR signal + # NB restore RDR data. single_X[:, 0, :] = copy_single_X_rdr single_base_nb_mean = copy_single_base_nb_mean n_obs = single_X.shape[0] - # save binned data + logger.info(f"Writing {outdir}/binned_data.npz") + np.savez( f"{outdir}/binned_data.npz", lengths=lengths, @@ -386,16 +480,19 @@ def main(configuration_file): ), ) - # run HMRF on each clone individually to further split BAF clone by RDR+BAF signal + logger.info(f"**** Refining initial, merged clones (N={n_baf_clones}) using BAF & RDR ****") + for bafc in range(n_baf_clones): + logger.info(f"Refining BAF clone {bafc}.") + prefix = f"clone{bafc}" idx_spots = np.where(merged_baf_assignment == bafc)[0] - if ( - np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20 - ): # put a minimum B allele read count on pseudobulk to split clones + + # NB put a minimum B allele read count on pseudobulk to split clones + if np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20: continue - # initialize clone - # write the initialization in a npz file outdir/prefix_nstates{config['n_states']}_smp.npz + + # NB initialize sub-clones within initial, merged BAF clone. 
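The diploid baseline built above (copy_single_base_nb_mean) is an outer product: the normal pseudobulk's per-bin fraction times each spot's total count, i.e. the counts expected if the spot carried no copy-number change. A tiny numeric sketch with toy values:

    import numpy as np

    rdr_normal = np.array([0.2, 0.5, 0.3])                # normal per-bin fractions, sum to 1
    spot_totals = np.array([1000.0, 2000.0])              # total RDR counts of two spots
    base_nb_mean = rdr_normal.reshape(-1, 1) @ spot_totals.reshape(1, -1)
    print(base_nb_mean)                                   # expected bin counts per spot
    print(base_nb_mean.sum(axis=0))                       # each column sums to the spot total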
if config["tumorprop_file"] is None: initial_clone_index = rectangle_initialize_initial_clone( coords[idx_spots], @@ -410,24 +507,27 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], random_state=r_hmrf_initialization, ) - if not Path( - f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz" - ).exists(): + + # NB write the initialization to .npz @ ./outdir/prefix_nstates{config['n_states']}_smp.npz + file_path = Path(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz") + + if not file_path.exists(): initial_assignment = np.zeros(len(idx_spots), dtype=int) + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c + allres = { "barcodes": barcodes[idx_spots], "num_iterations": 0, "round-1_assignment": initial_assignment, } - np.savez( - f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", - **allres, - ) + + np.savez(str(file_path), **allres) - # HMRF + HMM using RDR information + # HMRF + HMM with RDR copy_slice_sample_ids = copy.copy(sample_ids[idx_spots]) + if config["tumorprop_file"] is None: hmrf_concatenate_pipeline( outdir, @@ -489,9 +589,11 @@ def main(configuration_file): tumorprop_threshold=config["tumorprop_threshold"], ) - ##### combine results across clones ##### + logger.info(f"Combining results across clones.") + res_combine = {"prev_assignment": np.zeros(single_X.shape[2], dtype=int)} offset_clone = 0 + for bafc in range(n_baf_clones): prefix = f"clone{bafc}" allres = dict( @@ -515,7 +617,9 @@ def main(configuration_file): "prev_assignment": allres[f"round{r-1}_assignment"], "new_assignment": allres[f"round{r}_assignment"], } + idx_spots = np.where(barcodes.isin(allres["barcodes"]))[0] + if len(np.unique(res["new_assignment"])) == 1: n_merged_clones = 1 c = res["new_assignment"][0] @@ -559,6 +663,9 @@ def main(configuration_file): ) ) tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + + logger.info(f"Merging BAF+RDR clones based on Neyman-Pearson Likelihood ratio.") + merging_groups, merged_res = ( similarity_components_rdrbaf_neymanpearson( X, @@ -572,8 +679,9 @@ def main(configuration_file): hmmclass=hmm_nophasing_v2, ) ) - print(f"part {bafc} merging_groups: {merging_groups}") - # + + logger.info(f"BAF+RDR clone {bafc}: merging_groups={merging_groups}") + if config["tumorprop_file"] is None: merging_groups, merged_res = merge_by_minspots( merged_res["new_assignment"], @@ -594,12 +702,16 @@ def main(configuration_file): single_tumor_prop=single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"], ) - print( - f"part {bafc} merging after requiring minimum # spots: {merging_groups}" + + # TODO what is merging_groups + logger.info( + f"BAF+RDR clone {bafc} merging after requiring minimum # spots: {merging_groups}" ) - # compute posterior using the newly merged pseudobulk + + # NB compute posterior using the newly merged pseudobulk n_merged_clones = len(merging_groups) tmp = copy.copy(merged_res["new_assignment"]) + if config["tumorprop_file"] is None: X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X[:, :, idx_spots], @@ -625,7 +737,9 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], ) ) - # + + logger.info(f"Running Baum-Welch with refined & merged BAF+RDR clones.") + merged_res = pipeline_baum_welch( None, np.vstack( @@ -660,6 +774,9 @@ def main(configuration_file): sample_length=np.ones(X.shape[2], dtype=int) * X.shape[0], ) merged_res["new_assignment"] = copy.copy(tmp) + + logger.info("Combining similar states across clones.") + merged_res = 
combine_similar_states_across_clones( X, base_nb_mean, @@ -689,8 +806,8 @@ def main(configuration_file): for c in range(n_merged_clones) ] ).T - # - # add to res_combine + + # NB add to res_combine if len(res_combine) == 1: res_combine.update( { @@ -739,13 +856,17 @@ def main(configuration_file): merged_res["new_assignment"] + offset_clone ) offset_clone += n_merged_clones - # temp: make dispersions the same across all clones + + # NB temp: make dispersions the same across all clones res_combine["new_alphas"][:, :] = np.max(res_combine["new_alphas"]) res_combine["new_taus"][:, :] = np.min(res_combine["new_taus"]) - # end temp + # NB end temp + n_final_clones = len(np.unique(res_combine["prev_assignment"])) - # per-sample weights across clones + + # NB per-sample weights across clones log_persample_weights = np.zeros((n_final_clones, len(sample_list))) + for sidx in range(len(sample_list)): index = np.where(sample_ids == sidx)[0] this_persample_weight = np.bincount( @@ -757,8 +878,10 @@ def main(configuration_file): log_persample_weights[:, sidx] = log_persample_weights[ :, sidx ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) - # final re-assignment across all clones using estimated RDR + BAF - # The following step may not be needed because of other improvements. And it may cause mistakes in some cases. + + # NB final re-assignment across all clones using estimated RDR + BAF + # The following step may not be needed because of other improvements + # and it may cause errors in some cases. if config["tumorprop_file"] is None: if config["nodepotential"] == "max": pred = np.vstack( @@ -767,6 +890,9 @@ def main(configuration_file): for c in range(res_combine["log_gamma"].shape[2]) ] ).T + + logger.info("Aggregating HMRF reassignment with Viterbi.") + new_assignment, single_llf, total_llf, posterior = ( aggr_hmrf_reassignment( single_X, @@ -785,6 +911,8 @@ def main(configuration_file): ) ) elif config["nodepotential"] == "weighted_sum": + logger.info("Reassigning HMRF posterior.") + new_assignment, single_llf, total_llf, posterior = ( hmrf_reassignment_posterior( single_X, @@ -809,6 +937,9 @@ def main(configuration_file): for c in range(res_combine["log_gamma"].shape[2]) ] ).T + + logger.info("Aggregating HMRF mix reassignment with Viterbi.") + new_assignment, single_llf, total_llf, posterior = ( aggr_hmrfmix_reassignment( single_X, @@ -828,6 +959,8 @@ def main(configuration_file): ) ) elif config["nodepotential"] == "weighted_sum": + logger.info("Reassigning HMRF mix posterior.") + new_assignment, single_llf, total_llf, posterior = ( hmrfmix_reassignment_posterior( single_X, @@ -847,18 +980,25 @@ def main(configuration_file): ) res_combine["total_llf"] = total_llf res_combine["new_assignment"] = new_assignment - # re-order clones such that normal clones are always clone 0 + + # NB re-order clones such that normal clones are always clone 0 res_combine, posterior = reorder_results( res_combine, posterior, single_tumor_prop ) - # save results + + logger.info(f"Writing {outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz") + np.savez( f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", **res_combine, ) + + logger.info(f"Writing {outdir}/posterior_clone_probability.npy") + np.save(f"{outdir}/posterior_clone_probability.npy", posterior) - ##### infer integer copy ##### + logger.info("Inferring integer copy numbers") + res_combine = dict( np.load( f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", @@ -867,11 +1007,13 @@ def main(configuration_file): ) 
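A short sketch of the per-sample clone weights computed above: clone frequencies within one sample, normalised in log space with logsumexp. The 1e-6 floor below only keeps empty clones finite and is an assumption of this toy, not necessarily the pipeline's exact choice.

    import numpy as np
    import scipy.special

    assignments = np.array([0, 0, 1, 2, 2, 2])            # clone label of each spot in one sample
    n_clones = 3
    counts = np.bincount(assignments, minlength=n_clones)
    logw = np.log(np.maximum(counts / counts.sum(), 1e-6))
    logw -= scipy.special.logsumexp(logw)                 # exp(logw) now sums to 1
    print(np.exp(logw))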
final_clone_ids = np.sort(np.unique(res_combine["new_assignment"])) nonempty_clone_ids = copy.copy(final_clone_ids) + # add clone 0 as normal clone if it doesn't appear in final_clone_ids if not (0 in final_clone_ids): final_clone_ids = np.append(0, final_clone_ids) # chr position medfix = ["", "_diploid", "_triploid", "_tetraploid"] + for o, max_medploidy in enumerate([None, 2, 3, 4]): # A/B copy number per bin allele_specific_copy = [] @@ -955,10 +1097,10 @@ def main(configuration_file): finding_distate_failed = True continue - print( + logger.info( f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}" ) - # + allele_specific_copy.append( pd.DataFrame( best_integer_copies[ @@ -977,7 +1119,7 @@ def main(configuration_file): columns=np.arange(n_obs), ) ) - # + state_cnv.append( pd.DataFrame( res_combine["new_log_mu"][:, s].reshape(-1, 1), @@ -985,6 +1127,7 @@ def main(configuration_file): index=np.arange(config["n_states"]), ) ) + state_cnv.append( pd.DataFrame( res_combine["new_p_binom"][:, s].reshape(-1, 1), @@ -1006,7 +1149,7 @@ def main(configuration_file): index=np.arange(config["n_states"]), ) ) - # + # DEPRECATE # tmpdf = get_genelevel_cnv_oneclone(best_integer_copies[res_combine["pred_cnv"][:,s], 0], best_integer_copies[res_combine["pred_cnv"][:,s], 1], x_gene_list) # tmpdf.columns = [f"clone{s} A", f"clone{s} B"] bin_Acopy_mappers = { @@ -1042,13 +1185,15 @@ def main(configuration_file): ) if len(state_cnv) == 0: continue - # output gene-level copy number + + # NB output gene-level copy number df_genelevel_cnv.to_csv( f"{outdir}/cnv{medfix[o]}_genelevel.tsv", header=True, index=True, sep="\t", ) + # output segment-level copy number allele_specific_copy = pd.concat(allele_specific_copy) df_seglevel_cnv = pd.DataFrame( @@ -1091,22 +1236,21 @@ def main(configuration_file): # smooth_mat, adjacency_mat, res_combine["new_assignment"], sample_ids, base_nb_mean, log_persample_weights, config["spatial_weight"], hmmclass=hmm_nophasing_v2) # df_posterior.to_pickle(f"{outdir}/posterior{medfix[o]}.pkl") - ##### output clone label ##### df_clone_label = pd.DataFrame( {"clone_label": res_combine["new_assignment"]}, index=barcodes ) if not config["tumorprop_file"] is None: df_clone_label["tumor_proportion"] = single_tumor_prop + + logger.info(f"Writing clone labels to {outdir}/clone_labels.tsv") + df_clone_label.to_csv( f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t" ) - ##### plotting ##### - # make a directory for plots - p = subprocess.Popen(f"mkdir -p {outdir}/plots", shell=True) - out, err = p.communicate() + Path(f"{outdir}/plots").mkdir(parents=True, exist_ok=True) - # plot RDR and BAF + # NB plot RDR and BAF. cn_file = f"{outdir}/cnv_diploid_seglevel.tsv" fig = plot_rdr_baf( configuration_file, @@ -1125,7 +1269,8 @@ def main(configuration_file): transparent=True, bbox_inches="tight", ) - # plot allele-specific copy number + + # NB plot allele-specific copy number for o, max_medploidy in enumerate([None, 2, 3, 4]): cn_file = f"{outdir}/cnv{medfix[o]}_seglevel.tsv" if not Path(cn_file).exists(): @@ -1262,4 +1407,4 @@ def main(configuration_file): ) args = parser.parse_args() - main(args.configfile) + main(args.configfile) \ No newline at end of file From 00813b3104545df1123010ebfa26791fb1a6eb14 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 11:26:34 -0400 Subject: [PATCH 006/125] add logging info for hmrf_concatenate_pipeline. 
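Regarding the hill-climbing integer copy-number step in the preceding hunk: the idea is to pick integer allele copies whose implied RDR and BAF match the HMM state estimates. The brute-force enumeration below is only a toy stand-in for CalicoST's hill-climbing routines; the function name and the squared-error loss are made up for illustration.

    import itertools
    import numpy as np

    def best_integer_pair(rdr, baf, base_ploidy=2, max_copy=6):
        best, best_loss = None, np.inf
        for a, b in itertools.product(range(max_copy + 1), repeat=2):
            if a + b == 0:
                continue
            loss = (rdr - (a + b) / base_ploidy) ** 2 + (baf - b / (a + b)) ** 2
            if loss < best_loss:
                best, best_loss = (a, b), loss
        return best, best_loss

    print(best_integer_pair(rdr=1.5, baf=1.0 / 3))        # a 2+1 state fits these values exactly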
--- src/calicost/hmrf.py | 94 ++++++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 29 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index ccc8f0c..e61f13e 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1,32 +1,33 @@ +import copy import logging -from turtle import reset +import warnings +from pathlib import Path +# from turtle import reset + +# import networkx as nx import numpy as np import pandas as pd -from numba import njit -import scipy.special import scipy.sparse -from sklearn.mixture import GaussianMixture +import scipy.special +from numba import njit from sklearn.cluster import KMeans from sklearn.metrics import adjusted_rand_score, silhouette_score +from sklearn.mixture import GaussianMixture from sklearn.neighbors import kneighbors_graph -import networkx as nx +from statsmodels.tools.sm_exceptions import ValueWarning from tqdm import trange -import copy -from pathlib import Path + from calicost.hmm_NB_BB_phaseswitch import * from calicost.utils_distribution_fitting import * -from calicost.utils_IO import * from calicost.utils_hmrf import * +from calicost.utils_IO import * -import warnings -from statsmodels.tools.sm_exceptions import ValueWarning - +logger = logging.getLogger(__name__) ############################################################ # Pure clone ############################################################ - def hmrf_reassignment_posterior( single_X, single_base_nb_mean, @@ -813,9 +814,12 @@ def hmrf_concatenate_pipeline( unit_ysquared=3, spatial_weight=1.0, ): + logger.info("Solving hmrf_concatenate_pipeline.") + n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) - # checking input + + # NB checking input assert not (coords is None and adjacency_mat is None) if adjacency_mat is None: adjacency_mat = compute_adjacency_mat(coords, unit_xsquared, unit_ysquared) @@ -827,13 +831,18 @@ def hmrf_concatenate_pipeline( n_samples = len(unique_sample_ids) tmp_map_index = {unique_sample_ids[i]: i for i in range(len(unique_sample_ids))} sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) + log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) - # pseudobulk + + logger.info("Merging pseudobulk by clone index") + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index ) - # initialize HMM parameters by GMM + if (init_log_mu is None) or (init_p_binom is None): + logger.info("Initializing HMM parameters by GMM") + init_log_mu, init_p_binom = initialization_by_gmm( n_states, np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( @@ -846,7 +855,10 @@ def hmrf_concatenate_pipeline( in_log_space=False, only_minor=False, ) - # initialization parameters for HMM + else: + logger.info("Using provided HMM initialization parameters") + + # NB initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu last_p_binom = init_p_binom @@ -862,14 +874,23 @@ def hmrf_concatenate_pipeline( for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c - # HMM + logger.info(f"Computing HMM for {max_iter_outer} iterations.") + for r in range(max_iter_outer): - # assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. 
When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should contain two keys: "num_iterations" and f"round_-1_assignment" for clone initialization + # NB assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. + # When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should + # contain two keys: "num_iterations" and f"round_-1_assignment" for clone initialization + logger.info(f"Loading {outdir}/{prefix}_nstates{n_states}_{params}.npz") + allres = np.load( f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True ) allres = dict(allres) + + # TODO reads in existing iteration results if required. if allres["num_iterations"] > r: + logger.info(f"Loading pre-computed HMM results for iteration {r}.") + res = { "new_log_mu": allres[f"round{r}_new_log_mu"], "new_alphas": allres[f"round{r}_new_alphas"], @@ -885,6 +906,8 @@ def hmrf_concatenate_pipeline( "new_assignment": allres[f"round{r}_assignment"], } else: + logger.info(f"Computing HMM iteration {r}.") + res = pipeline_baum_welch( None, np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( @@ -912,8 +935,11 @@ def hmrf_concatenate_pipeline( tol=tol, ) pred = np.argmax(res["log_gamma"], axis=0) - # HMRF clone assignmment + + # NB HMRF clone assignmment if nodepotential == "max": + logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrf_reassignment_concatenate.") + new_assignment, single_llf, total_llf = ( aggr_hmrf_reassignment_concatenate( single_X, @@ -931,6 +957,8 @@ def hmrf_concatenate_pipeline( ) ) elif nodepotential == "weighted_sum": + logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrf_reassignment_posterior_concatenate.") + new_assignment, single_llf, total_llf = ( hmrf_reassignment_posterior_concatenate( single_X, @@ -947,8 +975,9 @@ def hmrf_concatenate_pipeline( ) ) else: - raise Exception("Unknown mode for nodepotential!") - # handle the case when one clone has zero spots + raise ValueError("Unknown mode for nodepotential!") + + # NB handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) @@ -959,10 +988,11 @@ def hmrf_concatenate_pipeline( ) res["log_gamma"] = res["log_gamma"][:, concat_idx] res["pred_cnv"] = res["pred_cnv"][concat_idx] - # + res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf + # append to allres for k, v in res.items(): if k == "prev_assignment": @@ -971,10 +1001,15 @@ def hmrf_concatenate_pipeline( allres[f"round{r}_assignment"] = v else: allres[f"round{r}_{k}"] = v + allres["num_iterations"] = r + 1 + + logger.info(f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz") + np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) - # - # regroup to pseudobulk + + logger.info(f"Regrouping to pseudobulk for iteration {r}.") + clone_index = [ np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) @@ -982,9 +1017,9 @@ def hmrf_concatenate_pipeline( X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, clone_index ) - # + if "mp" in params: - print( + logger.info( "outer iteration {}: difference between parameters = {}, {}".format( r, np.mean(np.abs(last_log_mu - res["new_log_mu"])), @@ -992,18 +1027,19 @@ def hmrf_concatenate_pipeline( ) ) elif "m" in params: - print( + logger.info( 
"outer iteration {}: difference between NB parameters = {}".format( r, np.mean(np.abs(last_log_mu - res["new_log_mu"])) ) ) elif "p" in params: - print( + logger.info( "outer iteration {}: difference between BetaBinom parameters = {}".format( r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) ) ) - print( + + logger.info( "outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) From 0ecfca4f9d3cc8cbaa5b59a27daafd6f7cd8cb45 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 11:37:33 -0400 Subject: [PATCH 007/125] add logging for hmrfmix_concatenate_pipeline --- src/calicost/calicost_main.py | 4 ++ src/calicost/hmrf.py | 73 +++++++++++++++++++++++++++-------- 2 files changed, 60 insertions(+), 17 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index b6985f4..10aee62 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -1395,6 +1395,10 @@ def main(configuration_file): bbox_inches="tight", ) + end = datetime.datetime.now() + runtime = end - start + + logging.info(f"Complete in {runtime} [seconds].") if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index e61f13e..6630068 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1862,9 +1862,12 @@ def hmrfmix_concatenate_pipeline( spatial_weight=1.0 / 6, tumorprop_threshold=0.5, ): + logger.info("Solving hmrfix_concatenate_pipeline.") + n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) - # spot adjacency matric + + # NB checking inputs assert not (coords is None and adjacency_mat is None) if adjacency_mat is None: adjacency_mat = compute_adjacency_mat(coords, unit_xsquared, unit_ysquared) @@ -1877,7 +1880,9 @@ def hmrfmix_concatenate_pipeline( tmp_map_index = {unique_sample_ids[i]: i for i in range(len(unique_sample_ids))} sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * (-np.log(n_clones)) - # pseudobulk + + logger.info("Merging pseudobulk by clone index") + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( single_X, single_base_nb_mean, @@ -1886,10 +1891,13 @@ def hmrfmix_concatenate_pipeline( single_tumor_prop, threshold=tumorprop_threshold, ) - # baseline proportion of UMI counts + + # NB baseline proportion of UMI counts lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) - # initialize HMM parameters by GMM + if (init_log_mu is None) or (init_p_binom is None): + logger.info("Initializing HMM parameters by GMM") + init_log_mu, init_p_binom = initialization_by_gmm( n_states, np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( @@ -1902,7 +1910,10 @@ def hmrfmix_concatenate_pipeline( in_log_space=False, only_minor=False, ) - # initialization parameters for HMM + else: + logger.info("Using provided HMM initialization parameters") + + # NB initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu last_p_binom = init_p_binom @@ -1918,14 +1929,26 @@ def hmrfmix_concatenate_pipeline( for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c - # HMM + logger.info(f"Computing HMM for {max_iter_outer} iterations.") + for r in range(max_iter_outer): - # assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. 
When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should contain two keys: "num_iterations" and f"round_-1_assignment" for clone initialization + """ + NB assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. + When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should + contain two keys: "num_iterations" and f"round_-1_assignment" for clone + initialization + """ + logger.info(f"Loading {outdir}/{prefix}_nstates{n_states}_{params}.npz") + allres = np.load( f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True ) allres = dict(allres) + + # TODO reads in existing iteration results if required. if allres["num_iterations"] > r: + logger.info(f"Loading pre-computed HMM results for iteration {r}.") + res = { "new_log_mu": allres[f"round{r}_new_log_mu"], "new_alphas": allres[f"round{r}_new_alphas"], @@ -1943,8 +1966,12 @@ def hmrfmix_concatenate_pipeline( else: sample_length = np.ones(X.shape[2], dtype=int) * X.shape[0] remain_kwargs = {"sample_length": sample_length, "lambd": lambd} + if f"round{r-1}_log_gamma" in allres: remain_kwargs["log_gamma"] = allres[f"round{r-1}_log_gamma"] + + logger.info(f"Computing HMM iteration {r}.") + res = pipeline_baum_welch( None, np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( @@ -1973,9 +2000,13 @@ def hmrfmix_concatenate_pipeline( tol=tol, **remain_kwargs, ) + pred = np.argmax(res["log_gamma"], axis=0) - # clone assignmment + + # NB HMRF clone assignmment if nodepotential == "max": + logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfix_reassignment_concatenate.") + new_assignment, single_llf, total_llf = ( aggr_hmrfmix_reassignment_concatenate( single_X, @@ -1994,6 +2025,8 @@ def hmrfmix_concatenate_pipeline( ) ) elif nodepotential == "weighted_sum": + logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfix_reassignment_posterior_concatenate.") + new_assignment, single_llf, total_llf = ( hmrfmix_reassignment_posterior_concatenate( single_X, @@ -2011,8 +2044,9 @@ def hmrfmix_concatenate_pipeline( ) ) else: - raise Exception("Unknown mode for nodepotential!") - # handle the case when one clone has zero spots + raise ValueError("Unknown mode for nodepotential!") + + # NB handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) @@ -2036,9 +2070,13 @@ def hmrfmix_concatenate_pipeline( else: allres[f"round{r}_{k}"] = v allres["num_iterations"] = r + 1 + + logger.info(f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz") + np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) - # - # regroup to pseudobulk + + logger.info(f"Regrouping to pseudobulk for iteration {r}.") + clone_index = [ np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) @@ -2051,9 +2089,9 @@ def hmrfmix_concatenate_pipeline( single_tumor_prop, threshold=tumorprop_threshold, ) - # + if "mp" in params: - print( + logger.info( "outer iteration {}: difference between parameters = {}, {}".format( r, np.mean(np.abs(last_log_mu - res["new_log_mu"])), @@ -2061,18 +2099,19 @@ def hmrfmix_concatenate_pipeline( ) ) elif "m" in params: - print( + logger.info( "outer iteration {}: difference between NB parameters = {}".format( r, np.mean(np.abs(last_log_mu - res["new_log_mu"])) ) ) elif "p" in params: - print( + logger.info( "outer iteration {}: difference between 
BetaBinom parameters = {}".format( r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) ) ) - print( + + logger.info( "outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) From 7eb9ba1fd5bb2ad174170601620f533d38fcd26c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 11:50:33 -0400 Subject: [PATCH 008/125] add logging for hmm_NB_BB_nophasing --- src/calicost/hmm_NB_BB_nophasing.py | 91 ++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 29 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index 2a262aa..2450d2f 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -1,25 +1,26 @@ +import copy import logging + +import networkx as nx import numpy as np -from numba import njit -from scipy.stats import norm, multivariate_normal, poisson import scipy.special -from scipy.optimize import minimize -from scipy.optimize import Bounds -from sklearn.mixture import GaussianMixture -from tqdm import trange import statsmodels.api as sm +from numba import njit +from scipy.optimize import Bounds, minimize +from scipy.stats import multivariate_normal, norm, poisson +from sklearn.mixture import GaussianMixture from statsmodels.base.model import GenericLikelihoodModel -import copy +from tqdm import trange + from calicost.utils_distribution_fitting import * from calicost.utils_hmm import * -import networkx as nx +logger = logging.getLogger(__name__) ############################################################ # whole inference ############################################################ - class hmm_nophasing(object): def __init__(self, params="stmp", t=1 - 1e-4): """ @@ -34,7 +35,6 @@ def __init__(self, params="stmp", t=1 - 1e-4): self.params = params self.t = t - # @staticmethod def compute_emission_probability_nb_betabinom( X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus @@ -68,16 +68,20 @@ def compute_emission_probability_nb_betabinom( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ + logger.info("Computing emission probability for negative binomial & beta binomial.") + n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] n_states = log_mu.shape[0] - # initialize log_emission + + # NB initialize log_emission log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) log_emission_baf = np.zeros((n_states, n_obs, n_spots)) + for i in np.arange(n_states): for s in np.arange(n_spots): - # expression from NB distribution + # NB expression from NB distribution. Mask is used explicity to separate BAF and BAF+RDR. 
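A tiny standalone sketch of the masking convention mentioned in the comment above (toy arrays, not the real data layout): bins whose baseline is zero are simply skipped, which is why zeroing single_base_nb_mean in calicost_main turns the RDR term off while leaving the BAF term untouched.

    import numpy as np

    base_nb_mean = np.array([5.0, 0.0, 8.0])              # bin 1 deliberately masked out
    log_emission_rdr = np.zeros(3)
    idx_nonzero = np.where(base_nb_mean > 0)[0]
    log_emission_rdr[idx_nonzero] = -1.23                  # stand-in for the NB log-pmf of kept bins
    print(log_emission_rdr)                                # masked bin contributes log-probability 0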
idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: nb_mean = base_nb_mean[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) @@ -86,7 +90,7 @@ def compute_emission_probability_nb_betabinom( log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( X[idx_nonzero_rdr, 0, s], n, p ) - # AF from BetaBinom distribution + # NB AF from BetaBinom distribution idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: log_emission_baf[i, idx_nonzero_baf, s] = ( @@ -97,9 +101,11 @@ def compute_emission_probability_nb_betabinom( (1 - p_binom[i, s]) * taus[i, s], ) ) + + logger.info("Computed emission probability for negative binomial & beta binomial.") + return log_emission_rdr, log_emission_baf - # @staticmethod def compute_emission_probability_nb_betabinom_mix( X, @@ -141,10 +147,13 @@ def compute_emission_probability_nb_betabinom_mix( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ + logger.info("Computing emission probability for *mixed* negative binomial & beta binommial.") + n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] n_states = log_mu.shape[0] + # initialize log_emission log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) log_emission_baf = np.zeros((n_states, n_obs, n_spots)) @@ -183,9 +192,11 @@ def compute_emission_probability_nb_betabinom_mix( mix_p_A * taus[i, s], mix_p_B * taus[i, s], ) + + logger.info("Computed emission probability for *mixed* negative binomial & beta binommial.") + return log_emission_rdr, log_emission_baf - # @staticmethod @njit def forward_lattice( @@ -230,7 +241,6 @@ def forward_lattice( cumlen += le return log_alpha - # @staticmethod @njit def backward_lattice( @@ -276,7 +286,6 @@ def backward_lattice( cumlen += le return log_beta - # def run_baum_welch_nb_bb( self, X, @@ -314,7 +323,9 @@ def run_baum_welch_nb_bb( n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 - # initialize NB logmean shift and BetaBinom prob + + logger.info("Initialize NB logmean shift, BetaBinom prob and dispersion param inverse.") + log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None @@ -325,29 +336,37 @@ def run_baum_welch_nb_bb( if init_p_binom is None else init_p_binom ) - # initialize (inverse of) dispersion param in NB and BetaBinom + + # NB initialize (inverse of) dispersion param in NB and BetaBinom alphas = ( 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus - # initialize start probability and emission probability + + # NB initialize start probability and emission prob. 
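One numeric check of the negative-binomial parameterisation used in the emission code above, assuming the usual variance form mean + alpha * mean**2 for dispersion alpha; the intermediate nb_std, n, p names mirror that convention rather than the exact elided lines.

    import numpy as np
    import scipy.stats

    nb_mean, alpha = 10.0, 0.1
    nb_std = np.sqrt(nb_mean + alpha * nb_mean ** 2)
    n = nb_mean ** 2 / (nb_std ** 2 - nb_mean)             # equals 1 / alpha
    p = nb_mean / nb_std ** 2                               # equals n / (n + nb_mean)
    dist = scipy.stats.nbinom(n, p)
    print(n, p, dist.mean(), dist.var())                    # mean back to 10, variance to 20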
log_startprob = np.log(np.ones(n_states) / n_states) + if n_states > 1: transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: log_transmat = np.zeros((1, 1)) - # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) + + # NB trick to speed up BetaBinom optimization: taking only unique values of + # (B allele count, total SNP covering read count) + logger.info("Constructing unique values matrix for NB and BB.") + unique_values_nb, mapping_matrices_nb = construct_unique_matrix( X[:, 0, :], base_nb_mean ) unique_values_bb, mapping_matrices_bb = construct_unique_matrix( X[:, 1, :], total_bb_RD ) - # EM algorithm - for r in trange(max_iter): - # E step + + for r in trange(max_iter, desc="EM algorithm"): + logger.info(f"Calculating E-step for iteration {r} of {max_iter}.") + if tumor_prop is None: log_emission_rdr, log_emission_baf = ( hmm_nophasing.compute_emission_probability_nb_betabinom( @@ -369,6 +388,7 @@ def run_baum_welch_nb_bb( ) ) log_emission = log_emission_rdr + log_emission_baf + log_alpha = hmm_nophasing.forward_lattice( lengths, log_transmat, @@ -376,6 +396,7 @@ def run_baum_welch_nb_bb( log_emission, log_sitewise_transmat, ) + log_beta = hmm_nophasing.backward_lattice( lengths, log_transmat, @@ -383,20 +404,26 @@ def run_baum_welch_nb_bb( log_emission, log_sitewise_transmat, ) + log_gamma = compute_posterior_obs(log_alpha, log_beta) + log_xi = compute_posterior_transition_nophasing( log_alpha, log_beta, log_transmat, log_emission ) - # M step + + logger.info(f"Calculating M-step for iteration {r} of {max_iter}.") + if "s" in self.params: new_log_startprob = update_startprob_nophasing(lengths, log_gamma) new_log_startprob = new_log_startprob.flatten() else: new_log_startprob = log_startprob + if "t" in self.params: new_log_transmat = update_transition_nophasing(log_xi, is_diag=is_diag) else: new_log_transmat = log_transmat + if "m" in self.params: if tumor_prop is None: new_log_mu, new_alphas = ( @@ -426,6 +453,7 @@ def run_baum_welch_nb_bb( else: new_log_mu = log_mu new_alphas = alphas + if "p" in self.params: if tumor_prop is None: new_p_binom, new_taus = ( @@ -455,26 +483,31 @@ def run_baum_welch_nb_bb( else: new_p_binom = p_binom new_taus = taus - # check convergence - print( + + logger.info( + "EM convergence metrics", np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), np.mean(np.abs(new_log_mu - log_mu)), np.mean(np.abs(new_p_binom - p_binom)), ) - print(np.hstack([new_log_mu, new_p_binom])) + + logger.info(np.hstack([new_log_mu, new_p_binom])) + if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol ): break + log_startprob = new_log_startprob log_transmat = new_log_transmat log_mu = new_log_mu alphas = new_alphas p_binom = new_p_binom taus = new_taus + return ( new_log_mu, new_alphas, @@ -483,4 +516,4 @@ def run_baum_welch_nb_bb( new_log_startprob, new_log_transmat, log_gamma, - ) + ) \ No newline at end of file From 3b10e3f01f95f04c89ff199e1e78ecad2ab192e9 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 11:58:11 -0400 Subject: [PATCH 009/125] adding logging for update params in utils_hmm --- src/calicost/utils_hmm.py | 55 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 2a22f4d..9145ae5 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -1,11 +1,15 @@ -import numpy as np -from numba import njit import copy +import logging + +import numpy as np import scipy.special -from tqdm import trange +from numba import njit from sklearn.mixture import GaussianMixture +from tqdm import trange + from calicost.utils_distribution_fitting import * +logger = logging.getLogger(__name__) @njit def np_max_ax_squeeze(arr, axis=0): @@ -462,9 +466,12 @@ def update_emission_params_nb_sitewise_uniqvalues( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ + logger.info("Computing emission params for Negative Binomial (sitewise, unique).") + n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) + # initialization new_log_mu = ( copy.copy(start_log_mu) @@ -472,6 +479,7 @@ def update_emission_params_nb_sitewise_uniqvalues( else np.zeros((n_states, n_spots)) ) new_alphas = copy.copy(alphas) + # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) @@ -617,6 +625,9 @@ def update_emission_params_nb_sitewise_uniqvalues( new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr + + logger.info("Computed emission params for Negative Binomial (sitewise, unique).") + return new_log_mu, new_alphas @@ -645,6 +656,8 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ + logger.info("Computing emission params for Negative Binomial Mix (sitewise, unique).") + n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) @@ -822,6 +835,9 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr + + logger.info("Computed emission params for Negative Binomial Mix (sitewise, unique).") + return new_log_mu, new_alphas @@ -850,6 +866,8 @@ def update_emission_params_bb_sitewise_uniqvalues( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ + logger.info("Computing emission params for Beta Binomial (sitewise, unique).") + n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) @@ -1042,6 +1060,9 @@ def update_emission_params_bb_sitewise_uniqvalues( new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob + + logger.info("Computed emission params for Beta Binomial (sitewise, unique).") + return new_p_binom, new_taus @@ -1071,6 +1092,8 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. 
""" + logger.info("Computing emission params for Beta Binomial Mix (sitewise, unique).") + n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) @@ -1293,6 +1316,9 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob + + logger.info("Computed emission params for Beta Binomial Mix (sitewise, unique).") + return new_p_binom, new_taus @@ -1381,6 +1407,9 @@ def update_emission_params_nb_nophasing_uniqvalues( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ + + logger.info("Computing emission params for Negative Binomial (no phasing, unique).") + n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) @@ -1532,6 +1561,9 @@ def update_emission_params_nb_nophasing_uniqvalues( new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr + + logger.info("Computed emission params for Negative Binomial (no phasing, unique).") + return new_log_mu, new_alphas @@ -1559,6 +1591,8 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ + logger.info("Computing emission params for Negative Binomial Mix (no phasing, unique).") + n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) @@ -1733,6 +1767,9 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr + + logger.info("Computed emission params for Negative Binomial Mix (no phasing, unique).") + return new_log_mu, new_alphas @@ -1760,6 +1797,8 @@ def update_emission_params_bb_nophasing_uniqvalues( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ + logger.info("Computing emission params for Beta Binomial (no phasing, unique).") + n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) @@ -1912,6 +1951,9 @@ def update_emission_params_bb_nophasing_uniqvalues( new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob + + logger.info("Computed emission params for Beta Binomial (no phasing, unique).") + return new_p_binom, new_taus @@ -1940,6 +1982,8 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ + logger.info("Computing emission params for Beta Binomial Mix (no phasing, unique).") + n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) @@ -2121,4 +2165,7 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob - return new_p_binom, new_taus + + logger.info("Computed emission params for Beta Binomial Mix (no phasing, unique).") + + return new_p_binom, new_taus \ No newline at end of file From a5ddc3992c37dedabc3fc2dc90d4f329e2aeef73 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 12:09:12 -0400 Subject: [PATCH 010/125] add logging of hmm_NB_BB_nophasing_v2 --- src/calicost/hmm_NB_BB_nophasing.py | 4 +- src/calicost/hmm_NB_BB_nophasing_v2.py | 75 +++++++++++++++++--------- 2 files changed, 53 insertions(+), 26 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index 2450d2f..32d94f3 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -324,7 +324,7 @@ def run_baum_welch_nb_bb( n_spots = X.shape[2] assert n_comp == 2 - logger.info("Initialize NB logmean shift, BetaBinom prob and dispersion param inverse.") + logger.info("Initialize Baum-Welch NB logmean shift, BetaBinom prob and dispersion param inverse.") log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T @@ -508,6 +508,8 @@ def run_baum_welch_nb_bb( p_binom = new_p_binom taus = new_taus + logger.info("Computed Baum-Welch (v2).") + return ( new_log_mu, new_alphas, diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 2563834..a4408f6 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -1,18 +1,21 @@ +import copy import logging + +import networkx as nx import numpy as np -from numba import njit -from scipy.stats import norm, multivariate_normal, poisson import scipy.special -from scipy.optimize import minimize -from scipy.optimize import Bounds -from sklearn.mixture import GaussianMixture -from tqdm import trange import statsmodels.api as sm +from numba import njit +from scipy.optimize import Bounds, minimize +from scipy.stats import multivariate_normal, norm, poisson +from sklearn.mixture import GaussianMixture from statsmodels.base.model import GenericLikelihoodModel -import copy +from tqdm import trange + from calicost.utils_distribution_fitting import * from calicost.utils_hmm import * -import networkx as nx + +logger = logging.getLogger(__name__) """ Joint NB-BB HMM that accounts for tumor/normal genome proportions. Tumor genome proportion is weighted by mu in BB distribution. @@ -37,7 +40,6 @@ def __init__(self, params="stmp", t=1 - 1e-4): self.params = params self.t = t - # @staticmethod def compute_emission_probability_nb_betabinom( X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus @@ -71,6 +73,8 @@ def compute_emission_probability_nb_betabinom( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ + logger.info("Computing emission probability for negative binomial & beta binomial.") + n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] @@ -80,7 +84,7 @@ def compute_emission_probability_nb_betabinom( log_emission_baf = np.zeros((n_states, n_obs, n_spots)) for i in np.arange(n_states): for s in np.arange(n_spots): - # expression from NB distribution + # NB expression from NB distribution. Mask is used explicity to separate BAF and BAF+RDR. 
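The module docstring above states that the tumor proportion weights the beta-binomial mean; the sketch below assumes that weighting is a simple linear mixture between the clone BAF and the balanced value 0.5 (an assumption of this illustration), which shows how partial tumor content dilutes allelic imbalance toward 0.5.

    p_clone = 0.2                                           # clone BAF under some CNA state
    for tumor_prop in (1.0, 0.6, 0.2):
        mix_p = tumor_prop * p_clone + (1.0 - tumor_prop) * 0.5
        print(tumor_prop, round(mix_p, 3))                  # 0.2, 0.32, 0.44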
idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: nb_mean = base_nb_mean[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) @@ -89,7 +93,7 @@ def compute_emission_probability_nb_betabinom( log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( X[idx_nonzero_rdr, 0, s], n, p ) - # AF from BetaBinom distribution + # NB AF from BetaBinom distribution idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: log_emission_baf[i, idx_nonzero_baf, s] = ( @@ -100,9 +104,11 @@ def compute_emission_probability_nb_betabinom( (1 - p_binom[i, s]) * taus[i, s], ) ) + + logger.info("Computed emission probability for negative binomial & beta binomial.") + return log_emission_rdr, log_emission_baf - # @staticmethod def compute_emission_probability_nb_betabinom_mix( X, @@ -144,6 +150,8 @@ def compute_emission_probability_nb_betabinom_mix( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ + logger.info("Computing emission probability for *mixed* negative binomial & beta binommial.") + n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] @@ -202,9 +210,11 @@ def compute_emission_probability_nb_betabinom_mix( mix_p_A * taus[i, s], mix_p_B * taus[i, s], ) + + logger.info("Computed emission probability for *mixed* negative binomial & beta binommial.") + return log_emission_rdr, log_emission_baf - # @staticmethod @njit def forward_lattice( @@ -249,7 +259,6 @@ def forward_lattice( cumlen += le return log_alpha - # @staticmethod @njit def backward_lattice( @@ -295,7 +304,6 @@ def backward_lattice( cumlen += le return log_beta - # def run_baum_welch_nb_bb( self, X, @@ -332,7 +340,9 @@ def run_baum_welch_nb_bb( n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 - # initialize NB logmean shift and BetaBinom prob + + logger.info("Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse.") + log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None @@ -343,12 +353,13 @@ def run_baum_welch_nb_bb( if init_p_binom is None else init_p_binom ) - # initialize (inverse of) dispersion param in NB and BetaBinom + # NB initialize (inverse of) dispersion param in NB and BetaBinom alphas = ( 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus - # initialize start probability and emission probability + + # NB initialize start probability and emission probability log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) @@ -358,16 +369,19 @@ def run_baum_welch_nb_bb( log_transmat = np.zeros((1, 1)) # initialize log_gamma log_gamma = kwargs["log_gamma"] if "log_gamma" in kwargs else None - # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) + + # NB a trick to speed up BetaBinom optimization: taking only unique + # values of (B allele count, total SNP covering read count) unique_values_nb, mapping_matrices_nb = construct_unique_matrix( X[:, 0, :], base_nb_mean ) unique_values_bb, mapping_matrices_bb = construct_unique_matrix( X[:, 1, :], total_bb_RD ) - # EM algorithm - for r in trange(max_iter): - # E step + + for r in trange(max_iter, desc="EM algorithm"): + logger.info(f"Calculating E-step 
(v2) for iteration {r} of {max_iter}.") + if tumor_prop is None: log_emission_rdr, log_emission_baf = ( hmm_nophasing_v2.compute_emission_probability_nb_betabinom( @@ -428,6 +442,7 @@ def run_baum_welch_nb_bb( ) ) log_emission = log_emission_rdr + log_emission_baf + log_alpha = hmm_nophasing_v2.forward_lattice( lengths, log_transmat, @@ -435,6 +450,7 @@ def run_baum_welch_nb_bb( log_emission, log_sitewise_transmat, ) + log_beta = hmm_nophasing_v2.backward_lattice( lengths, log_transmat, @@ -442,11 +458,15 @@ def run_baum_welch_nb_bb( log_emission, log_sitewise_transmat, ) + log_gamma = compute_posterior_obs(log_alpha, log_beta) + log_xi = compute_posterior_transition_nophasing( log_alpha, log_beta, log_transmat, log_emission ) - # M step + + logger.info(f"Calculating M-step (v2) for iteration {r} of {max_iter}.") + if "s" in self.params: new_log_startprob = update_startprob_nophasing(lengths, log_gamma) new_log_startprob = new_log_startprob.flatten() @@ -545,14 +565,16 @@ def run_baum_welch_nb_bb( else: new_p_binom = p_binom new_taus = taus + # check convergence - print( + logger.info( + "EM convergence metrics (v2)", np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), np.mean(np.abs(new_log_mu - log_mu)), np.mean(np.abs(new_p_binom - p_binom)), ) - print(np.hstack([new_log_mu, new_p_binom])) + logger.info(np.hstack([new_log_mu, new_p_binom])) if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol @@ -565,6 +587,9 @@ def run_baum_welch_nb_bb( alphas = new_alphas p_binom = new_p_binom taus = new_taus + + logger.info("Computed Baum-Welch (v2).") + return ( new_log_mu, new_alphas, From 4d23df4d31984fce246d4d9a4169629d2130cf8d Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 12:15:31 -0400 Subject: [PATCH 011/125] add logging to hmm_NB_BB_phaseswitch --- src/calicost/hmm_NB_BB_phaseswitch.py | 76 ++++++++++++++++++--------- 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 0d26b70..07995ed 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -1,21 +1,23 @@ +import copy import logging + +import networkx as nx import numpy as np -from numba import njit -from scipy.stats import norm, multivariate_normal, poisson import scipy.special -from scipy.optimize import minimize -from scipy.optimize import Bounds -from sklearn.mixture import GaussianMixture -from tqdm import trange import statsmodels.api as sm +from numba import njit +from scipy.optimize import Bounds, minimize +from scipy.stats import multivariate_normal, norm, poisson +from sklearn.mixture import GaussianMixture from statsmodels.base.model import GenericLikelihoodModel -import copy -from calicost.utils_hmm import * -from calicost.utils_distribution_fitting import * +from tqdm import trange + from calicost.hmm_NB_BB_nophasing import * from calicost.hmm_NB_BB_nophasing_v2 import * -import networkx as nx +from calicost.utils_distribution_fitting import * +from calicost.utils_hmm import * +logger = logging.getLogger(__name__) ############################################################ # whole inference @@ -36,7 +38,6 @@ def __init__(self, params="stmp", t=1 - 1e-4): self.params = params self.t = t - # @staticmethod def compute_emission_probability_nb_betabinom( X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus @@ -70,6 +71,8 @@ def compute_emission_probability_nb_betabinom( log_emission : array, shape (2*n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ + logger.info("Computing emission probability for negative binomial & beta binomial (sitewise).") + n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] @@ -110,9 +113,11 @@ def compute_emission_probability_nb_betabinom( p_binom[i, s] * taus[i, s], ) ) + + logger.info("Computed emission probability for negative binomial & beta binomial (sitewise).") + return log_emission_rdr, log_emission_baf - # @staticmethod def compute_emission_probability_nb_betabinom_mix( X, @@ -154,6 +159,8 @@ def compute_emission_probability_nb_betabinom_mix( log_emission : array, shape (2*n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. 
""" + logger.info("Computing emission probability for *mixed* negative binomial & beta binomial (sitewise).") + n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] @@ -204,9 +211,11 @@ def compute_emission_probability_nb_betabinom_mix( mix_p_B * taus[i, s], mix_p_A * taus[i, s], ) + + logger.info("Computed emission probability for *mixed* negative binomial & beta binomial (sitewise).") + return log_emission_rdr, log_emission_baf - # @staticmethod @njit def forward_lattice( @@ -274,7 +283,6 @@ def forward_lattice( cumlen += le return log_alpha - # @staticmethod @njit def backward_lattice( @@ -338,7 +346,6 @@ def backward_lattice( cumlen += le return log_beta - # def run_baum_welch_nb_bb( self, X, @@ -374,7 +381,9 @@ def run_baum_welch_nb_bb( n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 - # initialize NB logmean shift and BetaBinom prob + + logger.info("Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse (sitewise).") + log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None @@ -385,12 +394,14 @@ def run_baum_welch_nb_bb( if init_p_binom is None else init_p_binom ) - # initialize (inverse of) dispersion param in NB and BetaBinom + + # NB initialize (inverse of) dispersion param in NB and BetaBinom alphas = ( 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus - # initialize start probability and emission probability + + # NB initialize start probability and emission probability log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) @@ -398,16 +409,19 @@ def run_baum_welch_nb_bb( log_transmat = np.log(transmat) else: log_transmat = np.zeros((1, 1)) - # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) + + # NB a trick to speed up BetaBinom optimization: taking only unique values of + # (B allele count, total SNP covering read count) unique_values_nb, mapping_matrices_nb = construct_unique_matrix( X[:, 0, :], base_nb_mean ) unique_values_bb, mapping_matrices_bb = construct_unique_matrix( X[:, 1, :], total_bb_RD ) - # EM algorithm - for r in trange(max_iter): - # E step + + for r in trange(max_iter, desc="EM algorithm (sitewise)"): + logger.info(f"Calculating E-step (sitewise) for iteration {r} of {max_iter}.") + if tumor_prop is None: log_emission_rdr, log_emission_baf = ( hmm_sitewise.compute_emission_probability_nb_betabinom( @@ -429,6 +443,7 @@ def run_baum_welch_nb_bb( ) ) log_emission = log_emission_rdr + log_emission_baf + log_alpha = hmm_sitewise.forward_lattice( lengths, log_transmat, @@ -436,6 +451,7 @@ def run_baum_welch_nb_bb( log_emission, log_sitewise_transmat, ) + log_beta = hmm_sitewise.backward_lattice( lengths, log_transmat, @@ -443,11 +459,15 @@ def run_baum_welch_nb_bb( log_emission, log_sitewise_transmat, ) + log_gamma = compute_posterior_obs(log_alpha, log_beta) + log_xi = compute_posterior_transition_sitewise( log_alpha, log_beta, log_transmat, log_emission ) - # M step + + logger.info(f"Calculating M-step (sitewise) for iteration {r} of {max_iter}.") + if "s" in self.params: new_log_startprob = update_startprob_sitewise(lengths, log_gamma) new_log_startprob = new_log_startprob.flatten() @@ -522,13 +542,14 @@ def run_baum_welch_nb_bb( new_p_binom = p_binom new_taus = taus # check convergence - print( + 
logger.info( + "EM convergence metrics (sitewise)", np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), np.mean(np.abs(new_log_mu - log_mu)), np.mean(np.abs(new_p_binom - p_binom)), ) - print(np.hstack([new_log_mu, new_p_binom])) + logger.info((np.hstack([new_log_mu, new_p_binom])) if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol @@ -541,6 +562,9 @@ def run_baum_welch_nb_bb( alphas = new_alphas p_binom = new_p_binom taus = new_taus + + logger.info("Computed Baum-Welch (sitewise).") + return ( new_log_mu, new_alphas, @@ -1554,4 +1578,4 @@ def combine_similar_states_across_clones( # merged_res["total_llf"] = np.NAN # merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) # merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) -# return merging_groups, merged_res +# return merging_groups, merged_res \ No newline at end of file From 29235a2127049dd7efdb240aa9a32bf744588eda Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 12:52:28 -0400 Subject: [PATCH 012/125] add logging to hmrf and utils_IO. --- src/calicost/hmrf.py | 27 ++++++++++++++++++--------- src/calicost/utils_IO.py | 30 ++++++++++++++++++------------ 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 6630068..c6dd459 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -57,10 +57,12 @@ def hmrf_reassignment_posterior( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) # node potential new_assignment = copy.copy(prev_assignment) - # + posterior = np.zeros((N, n_clones)) - for i in trange(N): + logger.info("Computing hmrf_reassignment_posterior") + + for i in trange(N, desc="hmrf_reassignment_posterior"): idx = smooth_mat[i, :].nonzero()[1] for c in range(n_clones): tmp_log_emission_rdr, tmp_log_emission_baf = ( @@ -164,10 +166,10 @@ def aggr_hmrf_reassignment( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + posterior = np.zeros((N, n_clones)) - for i in trange(N): + for i in trange(N, desc="aggr_hmrf_reassignment"): idx = smooth_mat[i, :].nonzero()[1] # idx = np.append(idx, np.array([i])) for c in range(n_clones): @@ -252,10 +254,10 @@ def hmrf_reassignment_posterior_concatenate( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + posterior = np.zeros((N, n_clones)) - for i in trange(N): + for i in trange(N, desc="hmrf_reassignment_posterior_concatenate"): idx = smooth_mat[i, :].nonzero()[1] tmp_log_emission_rdr, tmp_log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( @@ -398,10 +400,10 @@ def aggr_hmrf_reassignment_concatenate( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + posterior = np.zeros((N, n_clones)) - for i in trange(N): + for i in trange(N, desc="aggr_hmrf_reassignment_concatenate"): idx = smooth_mat[i, :].nonzero()[1] # idx = np.append(idx, np.array([i])) tmp_log_emission_rdr, tmp_log_emission_baf = ( @@ -472,6 +474,8 @@ def merge_by_minspots( single_tumor_prop=None, threshold=0.5, ): + logger.info("Merging by min. 
spots.") + n_clones = len(np.unique(assignment)) if n_clones == 1: merged_groups = [[assignment[0]]] @@ -554,6 +558,9 @@ def merge_by_minspots( for c in merging_groups ] ) + + logger.info("Merged by min. spots.") + return merging_groups, merged_res @@ -591,6 +598,8 @@ def hmrf_pipeline( unit_ysquared=3, spatial_weight=1.0, ): + logger.info("Solving hmrf_pipeline.") + n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) # spot adjacency matric @@ -637,7 +646,7 @@ def hmrf_pipeline( last_assignment = np.zeros(single_X.shape[2], dtype=int) for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c - # HMM + for r in range(max_iter_outer): if not Path(f"{outdir}/round{r}_nstates{n_states}_{params}.npz").exists(): ##### initialize with the parameters of last iteration ##### diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index 82138a2..bda22d8 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -1,28 +1,31 @@ +import copy +import logging import sys +from pathlib import Path + +import anndata import numpy as np -import scipy -import copy import pandas as pd -from pathlib import Path +import scanpy as sc +import scipy +from sklearn.cluster import KMeans +from sklearn.kernel_ridge import KernelRidge from sklearn.metrics import adjusted_rand_score from sklearn.neighbors import LocalOutlierFactor -from sklearn.kernel_ridge import KernelRidge -from sklearn.cluster import KMeans -import scanpy as sc -import anndata -import logging - +""" logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) -logger = logging.getLogger() +""" +logger = logging.getLogger(__name__) -from calicost.utils_phase_switch import * -from calicost.utils_distribution_fitting import * import subprocess +from calicost.utils_distribution_fitting import * +from calicost.utils_phase_switch import * + def load_data( spaceranger_dir, @@ -43,6 +46,8 @@ def load_data( f"{spaceranger_dir} directory doesn't have a filtered_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5ad file!" ) + raise RuntimeError() + adata.layers["count"] = adata.X.A.astype(int) cell_snp_Aallele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Aallele.npz") cell_snp_Ballele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Ballele.npz") @@ -247,6 +252,7 @@ def load_joint_data( logging.error( f"{df_meta['spaceranger_dir'].iloc[i]} directory doesn't have a filtered_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5ad file!" ) + raise RuntimeError() adatatmp.layers["count"] = adatatmp.X.A # reorder anndata spots to have the same order as df_this_barcode From 96869395c0754621abf5b7cb0e9c3a7cab2db3cf Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 12:59:14 -0400 Subject: [PATCH 013/125] add logging to hmrf_pipeline --- src/calicost/hmrf.py | 59 +++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index c6dd459..322b61e 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -602,8 +602,10 @@ def hmrf_pipeline( n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) - # spot adjacency matric + + # NB checking input assert not (coords is None and adjacency_mat is None) + if adjacency_mat is None: adjacency_mat = compute_adjacency_mat(coords, unit_xsquared, unit_ysquared) if sample_ids is None: @@ -615,12 +617,16 @@ def hmrf_pipeline( tmp_map_index = {unique_sample_ids[i]: i for i in range(len(unique_sample_ids))} sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) - # pseudobulk + + logger.info("Merging pseudobulk by clone index") + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index ) - # initialize HMM parameters by GMM + if (init_log_mu is None) or (init_p_binom is None): + logger.info("Initializing HMM parameters by GMM") + init_log_mu, init_p_binom = initialization_by_gmm( n_states, X, @@ -631,7 +637,10 @@ def hmrf_pipeline( in_log_space=False, only_minor=False, ) - # initialization parameters for HMM + else: + logger.info("Using provided HMM initialization parameters") + + # NB initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu last_p_binom = init_p_binom @@ -647,9 +656,13 @@ def hmrf_pipeline( for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c + logger.info(f"Computing HMM for {max_iter_outer} iterations.") + for r in range(max_iter_outer): + # NB initialize with the parameters of last iteration if not Path(f"{outdir}/round{r}_nstates{n_states}_{params}.npz").exists(): - ##### initialize with the parameters of last iteration ##### + logger.info(f"Computing HMM iteration {r}.") + res = pipeline_baum_welch( None, X, @@ -674,9 +687,13 @@ def hmrf_pipeline( max_iter=max_iter, tol=tol, ) + pred = np.argmax(res["log_gamma"], axis=0) + # clone assignmment if nodepotential == "max": + logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfix_reassignment.") + new_assignment, single_llf, total_llf = aggr_hmrf_reassignment( single_X, single_base_nb_mean, @@ -692,6 +709,8 @@ def hmrf_pipeline( hmmclass=hmmclass, ) elif nodepotential == "weighted_sum": + logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfix_reassignment_posterior.") + new_assignment, single_llf, total_llf = hmrf_reassignment_posterior( single_X, single_base_nb_mean, @@ -706,36 +725,42 @@ def hmrf_pipeline( hmmclass=hmmclass, ) else: - raise Exception("Unknown mode for nodepotential!") - # handle the case when one clone has zero spots + raise ValueError("Unknown mode for nodepotential!") + + # NB handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) - # + res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf - # save results + 
logger.info(f"Writing HMM iteration {r} to {outdir}/round{r}_nstates{n_states}_{params}.npz") + np.savez(f"{outdir}/round{r}_nstates{n_states}_{params}.npz", **res) else: + logger.info(f"Loading pre-computed HMM results for iteration {r}.") + logger.info(f"Loading {outdir}/round{r}_nstates{n_states}_{params}.npz") + res = np.load(f"{outdir}/round{r}_nstates{n_states}_{params}.npz") - # regroup to pseudobulk + logger.info(f"Regrouping to pseudobulk for iteration {r}.") + clone_index = [ np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) ] + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, clone_index ) - # update last parameter if "mp" in params: - print( + logger.info( "outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( r, res["total_llf"], @@ -744,7 +769,7 @@ def hmrf_pipeline( ) ) elif "m" in params: - print( + logger.info( "outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( r, res["total_llf"], @@ -752,29 +777,33 @@ def hmrf_pipeline( ) ) elif "p" in params: - print( + logger.info( "outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom - res["new_p_binom"])), ) ) - print( + + logger.info( "outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) + if ( adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1 ): break + last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] last_alphas = res["new_alphas"] last_taus = res["new_taus"] last_assignment = res["new_assignment"] log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) + for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] this_persample_weight = np.bincount( From 1da81bf501827899ae3db25421ffa942986ef164 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 13:12:09 -0400 Subject: [PATCH 014/125] add logging on Weighted_NegativeBinomial --- src/calicost/utils_distribution_fitting.py | 60 ++++++++++++++++------ 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 6f6ec02..a710a2e 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -1,25 +1,31 @@ import functools import inspect import logging +import os +import time import numpy as np import scipy -from scipy import linalg, special -from scipy.special import logsumexp, loggamma import scipy.integrate import scipy.stats +import statsmodels +import statsmodels.api as sm from numba import jit, njit +from scipy import linalg, special +from scipy.special import loggamma, logsumexp from sklearn import cluster from sklearn.utils import check_random_state -import statsmodels -import statsmodels.api as sm from statsmodels.base.model import GenericLikelihoodModel -import os -os.environ["MKL_NUM_THREADS"] = "1" -os.environ["OPENBLAS_NUM_THREADS"] = "1" -os.environ["OMP_NUM_THREADS"] = "1" +logger = logging.getLogger(__name__) + +num_threads = "2" + +logger.info(f"Setting number of threads for MKL/BLAS/LAPACK/OMP to {num_threads}.") +os.environ["MKL_NUM_THREADS"] = num_threads +os.environ["OPENBLAS_NUM_THREADS"] = num_threads +os.environ["OMP_NUM_THREADS"] = num_threads def convert_params(mean, std): """ @@ -29,6 +35,7 @@ def convert_params(mean, std): """ p = mean / std**2 n = mean * p / (1.0 - p) + return n, p @@ -51,35 +58,56 @@ class Weighted_NegativeBinomial(GenericLikelihoodModel): exposure : array, (n_samples,) Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ - def __init__(self, endog, exog, weights, exposure, seed=0, **kwds): super(Weighted_NegativeBinomial, self).__init__(endog, exog, **kwds) + + logger.info(f"Initializing Weighted_NegativeBinomial model for endog.shape = {endog.shape}.") + self.weights = weights self.exposure = exposure self.seed = seed - # def nloglikeobs(self, params): nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) + n, p = convert_params(nb_mean, nb_std) - llf = scipy.stats.nbinom.logpmf(self.endog, n, p) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # + return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) + def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): self.exog_names.append("alpha") + if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - return super(Weighted_NegativeBinomial, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + logger.info(f"Starting Weighted_NegativeBinomial optimization with start_params = {start_params}.") + + start = time.time() + + # NB see https://www.statsmodels.org/dev/dev/generated/statsmodels.base.model.LikelihoodModelResults.html + result = super(Weighted_NegativeBinomial, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + disp=False, + skip_hessian=True, + callback=None, + full_output=True, + retall=False, + **kwds ) + # NB specific to nm (Nelder-Mead) optimization. 
+ niter = result.mle_retvals["iterations"] + + logger.info(f"Finished Weighted_NegativeBinomial optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + + return result + class Weighted_NegativeBinomial_mix(GenericLikelihoodModel): def __init__(self, endog, exog, weights, exposure, tumor_prop, seed=0, **kwds): From 46f692bba7bd307f28bdf0c9c20c09662f367fdf Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 13:20:57 -0400 Subject: [PATCH 015/125] add logging of emission fitting. --- src/calicost/utils_distribution_fitting.py | 115 ++++++++++++++++----- 1 file changed, 89 insertions(+), 26 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index a710a2e..81b9e9b 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -75,7 +75,7 @@ def nloglikeobs(self, params): return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): self.exog_names.append("alpha") if start_params is None: @@ -112,34 +112,56 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): class Weighted_NegativeBinomial_mix(GenericLikelihoodModel): def __init__(self, endog, exog, weights, exposure, tumor_prop, seed=0, **kwds): super(Weighted_NegativeBinomial_mix, self).__init__(endog, exog, **kwds) + + logger.info(f"Initializing Weighted_NegativeBinomial_mix model for endog.shape = {endog.shape}.") + self.weights = weights self.exposure = exposure self.seed = seed self.tumor_prop = tumor_prop - # def nloglikeobs(self, params): nb_mean = self.exposure * ( self.tumor_prop * np.exp(self.exog @ params[:-1]) + 1 - self.tumor_prop ) nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) + n, p = convert_params(nb_mean, nb_std) - llf = scipy.stats.nbinom.logpmf(self.endog, n, p) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): + return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) + + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): self.exog_names.append("alpha") + if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - return super(Weighted_NegativeBinomial_mix, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + + logger.info(f"Starting Weighted_NegativeBinomial_mix optimization with start_params = {start_params}.") + + start = time.time() + + result = super(Weighted_NegativeBinomial_mix, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + disp=False, + skip_hessian=True, + callback=None, + full_output=True, + retall=False, + **kwds ) + # NB specific to nm (Nelder-Mead) optimization. + niter = result.mle_retvals["iterations"] + + logger.info(f"Finished Weighted_NegativeBinomial_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + + return result + class Weighted_BetaBinom(GenericLikelihoodModel): """ @@ -160,23 +182,23 @@ class Weighted_BetaBinom(GenericLikelihoodModel): exposure : array, (n_samples,) Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
""" - def __init__(self, endog, exog, weights, exposure, **kwds): super(Weighted_BetaBinom, self).__init__(endog, exog, **kwds) + + logger.info(f"Initializing Weighted_BetaBinomial model for endog.shape = {endog.shape}.") + self.weights = weights self.exposure = exposure - # def nloglikeobs(self, params): a = (self.exog @ params[:-1]) * params[-1] b = (1 - self.exog @ params[:-1]) * params[-1] - llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): self.exog_names.append("tau") + if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params @@ -184,34 +206,55 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): start_params = np.append( 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) - return super(Weighted_BetaBinom, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + + logger.info(f"Starting Weighted_BetaBinomial optimization with start_params = {start_params}.") + + start = time.time() + + result = super(Weighted_BetaBinom, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + disp=False, + skip_hessian=True, + callback=None, + full_output=True, + retall=False, + **kwds ) + # NB specific to nm (Nelder-Mead) optimization. + niter = result.mle_retvals["iterations"] + + logger.info(f"Finished Weighted_BetaBinomial optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + + return result class Weighted_BetaBinom_mix(GenericLikelihoodModel): def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds): super(Weighted_BetaBinom_mix, self).__init__(endog, exog, **kwds) + + logger.info(f"Initializing Weighted_BetaBinom_mix model for endog.shape = {endog.shape}.") + self.weights = weights self.exposure = exposure self.tumor_prop = tumor_prop - # def nloglikeobs(self, params): a = ( self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop) ) * params[-1] + b = ( (1 - self.exog @ params[:-1]) * self.tumor_prop + 0.5 * (1 - self.tumor_prop) ) * params[-1] - llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): self.exog_names.append("tau") + if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params @@ -219,10 +262,30 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): start_params = np.append( 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) - return super(Weighted_BetaBinom_mix, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + + logger.info(f"Starting Weighted_BetaBinom_mix optimization with start_params = {start_params}.") + + start = time.time() + + result = super(Weighted_BetaBinom_mix, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + disp=False, + skip_hessian=True, + callback=None, + full_output=True, + retall=False, + **kwds ) + # NB specific to nm (Nelder-Mead) optimization. 
+ niter = result.mle_retvals["iterations"] + + logger.info(f"Finished Weighted_BetaBinom_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + + return result + class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): def __init__(self, endog, exog, tau, weights, exposure, **kwds): From b44b9994e4eb5fad543e4bf02b8edf02aa0d250d Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 13:25:33 -0400 Subject: [PATCH 016/125] finish logging utils distribution fitting --- src/calicost/utils_distribution_fitting.py | 71 +++++++++++++++++----- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 81b9e9b..858957c 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -290,64 +290,103 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): def __init__(self, endog, exog, tau, weights, exposure, **kwds): super(Weighted_BetaBinom_fixdispersion, self).__init__(endog, exog, **kwds) + + logger.info(f"Initializing Weighted_BetaBinom_fixdispersion model for endog.shape = {endog.shape}.") + self.tau = tau self.weights = weights self.exposure = exposure - # def nloglikeobs(self, params): a = (self.exog @ params) * self.tau b = (1 - self.exog @ params) * self.tau - llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params else: start_params = 0.1 * np.ones(self.nparams) - return super(Weighted_BetaBinom_fixdispersion, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + logger.info(f"Starting Weighted_BetaBinom_fixdispersion optimization with start_params = {start_params}.") + + start = time.time() + + result = super(Weighted_BetaBinom_fixdispersion, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + disp=False, + skip_hessian=True, + callback=None, + full_output=True, + retall=False, + **kwds ) + # NB specific to nm (Nelder-Mead) optimization. 
+ niter = result.mle_retvals["iterations"] + + logger.info(f"Finished Weighted_BetaBinom_fixdispersion optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + + return result + class Weighted_BetaBinom_fixdispersion_mix(GenericLikelihoodModel): def __init__(self, endog, exog, tau, weights, exposure, tumor_prop, **kwds): super(Weighted_BetaBinom_fixdispersion_mix, self).__init__(endog, exog, **kwds) + + logger.info(f"Initializing Weighted_BetaBinom_fixdispersion_mix model for endog.shape = {endog.shape}.") + self.tau = tau self.weights = weights self.exposure = exposure self.tumor_prop = tumor_prop - # def nloglikeobs(self, params): a = ( self.exog @ params * self.tumor_prop + 0.5 * (1 - self.tumor_prop) ) * self.tau + b = ( (1 - self.exog @ params) * self.tumor_prop + 0.5 * (1 - self.tumor_prop) ) * self.tau - llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params else: start_params = 0.1 * np.ones(self.nparams) - return super(Weighted_BetaBinom_fixdispersion_mix, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + logger.info(f"Starting Weighted_BetaBinom_fixdispersion_mix optimization with start_params = {start_params}.") + + start = time.time() + + result = super(Weighted_BetaBinom_fixdispersion_mix, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + disp=False, + skip_hessian=True, + callback=None, + full_output=True, + retall=False, + **kwds ) + # NB specific to nm (Nelder-Mead) optimization. + niter = result.mle_retvals["iterations"] + + logger.info(f"Finished Weighted_BetaBinom_fixdispersion_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + + return result +# DEPRECATE class BAF_Binom(GenericLikelihoodModel): """ Binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. From a3a569dea7e0a3d91d6a641225b4af5f3bebc592 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 13:28:03 -0400 Subject: [PATCH 017/125] set error on MKL thread setting in calicost_supervised. 
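Context for this change: the mkl.set_num_threads(1) call is retired in favour of the MKL/OpenBLAS/OMP environment variables introduced in PATCH 014, with a logger.error(...) left to mark the spot. Those variables are generally only honoured if they are exported before the native BLAS/OpenMP runtimes initialise, i.e. before numpy/scipy are first imported in the process. A minimal sketch of that ordering, assuming the caps are applied in the entry point (the module layout below is illustrative, not part of this patch):

    import os

    # NB thread-pool sizes are fixed when the native libraries initialise,
    #    so export the caps before importing numpy/scipy anywhere.
    for var in ("MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS", "OMP_NUM_THREADS"):
        os.environ.setdefault(var, "1")

    import numpy as np      # noqa: E402
    import scipy.stats      # noqa: E402

Setting the same variables after numpy has already been imported (for example from a module that is itself only imported late) typically has no effect.
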
--- src/calicost/calicost_supervised.py | 52 +++++++++++++---------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/src/calicost/calicost_supervised.py b/src/calicost/calicost_supervised.py index a881fff..6029d31 100644 --- a/src/calicost/calicost_supervised.py +++ b/src/calicost/calicost_supervised.py @@ -1,46 +1,42 @@ +import copy +import functools +import logging +import subprocess import sys +from pathlib import Path + +import anndata +import matplotlib.patches as mpatches +import mkl import numpy as np -import scipy import pandas as pd -from pathlib import Path -from sklearn.metrics import adjusted_rand_score -from sklearn.cluster import KMeans import scanpy as sc -import anndata -import logging +import scipy +import seaborn +from matplotlib import pyplot as plt +from matplotlib.lines import Line2D +from sklearn.cluster import KMeans +from sklearn.metrics import adjusted_rand_score -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", -) -logger = logging.getLogger() -import copy -from pathlib import Path -import functools -import subprocess from arg_parse import * +from find_integer_copynumber import * from hmm_NB_BB_phaseswitch import * -from utils_distribution_fitting import * -from utils_hmrf import * from hmrf import * +from parse_input import * from phasing import * +from utils_distribution_fitting import * +from utils_hmrf import * from utils_IO import * -from find_integer_copynumber import * -from parse_input import * from utils_plotting import * -from matplotlib import pyplot as plt -from matplotlib.lines import Line2D -import matplotlib.patches as mpatches -import seaborn - -plt.rcParams.update({"font.size": 14}) +# DEPRECATE +# mkl.set_num_threads(1) -import mkl +logger = logging.getLogger(__name__) -mkl.set_num_threads(1) +logger.error("MKL_NUM_THREADS set to unity here.") +plt.rcParams.update({"font.size": 14}) def main(configuration_file): try: From 7641681d609438decbedfde4a1e0dc20e7cdc250 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 13:39:11 -0400 Subject: [PATCH 018/125] fix == bug in setup.py --- setup.py | 57 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/setup.py b/setup.py index c447610..2ae07a8 100644 --- a/setup.py +++ b/setup.py @@ -1,33 +1,32 @@ import setuptools setuptools.setup( - name='calicost', - version='v1.0.0', - python_requires='>=3.8', - packages=['calicost'], - package_dir={'': 'src'}, - author='Cong Ma', - author_email='congma@princeton.edu', - description='Allele-specific CNAs and spatial cancer clone inference', - long_description='CalicoST infers allele-specific copy number aberrations and cancer clones in spatially resolved transcriptomics data', - url='https://github.com/raphael-group/CalicoST', - install_requires=[ - 'numpy=1.24.4', - 'scipy=1.11.3', - 'pandas=2.1.1', - 'scikit-learn=1.3.2', - 'scanpy=1.9.6', - 'anndata=0.10.3', - 'numba=0.60.0', - 'tqdm=4.66.1', - 'statsmodels=0.14.0', - 'networkx=3.2.1', - 'matplotlib=3.7.3', - 'seaborn=0.12.2', - 'pysam=0.22.1', - 'ete3=3.1.3', - 'ipykernel' - ], - include_package_data=True + name="calicost", + version="v1.0.0", + python_requires=">=3.8", + packages=["calicost"], + package_dir={"": "src"}, + author="Cong Ma", + author_email="congma@princeton.edu", + description="Allele-specific CNAs and spatial cancer clone inference", + long_description="CalicoST infers allele-specific copy number aberrations and cancer clones in spatially resolved transcriptomics data", + url="https://github.com/raphael-group/CalicoST", + install_requires=[ + "numpy==1.24.4", + "scipy==1.11.3", + "pandas==2.1.1", + "scikit-learn==1.3.2", + "scanpy==1.9.6", + "anndata==0.10.3", + "numba==0.60.0", + "tqdm==4.66.1", + "statsmodels==0.14.0", + "networkx==3.2.1", + "matplotlib==3.7.3", + "seaborn==0.12.2", + "pysam==0.22.1", + "ete3==3.1.3", + "ipykernel", + ], + include_package_data=True, ) - From a08f2e7396979986dd38513be7af1ca6ea1cc523 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 13:44:00 -0400 Subject: [PATCH 019/125] fix errors with configuration of inner and outer loops --- src/calicost/arg_parse.py | 5 ++++- src/calicost/calicost_main.py | 10 +++++----- src/calicost/parse_input.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/calicost/arg_parse.py b/src/calicost/arg_parse.py index 8bf796a..9acefb9 100644 --- a/src/calicost/arg_parse.py +++ b/src/calicost/arg_parse.py @@ -41,7 +41,8 @@ def load_default_config(): "min_avgumi_per_clone": 10, "maxspots_pooling": 7, "tumorprop_threshold": 0.5, - "max_iter_outer": 20, + "max_iter_outer_initial" : 20, + "max_iter_outer": 10, "nodepotential": "weighted_sum", # max or weighted_sum "initialization_method": "rectangle", # rectangle or datadrive "num_hmrf_initialization_start": 0, @@ -96,6 +97,7 @@ def load_default_config(): "min_avgumi_per_clone": "int", "maxspots_pooling": "int", "tumorprop_threshold": "float", + "max_iter_outer_initial" : "int", "max_iter_outer": "int", "nodepotential": "str", "initialization_method": "str", @@ -155,6 +157,7 @@ def load_default_config(): "min_avgumi_per_clone", "maxspots_pooling", "tumorprop_threshold", + "max_iter_outer_initial", "max_iter_outer", "nodepotential", "initialization_method", diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 10aee62..22ce18e 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -175,7 +175,7 @@ def main(configuration_file): smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, - max_iter_outer=config["max_iter_outer"], + max_iter_outer=config["max_iter_outer_initial"], nodepotential=config["nodepotential"], hmmclass=hmm_nophasing_v2, params="sp", @@ -207,7 +207,7 @@ def main(configuration_file): smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, - max_iter_outer=config["max_iter_outer"], + max_iter_outer=config["max_iter_outer_initial"], nodepotential=config["nodepotential"], hmmclass=hmm_nophasing_v2, params="sp", @@ -542,7 +542,7 @@ def main(configuration_file): smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], sample_ids=copy_slice_sample_ids, - max_iter_outer=10, + max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], hmmclass=hmm_nophasing_v2, params="smp", @@ -572,7 +572,7 @@ def main(configuration_file): smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], sample_ids=copy_slice_sample_ids, - max_iter_outer=10, + max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], hmmclass=hmm_nophasing_v2, params="smp", @@ -1411,4 +1411,4 @@ def main(configuration_file): ) args = parser.parse_args() - main(args.configfile) \ No newline at end of file + main(args.configfile) diff --git a/src/calicost/parse_input.py b/src/calicost/parse_input.py index 2585923..f84ef53 100644 --- a/src/calicost/parse_input.py +++ b/src/calicost/parse_input.py @@ -185,7 +185,7 @@ def parse_visium(config): config["shared_NB_dispersion"], config["fix_BB_dispersion"], config["shared_BB_dispersion"], - 30, + config["max_iter"], 1e-3, threshold=config["tumorprop_threshold"], ) From a51d402ded2e905446dd48b902b7445276e04dcc Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 13:46:18 -0400 Subject: [PATCH 020/125] fix issues around string and bracks from additional logging. 
--- src/calicost/calicost_main.py | 4 ++-- src/calicost/hmm_NB_BB_phaseswitch.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 22ce18e..b8ed786 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -128,7 +128,7 @@ def main(configuration_file): coords, config["n_clones"], random_state=r_hmrf_initialization ) else: - logger.info(f"Initializing clones based on tumor proportion: {config["tumorprop_file"]}") + logger.info(f"Initializing clones based on tumor proportion: {config['tumorprop_file']}") initial_clone_index = rectangle_initialize_initial_clone_mix( coords, @@ -274,7 +274,7 @@ def main(configuration_file): ) logger.info(f"BAF clone merging after comparing similarity: {merging_groups}") - logger.info(f"Merging similar initial clones based on min. spot threshold of {config["min_spots_per_clone"]}.") + logger.info(f"Merging similar initial clones based on min. spot threshold of {config['min_spots_per_clone']}.") if config["tumorprop_file"] is None: merging_groups, merged_res = merge_by_minspots( diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 07995ed..89a0e53 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -549,7 +549,9 @@ def run_baum_welch_nb_bb( np.mean(np.abs(new_log_mu - log_mu)), np.mean(np.abs(new_p_binom - p_binom)), ) - logger.info((np.hstack([new_log_mu, new_p_binom])) + + logger.info(np.hstack([new_log_mu, new_p_binom])) + if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol @@ -1578,4 +1580,4 @@ def combine_similar_states_across_clones( # merged_res["total_llf"] = np.NAN # merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) # merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) -# return merging_groups, merged_res \ No newline at end of file +# return merging_groups, merged_res From 849c20f662235cebe02e1591ae3acae48b0bc73b Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 13:54:52 -0400 Subject: [PATCH 021/125] fix errors around disp=False (statsmodels) and inconsistent conda/pip environment --- environment.yml | 4 +++- src/calicost/utils_distribution_fitting.py | 6 ------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/environment.yml b/environment.yml index 01058c9..522e4be 100644 --- a/environment.yml +++ b/environment.yml @@ -1,10 +1,12 @@ -name: calicost_env +name: calicost channels: - conda-forge - bioconda - defaults dependencies: - python==3.10 + - numpy==1.24.4 + - scipy==1.11.3 - samtools==1.18 - bcftools==1.18 - cellsnp-lite diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 858957c..424cb52 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -93,7 +93,6 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start_params=start_params, maxiter=maxiter, maxfun=maxfun, - disp=False, skip_hessian=True, callback=None, full_output=True, @@ -147,7 +146,6 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start_params=start_params, maxiter=maxiter, maxfun=maxfun, - disp=False, skip_hessian=True, callback=None, full_output=True, @@ -215,7 +213,6 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start_params=start_params, maxiter=maxiter, maxfun=maxfun, - disp=False, skip_hessian=True, callback=None, full_output=True, @@ -271,7 +268,6 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start_params=start_params, maxiter=maxiter, maxfun=maxfun, - disp=False, skip_hessian=True, callback=None, full_output=True, @@ -318,7 +314,6 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start_params=start_params, maxiter=maxiter, maxfun=maxfun, - disp=False, skip_hessian=True, callback=None, full_output=True, @@ -371,7 +366,6 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start_params=start_params, maxiter=maxiter, maxfun=maxfun, - disp=False, skip_hessian=True, callback=None, full_output=True, From ea22b720f501b187a10a350095753100703b7d78 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 14:00:21 -0400 Subject: [PATCH 022/125] fix issue with multiple loggers. 
--- src/calicost/arg_parse.py | 5 +++-- src/calicost/calicost_main.py | 6 +++--- src/calicost/parse_input.py | 5 +++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/calicost/arg_parse.py b/src/calicost/arg_parse.py index 9acefb9..5c9992d 100644 --- a/src/calicost/arg_parse.py +++ b/src/calicost/arg_parse.py @@ -3,13 +3,14 @@ import scipy import pandas as pd import logging - +""" logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) -logger = logging.getLogger() +""" +logger = logging.getLogger(__name__) def load_default_config(): diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index b8ed786..a166c89 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -51,15 +51,15 @@ logger.setLevel(logging.INFO) handler = logging.StreamHandler(sys.stdout) -fhandler = logging.FileHandler('calicost.log', mode="w") +# fhandler = logging.FileHandler('calicost.log', mode="w") formatter = logging.Formatter("%(asctime)s - %(process)d - %(levelname)s - %(name)s:%(lineno)d - %(message)s") handler.setFormatter(formatter) -fhandler.setFormatter(formatter) +# fhandler.setFormatter(formatter) logger.addHandler(handler) -logger.addHandler(fhandler) +# logger.addHandler(fhandler) def main(configuration_file): start = datetime.datetime.now() diff --git a/src/calicost/parse_input.py b/src/calicost/parse_input.py index f84ef53..49221c5 100644 --- a/src/calicost/parse_input.py +++ b/src/calicost/parse_input.py @@ -7,13 +7,14 @@ import scanpy as sc import anndata import logging - +""" logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) -logger = logging.getLogger() +""" +logger = logging.getLogger(__name__) import copy from pathlib import Path import functools From b6f94b10d12d0ef98acaf2dded85f07b114ac8a1 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 14:51:34 -0400 Subject: [PATCH 023/125] fix prefix bug --- src/calicost/calicost_main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index a166c89..17c20ea 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -116,7 +116,7 @@ def main(configuration_file): for r_hmrf_initialization in range( config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"] ): - logger.info(f"Processing HMRF random realization {num_hmrf_initialization_start:d}") + logger.info(f"Processing HMRF random realization {r_hmrf_initialization}") outdir = f"{config['output_dir']}/clone{config['n_clones']}_rectangle{r_hmrf_initialization}_w{config['spatial_weight']:.1f}" outdir = Path(outdir) @@ -139,7 +139,9 @@ def main(configuration_file): ) # NB save clone initialization to npz file - file_name = Path(f"allspots_nstates{config['n_states']}_sp.npz") + prefix = "allspots" + + file_name = Path(f"{prefix}_nstates{config['n_states']}_sp.npz") file_path = outdir / file_name if not file_path.exists(): From 8eb653c4a09a84033185b7a6130e6902d0324669 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:33:23 -0400 Subject: [PATCH 024/125] revert imports & prefix checks. 
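Alongside the import revert, this patch keeps the checkpoint convention from PATCH 023: each HMRF run reads and writes {outdir}/{prefix}_nstates{n_states}_sp.npz, with prefix "allspots" for the initial BAF-only clustering and f"clone{bafc}" for the later per-clone refinement, and an existing file short-circuits recomputation. A minimal sketch of the seeding step (directory name, state count and spot count are hypothetical):

    from pathlib import Path

    import numpy as np

    outdir = Path("results/clone3_rectangle0_w1.0")   # hypothetical output dir
    prefix, n_states = "allspots", 7                  # hypothetical settings

    file_path = outdir / f"{prefix}_nstates{n_states}_sp.npz"

    if not file_path.exists():
        outdir.mkdir(parents=True, exist_ok=True)

        # NB one clone label per spot; "round-1_assignment" is the assignment
        #    before the first HMRF iteration, and the results of later
        #    iterations live in the same .npz so reruns can resume from
        #    "num_iterations".
        initial_assignment = np.zeros(100, dtype=int)

        np.savez(file_path,
                 **{"num_iterations": 0, "round-1_assignment": initial_assignment})
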
--- src/calicost/calicost_main.py | 175 ++++++++++++++++++---------------- 1 file changed, 92 insertions(+), 83 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 17c20ea..0a0a5f2 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -1,59 +1,38 @@ -import copy -import functools -import logging -import subprocess import sys -import datetime -from pathlib import Path - -import anndata import numpy as np -import pandas as pd -import scanpy as sc import scipy -from sklearn.cluster import KMeans +import pandas as pd +from pathlib import Path from sklearn.metrics import adjusted_rand_score +from sklearn.cluster import KMeans +import scanpy as sc +import anndata +import logging +import copy +from pathlib import Path +import functools +import subprocess from calicost.arg_parse import * -from calicost.find_integer_copynumber import * from calicost.hmm_NB_BB_phaseswitch import * -from calicost.hmrf import * -from calicost.parse_input import * -from calicost.phasing import * from calicost.utils_distribution_fitting import * from calicost.utils_hmrf import * +from calicost.hmrf import * +from calicost.phasing import * from calicost.utils_IO import * +from calicost.find_integer_copynumber import * +from calicost.parse_input import * from calicost.utils_plotting import * -""" -from calicost.hmm_NB_BB_nophasing_v2 import hmm_nophasing_v2 -from calicost.arg_parse import run_parse_n_load, genesnp_to_bininfo -from calicost.find_integer_copynumber import (hill_climbing_integer_copynumber_fixdiploid, - hill_climbing_integer_copynumber_oneclone) -from calicost.hmm_NB_BB_phaseswitch import (combine_similar_states_across_clones, - similarity_components_rdrbaf_neymanpearson) -from calicost.hmrf import (aggr_hmrf_reassignment, aggr_hmrfmix_reassignment, - hmrf_concatenate_pipeline, hmrf_reassignment_posterior, - hmrfmix_concatenate_pipeline, hmrfmix_reassignment_posterior, - merge_by_minspots) -from calicost.phasing import pipeline_baum_welch -from calicost.utils_hmrf import (load_hmrf_last_iteration, rectangle_initialize_initial_clone, - rectangle_initialize_initial_clone_mix, reorder_results) -from calicost.utils_IO import bin_selection_basedon_normal, expand_df_cnv, filter_de_genes_tri -from calicost.utils_plotting import (argparse, merge_pseudobulk_by_index, - merge_pseudobulk_by_index_mix, plot_acn_from_df, - plot_acn_from_df_anotherscheme, plot_clones_in_space, - plot_individual_spots_in_space, plot_rdr_baf, plt, - read_configuration_file, read_joint_configuration_file) -""" - logger = logging.getLogger("calicost") logger.setLevel(logging.INFO) handler = logging.StreamHandler(sys.stdout) # fhandler = logging.FileHandler('calicost.log', mode="w") -formatter = logging.Formatter("%(asctime)s - %(process)d - %(levelname)s - %(name)s:%(lineno)d - %(message)s") +formatter = logging.Formatter( + "%(asctime)s - %(process)d - %(levelname)s - %(name)s:%(lineno)d - %(message)s" +) handler.setFormatter(formatter) # fhandler.setFormatter(formatter) @@ -61,6 +40,7 @@ logger.addHandler(handler) # logger.addHandler(fhandler) + def main(configuration_file): start = datetime.datetime.now() @@ -76,12 +56,12 @@ def main(configuration_file): # NB assuming the B-allele counts are calculated by the cellsnp-lite & Eagle pipeline. If assuming each spot contains # a mixture of normal/tumor cells, the tumor proportion path should be provided in the config file. 
- # - # NB load data: - # - If the data is loaded for the first time: infer phasing using phase-switch HMM + # + # NB load data: + # - If the data is loaded for the first time: infer phasing using phase-switch HMM # (hmm_NB_BB_phaseswitch.py & phasing.py) with output initial_phase.npz, matrices # in /parsed_inputs - # + # # - If the data is already loaded: load the matrices from parsed_inputs folder logger.info(f"Running parse and load.") @@ -128,7 +108,9 @@ def main(configuration_file): coords, config["n_clones"], random_state=r_hmrf_initialization ) else: - logger.info(f"Initializing clones based on tumor proportion: {config['tumorprop_file']}") + logger.info( + f"Initializing clones based on tumor proportion: {config['tumorprop_file']}" + ) initial_clone_index = rectangle_initialize_initial_clone_mix( coords, @@ -140,14 +122,14 @@ def main(configuration_file): # NB save clone initialization to npz file prefix = "allspots" - + file_name = Path(f"{prefix}_nstates{config['n_states']}_sp.npz") file_path = outdir / file_name if not file_path.exists(): logger.info(f"Creating output dir: {str(outdir)}") - # TODO exist_ok + # TODO exist_ok outdir.mkdir(parents=True, exist_ok=True) initial_assignment = np.zeros(single_X.shape[2], dtype=int) @@ -155,10 +137,13 @@ def main(configuration_file): for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - np.savez(str(file_path), **{"num_iterations": 0, "round-1_assignment": initial_assignment}) + np.savez( + str(file_path), + **{"num_iterations": 0, "round-1_assignment": initial_assignment}, + ) # ---- HMRF + HMM ---- - # + # # NB stores the results of each HMRF iteration in a .npz @ ./outdir/prefix_nstates{config['n_states']}_sp.npz # if a specific iteration is already computed, hmrf will load the results directly from the file. if config["tumorprop_file"] is None: @@ -226,7 +211,9 @@ def main(configuration_file): tumorprop_threshold=config["tumorprop_threshold"], ) - logger.info("Loading last HMRF iteration & merging clones based on BAF profile similarity threshold.") + logger.info( + "Loading last HMRF iteration & merging clones based on BAF profile similarity threshold." + ) n_obs = single_X.shape[0] res = load_hmrf_last_iteration( @@ -257,11 +244,13 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], ) tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) - + logger.info("Merged pseudo-bulk based on clone index.") # NB ratio == P(clone A counts | BAF parameters for clone A) / P(clone A counts | BAF parameters for clone B) - logger.info("Merging similar initial clones based on Neyman-Pearson Likelihood ratio.") + logger.info( + "Merging similar initial clones based on Neyman-Pearson Likelihood ratio." + ) merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson( X, @@ -276,7 +265,9 @@ def main(configuration_file): ) logger.info(f"BAF clone merging after comparing similarity: {merging_groups}") - logger.info(f"Merging similar initial clones based on min. spot threshold of {config['min_spots_per_clone']}.") + logger.info( + f"Merging similar initial clones based on min. spot threshold of {config['min_spots_per_clone']}." 
+ ) if config["tumorprop_file"] is None: merging_groups, merged_res = merge_by_minspots( @@ -297,7 +288,9 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], ) - logger.info(f"BAF clone merging after requiring minimum # spots: {merging_groups}") + logger.info( + f"BAF clone merging after requiring minimum # spots: {merging_groups}" + ) n_baf_clones = len(merging_groups) @@ -305,9 +298,7 @@ def main(configuration_file): logger.info(f"Writing merged initial clones to {file_path}") - np.savez( - file_path, **merged_res - ) + np.savez(file_path, **merged_res) # NB load merged results n_obs = single_X.shape[0] @@ -321,7 +312,7 @@ def main(configuration_file): merged_baf_assignment = copy.copy(merged_res["new_assignment"]) n_baf_clones = len(np.unique(merged_baf_assignment)) - # TODO comment. + # TODO comment. pred = np.argmax(merged_res["log_gamma"], axis=0) pred = np.array( [pred[(c * n_obs) : (c * n_obs + n_obs)] for c in range(n_baf_clones)] @@ -338,12 +329,14 @@ def main(configuration_file): ] ) - logger.info("Preparing refinement of initial, merged clones using BAF & RDR ****") - + logger.info( + "Preparing refinement of initial, merged clones using BAF & RDR ****" + ) + if not config["bafonly"]: # NB this block only used when assuming each spot is pure normal or pure tumor, # and if we don't know which spots are normal spots. - # + # # NB select normal spots logger.info("Identifying normal spots.") @@ -404,7 +397,9 @@ def main(configuration_file): # pseudobulk has large |BAF - 0.5| index_normal = np.where(normal_candidate)[0] - logger.info("Filtering genomic bins for allele-specific expression based on normal spots.") + logger.info( + "Filtering genomic bins for allele-specific expression based on normal spots." + ) ( lengths, @@ -423,7 +418,7 @@ def main(configuration_file): index_normal, config["geneticmap_file"], ) - + assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0] df_bininfo = genesnp_to_bininfo(df_gene_snp) @@ -482,10 +477,12 @@ def main(configuration_file): ), ) - logger.info(f"**** Refining initial, merged clones (N={n_baf_clones}) using BAF & RDR ****") + logger.info( + f"**** Refining initial, merged clones (N={n_baf_clones}) using BAF & RDR ****" + ) for bafc in range(n_baf_clones): - logger.info(f"Refining BAF clone {bafc}.") + logger.info(f"Refining BAF clone {bafc}.") prefix = f"clone{bafc}" idx_spots = np.where(merged_baf_assignment == bafc)[0] @@ -511,7 +508,9 @@ def main(configuration_file): ) # NB write the initialization to .npz @ ./outdir/prefix_nstates{config['n_states']}_smp.npz - file_path = Path(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz") + file_path = Path( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz" + ) if not file_path.exists(): initial_assignment = np.zeros(len(idx_spots), dtype=int) @@ -519,13 +518,14 @@ def main(configuration_file): for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = { - "barcodes": barcodes[idx_spots], - "num_iterations": 0, - "round-1_assignment": initial_assignment, - } - - np.savez(str(file_path), **allres) + np.savez( + str(file_path), + **{ + "barcodes": barcodes[idx_spots], + "num_iterations": 0, + "round-1_assignment": initial_assignment, + }, + ) # HMRF + HMM with RDR copy_slice_sample_ids = copy.copy(sample_ids[idx_spots]) @@ -666,7 +666,9 @@ def main(configuration_file): ) tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) - logger.info(f"Merging BAF+RDR clones based on Neyman-Pearson Likelihood ratio.") + logger.info( + 
f"Merging BAF+RDR clones based on Neyman-Pearson Likelihood ratio." + ) merging_groups, merged_res = ( similarity_components_rdrbaf_neymanpearson( @@ -682,8 +684,10 @@ def main(configuration_file): ) ) - logger.info(f"BAF+RDR clone {bafc}: merging_groups={merging_groups}") - + logger.info( + f"BAF+RDR clone {bafc}: merging_groups={merging_groups}" + ) + if config["tumorprop_file"] is None: merging_groups, merged_res = merge_by_minspots( merged_res["new_assignment"], @@ -704,7 +708,7 @@ def main(configuration_file): single_tumor_prop=single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"], ) - + # TODO what is merging_groups logger.info( f"BAF+RDR clone {bafc} merging after requiring minimum # spots: {merging_groups}" @@ -739,8 +743,10 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], ) ) - - logger.info(f"Running Baum-Welch with refined & merged BAF+RDR clones.") + + logger.info( + f"Running Baum-Welch with refined & merged BAF+RDR clones." + ) merged_res = pipeline_baum_welch( None, @@ -808,7 +814,7 @@ def main(configuration_file): for c in range(n_merged_clones) ] ).T - + # NB add to res_combine if len(res_combine) == 1: res_combine.update( @@ -987,14 +993,16 @@ def main(configuration_file): res_combine, posterior = reorder_results( res_combine, posterior, single_tumor_prop ) - - logger.info(f"Writing {outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz") + + logger.info( + f"Writing {outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz" + ) np.savez( f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", **res_combine, ) - + logger.info(f"Writing {outdir}/posterior_clone_probability.npy") np.save(f"{outdir}/posterior_clone_probability.npy", posterior) @@ -1102,7 +1110,7 @@ def main(configuration_file): logger.info( f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}" ) - + allele_specific_copy.append( pd.DataFrame( best_integer_copies[ @@ -1121,7 +1129,7 @@ def main(configuration_file): columns=np.arange(n_obs), ) ) - + state_cnv.append( pd.DataFrame( res_combine["new_log_mu"][:, s].reshape(-1, 1), @@ -1402,6 +1410,7 @@ def main(configuration_file): logging.info(f"Complete in {runtime} [seconds].") + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( From b657a9efa3c503c4fc66f12ad75c0b2cbaa3dd6b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:40:33 -0400 Subject: [PATCH 025/125] revert imports in calicost supervised. 
--- src/calicost/calicost_supervised.py | 33 +++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/calicost/calicost_supervised.py b/src/calicost/calicost_supervised.py index 6029d31..082b638 100644 --- a/src/calicost/calicost_supervised.py +++ b/src/calicost/calicost_supervised.py @@ -1,3 +1,36 @@ +import sys +import numpy as np +import scipy +import pandas as pd +from pathlib import Path +from sklearn.metrics import adjusted_rand_score +from sklearn.cluster import KMeans +import scanpy as sc +import anndata +import logging + +import copy +from pathlib import Path +import functools +import subprocess +from arg_parse import * +from hmm_NB_BB_phaseswitch import * +from utils_distribution_fitting import * +from utils_hmrf import * +from hmrf import * +from phasing import * +from utils_IO import * +from find_integer_copynumber import * +from parse_input import * +from utils_plotting import * + +from matplotlib import pyplot as plt +from matplotlib.lines import Line2D +import matplotlib.patches as mpatches +import seaborn + +import mkl + import copy import functools import logging From e340a40d6e28a6b94292de99bfa52ebce1b6211e Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:42:50 -0400 Subject: [PATCH 026/125] revert import for hmm_nophasing --- src/calicost/hmm_NB_BB_nophasing.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index 32d94f3..b546989 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -1,19 +1,18 @@ -import copy import logging - -import networkx as nx import numpy as np -import scipy.special -import statsmodels.api as sm from numba import njit -from scipy.optimize import Bounds, minimize -from scipy.stats import multivariate_normal, norm, poisson +from scipy.stats import norm, multivariate_normal, poisson +import scipy.special +from scipy.optimize import minimize +from scipy.optimize import Bounds from sklearn.mixture import GaussianMixture -from statsmodels.base.model import GenericLikelihoodModel from tqdm import trange - +import statsmodels.api as sm +from statsmodels.base.model import GenericLikelihoodModel +import copy from calicost.utils_distribution_fitting import * from calicost.utils_hmm import * +import networkx as nx logger = logging.getLogger(__name__) @@ -518,4 +517,4 @@ def run_baum_welch_nb_bb( new_log_startprob, new_log_transmat, log_gamma, - ) \ No newline at end of file + ) From 25b35d1281a887abb67b9181f7fdcc476a175a39 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Thu, 15 Aug 2024 10:43:20 -0400 Subject: [PATCH 027/125] revert imports for hmm_NB_BB_nophasing_v2 --- src/calicost/hmm_NB_BB_nophasing_v2.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index a4408f6..61bc562 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -1,19 +1,18 @@ -import copy import logging - -import networkx as nx import numpy as np -import scipy.special -import statsmodels.api as sm from numba import njit -from scipy.optimize import Bounds, minimize -from scipy.stats import multivariate_normal, norm, poisson +from scipy.stats import norm, multivariate_normal, poisson +import scipy.special +from scipy.optimize import minimize +from scipy.optimize import Bounds from sklearn.mixture import GaussianMixture -from statsmodels.base.model import GenericLikelihoodModel from tqdm import trange - +import statsmodels.api as sm +from statsmodels.base.model import GenericLikelihoodModel +import copy from calicost.utils_distribution_fitting import * from calicost.utils_hmm import * +import networkx as nx logger = logging.getLogger(__name__) From bdd50568046cb1bf33bce3c2eb65b33c1ba39aa1 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:45:04 -0400 Subject: [PATCH 028/125] revert imports for hmm phaseswitch --- src/calicost/hmm_NB_BB_phaseswitch.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 89a0e53..f1aefc9 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -1,21 +1,20 @@ -import copy import logging - -import networkx as nx import numpy as np -import scipy.special -import statsmodels.api as sm from numba import njit -from scipy.optimize import Bounds, minimize -from scipy.stats import multivariate_normal, norm, poisson +from scipy.stats import norm, multivariate_normal, poisson +import scipy.special +from scipy.optimize import minimize +from scipy.optimize import Bounds from sklearn.mixture import GaussianMixture -from statsmodels.base.model import GenericLikelihoodModel from tqdm import trange - +import statsmodels.api as sm +from statsmodels.base.model import GenericLikelihoodModel +import copy +from calicost.utils_hmm import * +from calicost.utils_distribution_fitting import * from calicost.hmm_NB_BB_nophasing import * from calicost.hmm_NB_BB_nophasing_v2 import * -from calicost.utils_distribution_fitting import * -from calicost.utils_hmm import * +import networkx as nx logger = logging.getLogger(__name__) From 6cc7e6b05059736751a23b6b266d6fe56179c394 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Thu, 15 Aug 2024 10:48:11 -0400 Subject: [PATCH 029/125] revert imports for hmrf --- src/calicost/hmrf.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 322b61e..205e2bd 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1,26 +1,25 @@ -import copy import logging -import warnings -from pathlib import Path -# from turtle import reset - -# import networkx as nx +from turtle import reset import numpy as np import pandas as pd -import scipy.sparse -import scipy.special from numba import njit +import scipy.special +import scipy.sparse +from sklearn.mixture import GaussianMixture from sklearn.cluster import KMeans from sklearn.metrics import adjusted_rand_score, silhouette_score -from sklearn.mixture import GaussianMixture from sklearn.neighbors import kneighbors_graph -from statsmodels.tools.sm_exceptions import ValueWarning +import networkx as nx from tqdm import trange - +import copy +from pathlib import Path from calicost.hmm_NB_BB_phaseswitch import * from calicost.utils_distribution_fitting import * -from calicost.utils_hmrf import * from calicost.utils_IO import * +from calicost.utils_hmrf import * + +import warnings +from statsmodels.tools.sm_exceptions import ValueWarning logger = logging.getLogger(__name__) From 3d7f0b7ab49864055158837b1153636fb949639b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:48:21 -0400 Subject: [PATCH 030/125] revert imports for utils IO. --- src/calicost/utils_IO.py | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index bda22d8..d570a84 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -1,30 +1,23 @@ -import copy -import logging import sys -from pathlib import Path - -import anndata import numpy as np -import pandas as pd -import scanpy as sc import scipy -from sklearn.cluster import KMeans -from sklearn.kernel_ridge import KernelRidge +import copy +import pandas as pd +from pathlib import Path from sklearn.metrics import adjusted_rand_score from sklearn.neighbors import LocalOutlierFactor -""" -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", -) -""" -logger = logging.getLogger(__name__) +from sklearn.kernel_ridge import KernelRidge +from sklearn.cluster import KMeans +import scanpy as sc +import anndata +import logging +from calicost.utils_phase_switch import * +from calicost.utils_distribution_fitting import * import subprocess -from calicost.utils_distribution_fitting import * -from calicost.utils_phase_switch import * + +logger = logging.getLogger(__name__) def load_data( From 3b34e64edeec6df0b6c59f354343e8c5ca194687 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Thu, 15 Aug 2024 10:51:23 -0400 Subject: [PATCH 031/125] revert imports for utils hmm --- src/calicost/utils_hmm.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 9145ae5..65153de 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -1,12 +1,9 @@ -import copy -import logging - import numpy as np -import scipy.special from numba import njit -from sklearn.mixture import GaussianMixture +import copy +import scipy.special from tqdm import trange - +from sklearn.mixture import GaussianMixture from calicost.utils_distribution_fitting import * logger = logging.getLogger(__name__) @@ -2168,4 +2165,4 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( logger.info("Computed emission params for Beta Binomial Mix (no phasing, unique).") - return new_p_binom, new_taus \ No newline at end of file + return new_p_binom, new_taus From 627d6b9e79faad92029a901fad136ac35bc11967 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:51:35 -0400 Subject: [PATCH 032/125] revert imports for distribution fitting. --- src/calicost/utils_distribution_fitting.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 424cb52..6f10938 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -1,21 +1,21 @@ import functools import inspect import logging -import os -import time import numpy as np import scipy +from scipy import linalg, special +from scipy.special import logsumexp, loggamma import scipy.integrate import scipy.stats -import statsmodels -import statsmodels.api as sm from numba import jit, njit -from scipy import linalg, special -from scipy.special import loggamma, logsumexp from sklearn import cluster from sklearn.utils import check_random_state +import statsmodels +import statsmodels.api as sm from statsmodels.base.model import GenericLikelihoodModel +import os + logger = logging.getLogger(__name__) From 06a368b1d095b8ae6f3701d70d7b6075bbca09bd Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:52:21 -0400 Subject: [PATCH 033/125] add utils hmm njit import --- src/calicost/utils_hmm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 65153de..dfcba58 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -2,6 +2,7 @@ from numba import njit import copy import scipy.special +from numba import njit from tqdm import trange from sklearn.mixture import GaussianMixture from calicost.utils_distribution_fitting import * From d34362063c4dcf487bbcd71f6f71f4d6852510ef Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:55:12 -0400 Subject: [PATCH 034/125] fix calicost supervised imports. 
--- src/calicost/calicost_supervised.py | 31 ----------------------------- 1 file changed, 31 deletions(-) diff --git a/src/calicost/calicost_supervised.py b/src/calicost/calicost_supervised.py index 082b638..b74c5b6 100644 --- a/src/calicost/calicost_supervised.py +++ b/src/calicost/calicost_supervised.py @@ -31,37 +31,6 @@ import mkl -import copy -import functools -import logging -import subprocess -import sys -from pathlib import Path - -import anndata -import matplotlib.patches as mpatches -import mkl -import numpy as np -import pandas as pd -import scanpy as sc -import scipy -import seaborn -from matplotlib import pyplot as plt -from matplotlib.lines import Line2D -from sklearn.cluster import KMeans -from sklearn.metrics import adjusted_rand_score - -from arg_parse import * -from find_integer_copynumber import * -from hmm_NB_BB_phaseswitch import * -from hmrf import * -from parse_input import * -from phasing import * -from utils_distribution_fitting import * -from utils_hmrf import * -from utils_IO import * -from utils_plotting import * - # DEPRECATE # mkl.set_num_threads(1) From 91dfaa2f6f286e34ef9c1a432f47c92b0e243b01 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 11:17:07 -0400 Subject: [PATCH 035/125] fix runtime logging. --- src/calicost/calicost_main.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 0a0a5f2..75734e4 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -13,6 +13,7 @@ from pathlib import Path import functools import subprocess +import datetime from calicost.arg_parse import * from calicost.hmm_NB_BB_phaseswitch import * from calicost.utils_distribution_fitting import * @@ -1253,11 +1254,16 @@ def main(configuration_file): df_clone_label["tumor_proportion"] = single_tumor_prop logger.info(f"Writing clone labels to {outdir}/clone_labels.tsv") - + df_clone_label.to_csv( f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t" ) + end = datetime.datetime.now() + runtime = end - start + + logging.info(f"Complete in {runtime} [seconds].") + Path(f"{outdir}/plots").mkdir(parents=True, exist_ok=True) # NB plot RDR and BAF. @@ -1405,11 +1411,6 @@ def main(configuration_file): bbox_inches="tight", ) - end = datetime.datetime.now() - runtime = end - start - - logging.info(f"Complete in {runtime} [seconds].") - if __name__ == "__main__": parser = argparse.ArgumentParser() From 2c7c9b5c37b95fab293ce5c7124caa3e93e1e441 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 11:17:36 -0400 Subject: [PATCH 036/125] import bug --- src/calicost/utils_distribution_fitting.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 6f10938..191b22d 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -4,6 +4,7 @@ import numpy as np import scipy +import time from scipy import linalg, special from scipy.special import logsumexp, loggamma import scipy.integrate From 8303c62b5fd0549a54a470f3f65d794534f20179 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Thu, 15 Aug 2024 11:26:14 -0400 Subject: [PATCH 037/125] fix spelling mistake --- src/calicost/hmm_NB_BB_nophasing_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 61bc562..3d9bdf8 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -149,7 +149,7 @@ def compute_emission_probability_nb_betabinom_mix( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ - logger.info("Computing emission probability for *mixed* negative binomial & beta binommial.") + logger.info("Computing emission probability for *mixed* negative binomial & beta binomial.") n_obs = X.shape[0] n_comp = X.shape[1] @@ -210,7 +210,7 @@ def compute_emission_probability_nb_betabinom_mix( mix_p_B * taus[i, s], ) - logger.info("Computed emission probability for *mixed* negative binomial & beta binommial.") + logger.info("Computed emission probability for *mixed* negative binomial & beta binomial.") return log_emission_rdr, log_emission_baf From 9ed0df57c577d0ed5519518f93c8cde9938e856a Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 13:04:54 -0400 Subject: [PATCH 038/125] log comparative likelihoods. --- src/calicost/utils_hmm.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index dfcba58..5c4fa29 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -470,7 +470,6 @@ def update_emission_params_nb_sitewise_uniqvalues( n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) - # initialization new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None @@ -614,7 +613,15 @@ def update_emission_params_nb_sitewise_uniqvalues( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + logger.info(f"") + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) From 96d6fa351f8fc174ef23cf9a8dd814a2450d0706 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Thu, 15 Aug 2024 15:30:36 -0400 Subject: [PATCH 039/125] fix --- src/calicost/utils_distribution_fitting.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 191b22d..b343868 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -85,7 +85,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - logger.info(f"Starting Weighted_NegativeBinomial optimization with start_params = {start_params}.") + logger.info(f"Starting Weighted_NegativeBinomial optimization @ {start_params}.") start = time.time() @@ -139,7 +139,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - logger.info(f"Starting Weighted_NegativeBinomial_mix optimization with start_params = {start_params}.") + logger.info(f"Starting Weighted_NegativeBinomial_mix optimization @ {start_params}.") start = time.time() @@ -206,7 +206,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) - logger.info(f"Starting Weighted_BetaBinomial optimization with start_params = {start_params}.") + logger.info(f"Starting Weighted_BetaBinomial optimization @ {start_params}.") start = time.time() @@ -261,7 +261,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) - logger.info(f"Starting Weighted_BetaBinom_mix optimization with start_params = {start_params}.") + logger.info(f"Starting Weighted_BetaBinom_mix optimization with @ {start_params}.") start = time.time() @@ -307,7 +307,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): else: start_params = 0.1 * np.ones(self.nparams) - logger.info(f"Starting Weighted_BetaBinom_fixdispersion optimization with start_params = {start_params}.") + logger.info(f"Starting Weighted_BetaBinom_fixdispersion optimization @ {start_params}.") start = time.time() @@ -359,7 +359,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): else: start_params = 0.1 * np.ones(self.nparams) - logger.info(f"Starting Weighted_BetaBinom_fixdispersion_mix optimization with start_params = {start_params}.") + logger.info(f"Starting Weighted_BetaBinom_fixdispersion_mix optimization @ {start_params}.") start = time.time() @@ -401,29 +401,27 @@ class BAF_Binom(GenericLikelihoodModel): exposure : array, (n_samples,) Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
""" - def __init__(self, endog, exog, weights, exposure, offset, scaling, **kwds): super(BAF_Binom, self).__init__(endog, exog, **kwds) + self.weights = weights self.exposure = exposure self.offset = offset self.scaling = scaling - # def nloglikeobs(self, params): linear_term = self.exog @ params p = self.scaling / (1 + np.exp(-linear_term + self.offset)) - llf = scipy.stats.binom.logpmf(self.endog, self.exposure, p) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # + return -scipy.stats.binom.logpmf(self.endog, self.exposure, p).dot(self.weights) + def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params else: start_params = 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams) + return super(BAF_Binom, self).fit( start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds ) From 9d3c141fceae9f65fef94c265d97c370f61fb795 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 15:45:17 -0400 Subject: [PATCH 040/125] fix --- src/calicost/calicost_main.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 75734e4..9e52821 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -25,21 +25,20 @@ from calicost.parse_input import * from calicost.utils_plotting import * +# NB prevent wrap-around of log lines due to high precision printing. +np.set_printoptions(precision=6) + logger = logging.getLogger("calicost") logger.setLevel(logging.INFO) handler = logging.StreamHandler(sys.stdout) -# fhandler = logging.FileHandler('calicost.log', mode="w") - formatter = logging.Formatter( "%(asctime)s - %(process)d - %(levelname)s - %(name)s:%(lineno)d - %(message)s" ) handler.setFormatter(formatter) -# fhandler.setFormatter(formatter) logger.addHandler(handler) -# logger.addHandler(fhandler) def main(configuration_file): From 690fd305f6152e63c3c81ea80d67d2d8cba72774 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 09:55:05 -0400 Subject: [PATCH 041/125] improve runtime logging --- src/calicost/hmm_NB_BB_nophasing_v2.py | 8 ++++---- src/calicost/hmrf.py | 24 +++++++++++++++++++++--- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 3d9bdf8..fcd7c67 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -72,7 +72,7 @@ def compute_emission_probability_nb_betabinom( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ - logger.info("Computing emission probability for negative binomial & beta binomial.") + # logger.info("Computing emission probability for negative binomial & beta binomial.") n_obs = X.shape[0] n_comp = X.shape[1] @@ -104,7 +104,7 @@ def compute_emission_probability_nb_betabinom( ) ) - logger.info("Computed emission probability for negative binomial & beta binomial.") + # logger.info("Computed emission probability for negative binomial & beta binomial.") return log_emission_rdr, log_emission_baf @@ -149,7 +149,7 @@ def compute_emission_probability_nb_betabinom_mix( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. 
There is a common bag of states across all spots. """ - logger.info("Computing emission probability for *mixed* negative binomial & beta binomial.") + # logger.info("Computing emission probability for *mixed* negative binomial & beta binomial.") n_obs = X.shape[0] n_comp = X.shape[1] @@ -210,7 +210,7 @@ def compute_emission_probability_nb_betabinom_mix( mix_p_B * taus[i, s], ) - logger.info("Computed emission probability for *mixed* negative binomial & beta binomial.") + # logger.info("Computed emission probability for *mixed* negative binomial & beta binomial.") return log_emission_rdr, log_emission_baf diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 205e2bd..b47485f 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -59,7 +59,7 @@ def hmrf_reassignment_posterior( posterior = np.zeros((N, n_clones)) - logger.info("Computing hmrf_reassignment_posterior") + logger.info("Computing hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") for i in trange(N, desc="hmrf_reassignment_posterior"): idx = smooth_mat[i, :].nonzero()[1] @@ -132,6 +132,9 @@ def hmrf_reassignment_posterior( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info("Computed hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -157,7 +160,6 @@ def aggr_hmrf_reassignment( Choosing clones by Iterated Conditional Modes (Viterbi version): for which the emission probability of each spot is a single of HMM state sequence. Input format assumption: the RDR/BAF vectors are not shared across clones <- after clone refinement with RDR+BAF signals. - """ N = single_X.shape[2] n_obs = single_X.shape[0] @@ -168,9 +170,11 @@ def aggr_hmrf_reassignment( posterior = np.zeros((N, n_clones)) + logger.info("Computing aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") + for i in trange(N, desc="aggr_hmrf_reassignment"): idx = smooth_mat[i, :].nonzero()[1] - # idx = np.append(idx, np.array([i])) + for c in range(n_clones): tmp_log_emission_rdr, tmp_log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( @@ -224,6 +228,9 @@ def aggr_hmrf_reassignment( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info("Computed aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -256,6 +263,8 @@ def hmrf_reassignment_posterior_concatenate( posterior = np.zeros((N, n_clones)) + logger.info("Computing hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") + for i in trange(N, desc="hmrf_reassignment_posterior_concatenate"): idx = smooth_mat[i, :].nonzero()[1] tmp_log_emission_rdr, tmp_log_emission_baf = ( @@ -329,6 +338,9 @@ def hmrf_reassignment_posterior_concatenate( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info("Computed hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -393,6 +405,9 @@ def aggr_hmrf_reassignment_concatenate( total_llf : float The HMRF objective, which is the sum of log likelihood under the optimal labels plus the sum of edge potentials. 
""" + + logger.info("Computing aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") + N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = int(len(pred) / n_obs) @@ -458,6 +473,9 @@ def aggr_hmrf_reassignment_concatenate( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info("Computed aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: From c11b2530b5e22bcb37c5bb67f115b99198912763 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 09:58:57 -0400 Subject: [PATCH 042/125] update utils hmm logging to show n spots. --- src/calicost/utils_hmm.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 5c4fa29..95be54e 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -464,12 +464,12 @@ def update_emission_params_nb_sitewise_uniqvalues( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ - logger.info("Computing emission params for Negative Binomial (sitewise, unique).") - n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) + logger.info("Computing emission params for Negative Binomial (sitewise, unique) with {n_spots} spots and {n_states} states.") + new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None @@ -631,7 +631,7 @@ def update_emission_params_nb_sitewise_uniqvalues( new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr - logger.info("Computed emission params for Negative Binomial (sitewise, unique).") + logger.info("Computed emission params for Negative Binomial (sitewise, unique) with {n_spots} spots and {n_states} states.") return new_log_mu, new_alphas @@ -661,11 +661,12 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ - logger.info("Computing emission params for Negative Binomial Mix (sitewise, unique).") - n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) + + logger.info("Computing emission params for Negative Binomial Mix (sitewise, unique) for {n_spots} spots and {n_states} states.") + # initialization new_log_mu = ( copy.copy(start_log_mu) @@ -841,7 +842,7 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr - logger.info("Computed emission params for Negative Binomial Mix (sitewise, unique).") + logger.info("Computed emission params for Negative Binomial Mix (sitewise, unique) for {n_spots} spots and {n_states} states.") return new_log_mu, new_alphas @@ -871,11 +872,12 @@ def update_emission_params_bb_sitewise_uniqvalues( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. 
""" - logger.info("Computing emission params for Beta Binomial (sitewise, unique).") - n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) + + logger.info("Computing emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") + # initialization new_p_binom = ( copy.copy(start_p_binom) @@ -1066,7 +1068,7 @@ def update_emission_params_bb_sitewise_uniqvalues( new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob - logger.info("Computed emission params for Beta Binomial (sitewise, unique).") + logger.info("Computed emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") return new_p_binom, new_taus @@ -1097,11 +1099,12 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ - logger.info("Computing emission params for Beta Binomial Mix (sitewise, unique).") - n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) + + logger.info("Computing emission params for Beta Binomial Mix (sitewise, unique) for {n_spots} spots and {n_states} states.") + # initialization new_p_binom = ( copy.copy(start_p_binom) @@ -1322,7 +1325,7 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob - logger.info("Computed emission params for Beta Binomial Mix (sitewise, unique).") + logger.info("Computed emission params for Beta Binomial Mix (sitewise, unique) for {n_spots} spots and {n_states} states.") return new_p_binom, new_taus From 0a3e3d081676844fdcc97c4f9a5927c76c5a3642 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 13:36:51 -0400 Subject: [PATCH 043/125] fix logging error from multiple args --- src/calicost/hmm_NB_BB_nophasing_v2.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index fcd7c67..fe13c5f 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -567,19 +567,18 @@ def run_baum_welch_nb_bb( # check convergence logger.info( - "EM convergence metrics (v2)", - np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), - np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), - np.mean(np.abs(new_log_mu - log_mu)), - np.mean(np.abs(new_p_binom - p_binom)), + f"EM convergence metrics (v2): {np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob)))}, {np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat)))}, {np.mean(np.abs(new_log_mu - log_mu))}, {np.mean(np.abs(new_p_binom - p_binom))}" ) + logger.info(np.hstack([new_log_mu, new_p_binom])) + if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol ): break + log_startprob = new_log_startprob log_transmat = new_log_transmat log_mu = new_log_mu From 7118f8d83ef298a5a1ffb6cb7189b504e990e20d Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Fri, 16 Aug 2024 14:06:46 -0400 Subject: [PATCH 044/125] fix typo in hmrf logging --- src/calicost/hmrf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index b47485f..1305f8e 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -709,7 +709,7 @@ def hmrf_pipeline( # clone assignmment if nodepotential == "max": - logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfix_reassignment.") + logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment.") new_assignment, single_llf, total_llf = aggr_hmrf_reassignment( single_X, @@ -726,7 +726,7 @@ def hmrf_pipeline( hmmclass=hmmclass, ) elif nodepotential == "weighted_sum": - logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfix_reassignment_posterior.") + logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior.") new_assignment, single_llf, total_llf = hmrf_reassignment_posterior( single_X, @@ -2060,7 +2060,7 @@ def hmrfmix_concatenate_pipeline( # NB HMRF clone assignmment if nodepotential == "max": - logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfix_reassignment_concatenate.") + logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment_concatenate.") new_assignment, single_llf, total_llf = ( aggr_hmrfmix_reassignment_concatenate( @@ -2080,7 +2080,7 @@ def hmrfmix_concatenate_pipeline( ) ) elif nodepotential == "weighted_sum": - logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfix_reassignment_posterior_concatenate.") + logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate.") new_assignment, single_llf, total_llf = ( hmrfmix_reassignment_posterior_concatenate( From 41e0740f6619868a487e77934f612382bbb18f96 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 14:11:47 -0400 Subject: [PATCH 045/125] update logging strings. --- src/calicost/hmrf.py | 187 +++++++++++++++++++++++++++---------------- 1 file changed, 117 insertions(+), 70 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 1305f8e..cf1d68c 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -27,6 +27,7 @@ # Pure clone ############################################################ + def hmrf_reassignment_posterior( single_X, single_base_nb_mean, @@ -56,11 +57,13 @@ def hmrf_reassignment_posterior( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) # node potential new_assignment = copy.copy(prev_assignment) - + posterior = np.zeros((N, n_clones)) - logger.info("Computing hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + f"Computing hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass} for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." + ) + for i in trange(N, desc="hmrf_reassignment_posterior"): idx = smooth_mat[i, :].nonzero()[1] for c in range(n_clones): @@ -133,8 +136,10 @@ def hmrf_reassignment_posterior( ) ) - logger.info("Computed hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computed hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." 
+ ) + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -167,11 +172,13 @@ def aggr_hmrf_reassignment( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - + posterior = np.zeros((N, n_clones)) - logger.info("Computing aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computing aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + for i in trange(N, desc="aggr_hmrf_reassignment"): idx = smooth_mat[i, :].nonzero()[1] @@ -229,8 +236,10 @@ def aggr_hmrf_reassignment( ) ) - logger.info("Computed aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computed aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -260,11 +269,13 @@ def hmrf_reassignment_posterior_concatenate( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - + posterior = np.zeros((N, n_clones)) - logger.info("Computing hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computing hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + for i in trange(N, desc="hmrf_reassignment_posterior_concatenate"): idx = smooth_mat[i, :].nonzero()[1] tmp_log_emission_rdr, tmp_log_emission_baf = ( @@ -339,8 +350,10 @@ def hmrf_reassignment_posterior_concatenate( ) ) - logger.info("Computed hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computed hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -406,15 +419,17 @@ def aggr_hmrf_reassignment_concatenate( The HMRF objective, which is the sum of log likelihood under the optimal labels plus the sum of edge potentials. """ - logger.info("Computing aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computing aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = int(len(pred) / n_obs) n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - + posterior = np.zeros((N, n_clones)) for i in trange(N, desc="aggr_hmrf_reassignment_concatenate"): @@ -474,8 +489,10 @@ def aggr_hmrf_reassignment_concatenate( ) ) - logger.info("Computed aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computed aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -492,7 +509,7 @@ def merge_by_minspots( threshold=0.5, ): logger.info("Merging by min. spots.") - + n_clones = len(np.unique(assignment)) if n_clones == 1: merged_groups = [[assignment[0]]] @@ -575,9 +592,9 @@ def merge_by_minspots( for c in merging_groups ] ) - + logger.info("Merged by min. 
spots.") - + return merging_groups, merged_res @@ -616,13 +633,13 @@ def hmrf_pipeline( spatial_weight=1.0, ): logger.info("Solving hmrf_pipeline.") - + n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) - + # NB checking input assert not (coords is None and adjacency_mat is None) - + if adjacency_mat is None: adjacency_mat = compute_adjacency_mat(coords, unit_xsquared, unit_ysquared) if sample_ids is None: @@ -636,14 +653,14 @@ def hmrf_pipeline( log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) logger.info("Merging pseudobulk by clone index") - + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index ) if (init_log_mu is None) or (init_p_binom is None): logger.info("Initializing HMM parameters by GMM") - + init_log_mu, init_p_binom = initialization_by_gmm( n_states, X, @@ -656,7 +673,7 @@ def hmrf_pipeline( ) else: logger.info("Using provided HMM initialization parameters") - + # NB initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu @@ -674,12 +691,12 @@ def hmrf_pipeline( last_assignment[idx] = c logger.info(f"Computing HMM for {max_iter_outer} iterations.") - + for r in range(max_iter_outer): # NB initialize with the parameters of last iteration if not Path(f"{outdir}/round{r}_nstates{n_states}_{params}.npz").exists(): logger.info(f"Computing HMM iteration {r}.") - + res = pipeline_baum_welch( None, X, @@ -704,13 +721,15 @@ def hmrf_pipeline( max_iter=max_iter, tol=tol, ) - + pred = np.argmax(res["log_gamma"], axis=0) - + # clone assignmment if nodepotential == "max": - logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment.") - + logger.info( + "Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment." + ) + new_assignment, single_llf, total_llf = aggr_hmrf_reassignment( single_X, single_base_nb_mean, @@ -726,8 +745,10 @@ def hmrf_pipeline( hmmclass=hmmclass, ) elif nodepotential == "weighted_sum": - logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior.") - + logger.info( + "Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior." 
+ ) + new_assignment, single_llf, total_llf = hmrf_reassignment_posterior( single_X, single_base_nb_mean, @@ -743,26 +764,28 @@ def hmrf_pipeline( ) else: raise ValueError("Unknown mode for nodepotential!") - + # NB handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) - + res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf - logger.info(f"Writing HMM iteration {r} to {outdir}/round{r}_nstates{n_states}_{params}.npz") - + logger.info( + f"Writing HMM iteration {r} to {outdir}/round{r}_nstates{n_states}_{params}.npz" + ) + np.savez(f"{outdir}/round{r}_nstates{n_states}_{params}.npz", **res) else: logger.info(f"Loading pre-computed HMM results for iteration {r}.") logger.info(f"Loading {outdir}/round{r}_nstates{n_states}_{params}.npz") - + res = np.load(f"{outdir}/round{r}_nstates{n_states}_{params}.npz") logger.info(f"Regrouping to pseudobulk for iteration {r}.") @@ -771,7 +794,7 @@ def hmrf_pipeline( np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) ] - + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, clone_index ) @@ -801,26 +824,26 @@ def hmrf_pipeline( np.mean(np.abs(last_p_binom - res["new_p_binom"])), ) ) - + logger.info( "outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) - + if ( adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1 ): break - + last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] last_alphas = res["new_alphas"] last_taus = res["new_taus"] last_assignment = res["new_assignment"] log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) - + for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] this_persample_weight = np.bincount( @@ -932,7 +955,7 @@ def hmrf_concatenate_pipeline( logger.info(f"Computing HMM for {max_iter_outer} iterations.") for r in range(max_iter_outer): - # NB assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. + # NB assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. # When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should # contain two keys: "num_iterations" and f"round_-1_assignment" for clone initialization logger.info(f"Loading {outdir}/{prefix}_nstates{n_states}_{params}.npz") @@ -993,7 +1016,9 @@ def hmrf_concatenate_pipeline( # NB HMRF clone assignmment if nodepotential == "max": - logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrf_reassignment_concatenate.") + logger.info( + "Assigning HMRF clone with nodepotential=max & aggr_hmrf_reassignment_concatenate." + ) new_assignment, single_llf, total_llf = ( aggr_hmrf_reassignment_concatenate( @@ -1012,7 +1037,9 @@ def hmrf_concatenate_pipeline( ) ) elif nodepotential == "weighted_sum": - logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrf_reassignment_posterior_concatenate.") + logger.info( + "Assigning HMRF clone with nodepotential=weighted_sum & hmrf_reassignment_posterior_concatenate." 
+ ) new_assignment, single_llf, total_llf = ( hmrf_reassignment_posterior_concatenate( @@ -1043,7 +1070,7 @@ def hmrf_concatenate_pipeline( ) res["log_gamma"] = res["log_gamma"][:, concat_idx] res["pred_cnv"] = res["pred_cnv"][concat_idx] - + res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf @@ -1059,10 +1086,12 @@ def hmrf_concatenate_pipeline( allres["num_iterations"] = r + 1 - logger.info(f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz") + logger.info( + f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" + ) np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) - + logger.info(f"Regrouping to pseudobulk for iteration {r}.") clone_index = [ @@ -1072,7 +1101,7 @@ def hmrf_concatenate_pipeline( X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, clone_index ) - + if "mp" in params: logger.info( "outer iteration {}: difference between parameters = {}, {}".format( @@ -1218,9 +1247,10 @@ def aggr_hmrfmix_reassignment( + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) - # - # compute total log likelihood log P(X | Z) + log P(Z) + + # NB compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) + for i in range(N): total_llf += np.sum( spatial_weight @@ -1255,14 +1285,19 @@ def hmrfmix_reassignment_posterior( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + + logger.info( + f"Computing hmrfmix_reassignment_posterior for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." + ) + lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) - # + posterior = np.zeros((N, n_clones)) for i in trange(N): idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] + for c in range(n_clones): if np.sum(single_base_nb_mean) > 0: this_pred_cnv = res["pred_cnv"][:, c] @@ -1277,6 +1312,7 @@ def hmrfmix_reassignment_posterior( } else: kwargs = {} + tmp_log_emission_rdr, tmp_log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:, :, idx], axis=2, keepdims=True), @@ -1290,6 +1326,7 @@ def hmrfmix_reassignment_posterior( **kwargs, ) ) + if ( np.sum(single_base_nb_mean[:, idx] > 0) > 0 and np.sum(single_total_bb_RD[:, idx] > 0) > 0 @@ -1299,7 +1336,7 @@ def hmrfmix_reassignment_posterior( * np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) / np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) ) - # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) + single_llf[i, c] = ratio_nonzeros * np.sum( scipy.special.logsumexp( tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], @@ -1327,20 +1364,21 @@ def hmrfmix_reassignment_posterior( w_node = single_llf[i, :] w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) + for j in adjacency_mat[i, :].nonzero()[1]: if new_assignment[j] >= 0: - # w_edge[new_assignment[j]] += 1 w_edge[new_assignment[j]] += adjacency_mat[i, j] new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) - # + posterior[i, :] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) - # compute total log likelihood log P(X | Z) + log P(Z) + # NB compute total log likelihood log P(X | Z) + 
log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) + for i in range(N): total_llf += np.sum( spatial_weight @@ -1348,6 +1386,9 @@ def hmrfmix_reassignment_posterior( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info(f"Computed hmrfmix_reassignment_posterior.") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -1949,7 +1990,7 @@ def hmrfmix_concatenate_pipeline( # NB baseline proportion of UMI counts lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) - + if (init_log_mu is None) or (init_p_binom is None): logger.info("Initializing HMM parameters by GMM") @@ -2057,10 +2098,12 @@ def hmrfmix_concatenate_pipeline( ) pred = np.argmax(res["log_gamma"], axis=0) - + # NB HMRF clone assignmment if nodepotential == "max": - logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment_concatenate.") + logger.info( + "Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment_concatenate." + ) new_assignment, single_llf, total_llf = ( aggr_hmrfmix_reassignment_concatenate( @@ -2080,7 +2123,9 @@ def hmrfmix_concatenate_pipeline( ) ) elif nodepotential == "weighted_sum": - logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate.") + logger.info( + "Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate." + ) new_assignment, single_llf, total_llf = ( hmrfmix_reassignment_posterior_concatenate( @@ -2126,10 +2171,12 @@ def hmrfmix_concatenate_pipeline( allres[f"round{r}_{k}"] = v allres["num_iterations"] = r + 1 - logger.info(f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz") + logger.info( + f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" + ) np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) - + logger.info(f"Regrouping to pseudobulk for iteration {r}.") clone_index = [ @@ -2144,7 +2191,7 @@ def hmrfmix_concatenate_pipeline( single_tumor_prop, threshold=tumorprop_threshold, ) - + if "mp" in params: logger.info( "outer iteration {}: difference between parameters = {}, {}".format( @@ -2165,7 +2212,7 @@ def hmrfmix_concatenate_pipeline( r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) ) ) - + logger.info( "outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) From ffe9f5d52dd0cdb53e8f7589ff8c2caa1761cff5 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Fri, 16 Aug 2024 14:27:29 -0400 Subject: [PATCH 046/125] logging edits --- src/calicost/hmm_NB_BB_nophasing_v2.py | 6 +++--- src/calicost/hmrf.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index fe13c5f..ab60265 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -565,12 +565,9 @@ def run_baum_welch_nb_bb( new_p_binom = p_binom new_taus = taus - # check convergence logger.info( f"EM convergence metrics (v2): {np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob)))}, {np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat)))}, {np.mean(np.abs(new_log_mu - log_mu))}, {np.mean(np.abs(new_p_binom - p_binom))}" ) - - logger.info(np.hstack([new_log_mu, new_p_binom])) if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol @@ -588,6 +585,9 @@ def run_baum_welch_nb_bb( logger.info("Computed Baum-Welch (v2).") + logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") + logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") + return ( new_log_mu, new_alphas, diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index cf1d68c..8d177dc 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -724,7 +724,7 @@ def hmrf_pipeline( pred = np.argmax(res["log_gamma"], axis=0) - # clone assignmment + # NB clone assignmment if nodepotential == "max": logger.info( "Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment." From c75ec89d4ace3861f84a7ab2b2ce681dc6b4e70c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 14:30:24 -0400 Subject: [PATCH 047/125] log hmrfmix_reassignment_posterior_concatenate. --- src/calicost/hmrf.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 8d177dc..04f32bf 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1705,8 +1705,13 @@ def hmrfmix_reassignment_posterior_concatenate( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + + logger.info( + f"Computing hmrfmix_reassignment_posterior_concatenate for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." 
+ ) + lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) + if np.sum(single_base_nb_mean) > 0: logmu_shift = [] for c in range(n_clones): @@ -1729,7 +1734,7 @@ def hmrfmix_reassignment_posterior_concatenate( } else: kwargs = {} - # + posterior = np.zeros((N, n_clones)) for i in trange(N): @@ -1759,7 +1764,7 @@ def hmrfmix_reassignment_posterior_concatenate( * np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) / np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) ) - # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) + single_llf[i, c] = ratio_nonzeros * np.sum( scipy.special.logsumexp( tmp_log_emission_rdr[:, :, 0] @@ -1790,19 +1795,20 @@ def hmrfmix_reassignment_posterior_concatenate( w_node = single_llf[i, :] w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) + for j in adjacency_mat[i, :].nonzero()[1]: - # w_edge[new_assignment[j]] += 1 w_edge[new_assignment[j]] += adjacency_mat[i, j] new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) - # + posterior[i, :] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) - # compute total log likelihood log P(X | Z) + log P(Z) + # NB compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) + for i in range(N): total_llf += np.sum( spatial_weight @@ -1810,6 +1816,9 @@ def hmrfmix_reassignment_posterior_concatenate( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info(f"Computed hmrfmix_reassignment_posterior_concatenate.") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: From 8f3eb261be5f7a9af70727b0dbe0e1ff297b12ec Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 14:37:43 -0400 Subject: [PATCH 048/125] more logging improvements. remove deprecated code. 
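
Note: across this series the ad-hoc print() dumps are being replaced by calls on a module logger, and the fitted-parameter printouts are collapsed into a single np.hstack table. A minimal sketch of the assumed pattern follows; the handler/format configuration (basicConfig below) is not part of these patches and is only an illustrative assumption.

    import logging

    import numpy as np

    # Assumed module-level logger; the project's actual handler setup is not
    # shown in this series, so basicConfig() here is a placeholder.
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(name)s %(levelname)s: %(message)s")
    logger = logging.getLogger(__name__)

    def log_fitted_params(new_log_mu, new_p_binom):
        # Both arrays are (n_states, n_spots); hstack places them side by side,
        # so each logged row reads (log_mu per spot ..., p_binom per spot ...).
        logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}")
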
--- src/calicost/hmm_NB_BB_phaseswitch.py | 134 +------------------------- src/calicost/hmrf.py | 18 ++-- 2 files changed, 14 insertions(+), 138 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index f1aefc9..d2683a4 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -859,8 +859,8 @@ def pipeline_baum_welch( init_log_mu = tmp_log_mu if (init_p_binom is None) and ("p" in params): init_p_binom = tmp_p_binom - print(f"init_log_mu = {init_log_mu}") - print(f"init_p_binom = {init_p_binom}") + + logger.info(f"Initial (mu, p):\n{np.hstack([init_log_mu, init_p_binom])}") # fit HMM-NB-BetaBinom # new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat = hmmmodel.run_baum_welch_nb_bb(X, lengths, \ @@ -1426,11 +1426,12 @@ def combine_similar_states_across_clones( n_states = res["new_p_binom"].shape[0] reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states - # + all_test_statistics = compute_neymanpearson_stats( X, base_nb_mean, total_bb_RD, res, params, tumor_prop, hmmclass ) - # make the pair of states consistent between clone c1 and clone c2 if their t_neymanpearson test statistics is small + + # NB make the pair of states consistent between clone c1 and clone c2 if their t_neymanpearson test statistics is small for c1 in range(n_clones): for c2 in range(c1 + 1, n_clones): list_t_neymanpearson = all_test_statistics[(c1, c2)] @@ -1455,128 +1456,3 @@ def combine_similar_states_across_clones( f"Merging states {[p1,p2]} in clone {c1} and clone {c2}. NP statistics = {t_neymanpearson}" ) return res - - -# def similarity_components_rdrbaf_neymanpearson_posterior(X, base_nb_mean, total_bb_RD, res, threshold=2.0, minlength=10, topk=10, params="smp", tumor_prop=None, hmmclass=hmm_sitewise): -# n_obs = X.shape[0] -# n_states = res["new_p_binom"].shape[0] -# n_clones = X.shape[2] -# G = nx.Graph() -# G.add_nodes_from( np.arange(n_clones) ) -# # -# def eval_neymanpearson_bafonly(log_emission_baf_c1, log_gamma_c1, log_emission_baf_c2, log_gamma_c2, bidx, n_states, res, p): -# assert log_emission_baf_c1.shape[0] == n_states or log_emission_baf_c1.shape[0] == 2 * n_states -# # likelihood under the corresponding state -# llf_original = np.append(scipy.special.logsumexp(log_emission_baf_c1[:, bidx] + log_gamma_c1[:, bidx], axis=0), -# scipy.special.logsumexp(log_emission_baf_c2[:, bidx] + log_gamma_c2[:, bidx], axis=0)) -# # likelihood under the switched state -# if log_emission_baf_c1.shape[0] == 2 * n_states: -# whether_switch = False -# pred_c1 = np.argmax(log_gamma_c1[:,bidx[0]]) -# pred_c2 = np.argmax(log_gamma_c2[:,bidx[0]]) -# if ( ((res["new_p_binom"][p[0],0] > 0.5) == (res["new_p_binom"][p[1],0] > 0.5)) ^ ((pred_c1 < n_states) == (pred_c2 < n_states)) ): -# whether_switch = True -# if not whether_switch: -# switch_log_gamma_c1 = log_gamma_c2 -# switch_log_gamma_c2 = log_gamma_c1 -# else: -# switch_log_gamma_c1 = np.vstack([log_gamma_c2[:n_states,:], log_gamma_c2[n_states:,:]]) -# switch_log_gamma_c2 = np.vstack([log_gamma_c1[:n_states,:], log_gamma_c1[n_states:,:]]) -# else: -# switch_log_gamma_c1 = log_gamma_c2 -# switch_log_gamma_c2 = log_gamma_c1 -# llf_switch = np.append(scipy.special.logsumexp(log_emission_baf_c1[:, bidx] + switch_log_gamma_c1[:, bidx], axis=0), -# scipy.special.logsumexp(log_emission_baf_c2[:, bidx] + switch_log_gamma_c2[:, bidx], axis=0)) -# # log likelihood difference -# return 
np.mean(llf_original) - np.mean(llf_switch) -# # -# def eval_neymanpearson_rdrbaf(log_emission_rdr_c1, log_emission_baf_c1, log_gamma_c1, log_emission_rdr_c2, log_emission_baf_c2, log_gamma_c2, bidx, n_states, res, p): -# assert log_emission_baf_c1.shape[0] == n_states or log_emission_baf_c1.shape[0] == 2 * n_states -# # likelihood under the corresponding state -# llf_original = 0.5 * np.append(scipy.special.logsumexp((log_emission_rdr_c1+log_emission_baf_c1)[:, bidx] + log_gamma_c1[:, bidx], axis=0), \ -# scipy.special.logsumexp((log_emission_rdr_c2+log_emission_baf_c2)[:, bidx] + log_gamma_c2[:, bidx], axis=0)) -# # likelihood under the switched state -# if log_emission_baf_c1.shape[0] == 2 * n_states: -# whether_switch = False -# pred_c1 = np.argmax(log_gamma_c1[:,bidx[0]]) -# pred_c2 = np.argmax(log_gamma_c2[:,bidx[0]]) -# if ( ((res["new_p_binom"][p[0],0] > 0.5) == (res["new_p_binom"][p[1],0] > 0.5)) ^ ((pred_c1 < n_states) == (pred_c2 < n_states)) ): -# whether_switch = True -# if not whether_switch: -# switch_log_gamma_c1 = log_gamma_c2 -# switch_log_gamma_c2 = log_gamma_c1 -# else: -# switch_log_gamma_c1 = np.vstack([log_gamma_c2[:n_states,:], log_gamma_c2[n_states:,:]]) -# switch_log_gamma_c2 = np.vstack([log_gamma_c1[:n_states,:], log_gamma_c1[n_states:,:]]) -# else: -# switch_log_gamma_c1 = log_gamma_c2 -# switch_log_gamma_c2 = log_gamma_c1 -# llf_switch = 0.5 * np.append(scipy.special.logsumexp((log_emission_rdr_c1+log_emission_baf_c1)[:, bidx] + switch_log_gamma_c1[:, bidx], axis=0), \ -# scipy.special.logsumexp((log_emission_rdr_c2+log_emission_baf_c2)[:, bidx] + switch_log_gamma_c2[:, bidx], axis=0)) -# # log likelihood difference -# return np.mean(llf_original) - np.mean(llf_switch) -# # -# if tumor_prop is None: -# log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ -# base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ -# total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"]) -# else: -# log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ -# base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ -# total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"], tumor_prop) -# log_emission_rdr = log_emission_rdr.reshape((log_emission_rdr.shape[0], n_obs, n_clones), order="F") -# log_emission_baf = log_emission_baf.reshape((log_emission_baf.shape[0], n_obs, n_clones), order="F") -# reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2],-1)) -# reshaped_pred_cnv = reshaped_pred % n_states -# reshaped_log_gamma = np.stack([ res["log_gamma"][:,(c*n_obs):(c*n_obs + n_obs)] for c in range(n_clones) ], axis=-1) -# for c1 in range(n_clones): -# for c2 in range(c1+1, n_clones): -# # unmergeable_bincount = 0 -# unique_pair_states = [x for x in np.unique(reshaped_pred_cnv[np.array([c1,c2]), :], axis=1).T if x[0] != x[1]] -# list_t_neymanpearson = [] -# for p in unique_pair_states: -# bidx = np.where( (reshaped_pred_cnv[c1,:]==p[0]) & (reshaped_pred_cnv[c2,:]==p[1]) )[0] -# if "m" in params and "p" in params: -# t_neymanpearson = eval_neymanpearson_rdrbaf(log_emission_rdr[:,:,c1], log_emission_baf[:,:,c1], reshaped_log_gamma[:,:,c1], log_emission_rdr[:,:,c2], log_emission_baf[:,:,c2], reshaped_log_gamma[:,:,c2], bidx, n_states, res, p) -# elif "p" in 
params: -# t_neymanpearson = eval_neymanpearson_bafonly(log_emission_baf[:,:,c1], reshaped_log_gamma[:,:,c1], log_emission_baf[:,:,c2], reshaped_log_gamma[:,:,c2], bidx, n_states, res, p) -# # if t_neymanpearson > threshold: -# # unmergeable_bincount += len(bidx) -# print(c1, c2, p, len(bidx), t_neymanpearson) -# if len(bidx) >= minlength: -# list_t_neymanpearson.append(t_neymanpearson) -# if len(list_t_neymanpearson) == 0 or np.max(list_t_neymanpearson) < threshold: -# max_v = np.max(list_t_neymanpearson) if len(list_t_neymanpearson) > 0 else 1e-3 -# G.add_weighted_edges_from([ (c1, c2, max_v) ]) -# # if unmergeable_bincount < topk: -# # G.add_edge(c1, c2) -# # maximal cliques -# cliques = [] -# for x in nx.find_cliques(G): -# this_len = len(x) -# this_weights = np.sum([G.get_edge_data(a,b)["weight"] for a in x for b in x if a != b]) / 2 -# cliques.append( (x, this_len, this_weights) ) -# cliques.sort(key = lambda x:(-x[1],x[2]) ) -# covered_nodes = set() -# merging_groups = [] -# for c in cliques: -# if len(set(c[0]) & covered_nodes) == 0: -# merging_groups.append( list(c[0]) ) -# covered_nodes = covered_nodes | set(c[0]) -# for c in range(n_clones): -# if not (c in covered_nodes): -# merging_groups.append( [c] ) -# covered_nodes.add(c) -# merging_groups.sort(key = lambda x:np.min(x)) -# # clone assignment after merging -# map_clone_id = {} -# for i,x in enumerate(merging_groups): -# for z in x: -# map_clone_id[z] = i -# new_assignment = np.array([map_clone_id[x] for x in res["new_assignment"]]) -# merged_res = copy.copy(res) -# merged_res["new_assignment"] = new_assignment -# merged_res["total_llf"] = np.NAN -# merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) -# merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) -# return merging_groups, merged_res diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 04f32bf..620a59a 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -801,7 +801,7 @@ def hmrf_pipeline( if "mp" in params: logger.info( - "outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( + "Outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu - res["new_log_mu"])), @@ -810,7 +810,7 @@ def hmrf_pipeline( ) elif "m" in params: logger.info( - "outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( + "Outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu - res["new_log_mu"])), @@ -818,7 +818,7 @@ def hmrf_pipeline( ) elif "p" in params: logger.info( - "outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( + "Outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom - res["new_p_binom"])), @@ -826,7 +826,7 @@ def hmrf_pipeline( ) logger.info( - "outer iteration {}: ARI between assignment = {}".format( + "Outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) @@ -1631,7 +1631,7 @@ def hmrfmix_pipeline( # update last parameter if "mp" in params: print( - "outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( + "Outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( r, res["total_llf"], 
np.mean(np.abs(last_log_mu - res["new_log_mu"])), @@ -1640,7 +1640,7 @@ def hmrfmix_pipeline( ) elif "m" in params: print( - "outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( + "Outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu - res["new_log_mu"])), @@ -1648,18 +1648,18 @@ def hmrfmix_pipeline( ) elif "p" in params: print( - "outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( + "Outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom - res["new_p_binom"])), ) ) print( - "outer iteration {}: ARI between assignment = {}".format( + "Outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) - # if np.all( last_assignment == res["new_assignment"] ): + if ( adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1 From 32c4239289ef212378b661c76eae9bff48dcb7c9 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 14:41:02 -0400 Subject: [PATCH 049/125] add use defaults statement --- src/calicost/hmm_NB_BB_nophasing_v2.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index ab60265..caa94d0 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -338,35 +338,42 @@ def run_baum_welch_nb_bb( n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] + assert n_comp == 2 - logger.info("Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse.") - log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu ) + p_binom = ( np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom ) + # NB initialize (inverse of) dispersion param in NB and BetaBinom alphas = ( 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas ) + taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus + use_defaults = (init_log_mu is None) and (init_p_binom is None) and (init_alphas is None) and (init_taus is None) + + logger.info("Initialized Baum Welch NB logmean shift, BetaBinom prob and dispersion params inverse (use_defaults = {use_defaults}).") + # NB initialize start probability and emission probability log_startprob = np.log(np.ones(n_states) / n_states) + if n_states > 1: transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: log_transmat = np.zeros((1, 1)) - # initialize log_gamma + log_gamma = kwargs["log_gamma"] if "log_gamma" in kwargs else None # NB a trick to speed up BetaBinom optimization: taking only unique From 346376916b44a631d8b5cf94af198bbdae6de0f5 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 14:50:54 -0400 Subject: [PATCH 050/125] edit logging for ARI and bb diff. 
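
The reworded messages below report two convergence diagnostics of the outer HMRF loop: the mean absolute change in the BetaBinom (and NB) parameters between iterations, and the adjusted Rand index (ARI) between successive clone assignments; the loop exits once the ARI exceeds 0.99 or only a single clone remains. A stripped-down sketch of that check, with names following the pipeline but not the full loop body:

    import logging

    import numpy as np
    from sklearn.metrics import adjusted_rand_score

    logger = logging.getLogger(__name__)

    def outer_iteration_converged(r, last_assignment, new_assignment,
                                  last_p_binom, new_p_binom):
        # mean absolute change of the BetaBinom success probabilities
        bb_diff = np.mean(np.abs(last_p_binom - new_p_binom))
        # ARI = 1 means the clone labels are unchanged up to relabelling
        ari = adjusted_rand_score(last_assignment, new_assignment)
        logger.info(f"Outer iteration {r}: BetaBinom parameters mean abs. diff. = {bb_diff}")
        logger.info(f"Outer iteration {r}: ARI between assignment = {ari} (unity is a perfect assignment)")
        return ari > 0.99 or len(np.unique(new_assignment)) == 1
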
--- src/calicost/hmrf.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 620a59a..994f75c 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -727,7 +727,7 @@ def hmrf_pipeline( # NB clone assignmment if nodepotential == "max": logger.info( - "Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment." + "Assigning HMRF clone for iteration {r} with nodepotential=max & aggr_hmrfmix_reassignment." ) new_assignment, single_llf, total_llf = aggr_hmrf_reassignment( @@ -746,7 +746,7 @@ def hmrf_pipeline( ) elif nodepotential == "weighted_sum": logger.info( - "Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior." + "Assigning HMRF clone for iteration {r} with nodepotential=weighted_sum & hmrfmix_reassignment_posterior." ) new_assignment, single_llf, total_llf = hmrf_reassignment_posterior( @@ -818,7 +818,7 @@ def hmrf_pipeline( ) elif "p" in params: logger.info( - "Outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( + "Outer iteration {}: total_llf = {}, BetaBinom parameters mean abs. diff. = {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom - res["new_p_binom"])), @@ -826,7 +826,7 @@ def hmrf_pipeline( ) logger.info( - "Outer iteration {}: ARI between assignment = {}".format( + "Outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) @@ -1017,7 +1017,7 @@ def hmrf_concatenate_pipeline( # NB HMRF clone assignmment if nodepotential == "max": logger.info( - "Assigning HMRF clone with nodepotential=max & aggr_hmrf_reassignment_concatenate." + "Assigning HMRF clone for iteration {r} with nodepotential=max & aggr_hmrf_reassignment_concatenate." ) new_assignment, single_llf, total_llf = ( @@ -1038,7 +1038,7 @@ def hmrf_concatenate_pipeline( ) elif nodepotential == "weighted_sum": logger.info( - "Assigning HMRF clone with nodepotential=weighted_sum & hmrf_reassignment_posterior_concatenate." + "Assigning HMRF clone for iteration {r} with nodepotential=weighted_sum & hmrf_reassignment_posterior_concatenate." ) new_assignment, single_llf, total_llf = ( @@ -1118,13 +1118,13 @@ def hmrf_concatenate_pipeline( ) elif "p" in params: logger.info( - "outer iteration {}: difference between BetaBinom parameters = {}".format( + "outer iteration {}: BetaBinom parameters mean abs. diff. = {}".format( r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) ) ) logger.info( - "outer iteration {}: ARI between assignment = {}".format( + "outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) @@ -1630,7 +1630,7 @@ def hmrfmix_pipeline( # update last parameter if "mp" in params: - print( + logger.info( "Outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( r, res["total_llf"], @@ -1639,7 +1639,7 @@ def hmrfmix_pipeline( ) ) elif "m" in params: - print( + logger.info( "Outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( r, res["total_llf"], @@ -1647,15 +1647,15 @@ def hmrfmix_pipeline( ) ) elif "p" in params: - print( - "Outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( + logger.info( + "Outer iteration {}: total_llf = {}, BetaBinom mean abs. diff. 
= {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom - res["new_p_binom"])), ) ) - print( - "Outer iteration {}: ARI between assignment = {}".format( + logger.info( + "Outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) @@ -2111,7 +2111,7 @@ def hmrfmix_concatenate_pipeline( # NB HMRF clone assignmment if nodepotential == "max": logger.info( - "Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment_concatenate." + "Assigning HMRF clone for iteration {r} with nodepotential=max & aggr_hmrfmix_reassignment_concatenate." ) new_assignment, single_llf, total_llf = ( @@ -2133,7 +2133,7 @@ def hmrfmix_concatenate_pipeline( ) elif nodepotential == "weighted_sum": logger.info( - "Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate." + "Assigning HMRF clone for iteration {r} with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate." ) new_assignment, single_llf, total_llf = ( @@ -2217,13 +2217,13 @@ def hmrfmix_concatenate_pipeline( ) elif "p" in params: logger.info( - "outer iteration {}: difference between BetaBinom parameters = {}".format( + "Outer iteration {}: BetaBinom parameters mean abs. diff. = {}".format( r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) ) ) logger.info( - "outer iteration {}: ARI between assignment = {}".format( + "outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) From b6a95ad7e3e234e212ea74eea3b763bab0459017 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 14:59:13 -0400 Subject: [PATCH 051/125] log neyman pearson. 
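
The function instrumented below, similarity_components_rdrbaf_neymanpearson, decides which clones to merge: for each clone pair it evaluates per-state Neyman-Pearson (likelihood-ratio) statistics, connects pairs whose largest statistic over sufficiently long segments stays below the threshold, and merges maximal cliques of the resulting graph. A stripped-down sketch of the graph-and-clique step is given here; pairwise_max_np is assumed to already hold the maximal statistic per clone pair (the real code derives it from the HMM emission probabilities and posteriors, and also treats pairs with no differing states as mergeable).

    import networkx as nx
    import numpy as np

    def merging_groups_from_np_stats(pairwise_max_np, n_clones, threshold=2.0):
        # pairwise_max_np: assumed dict {(c1, c2): max NP statistic}
        G = nx.Graph()
        G.add_nodes_from(np.arange(n_clones))
        for (c1, c2), max_v in pairwise_max_np.items():
            if max_v < threshold:
                # similar enough to be candidates for merging
                G.add_weighted_edges_from([(c1, c2, max_v)])
        # prefer larger cliques, breaking ties by smaller total edge weight
        cliques = []
        for members in nx.find_cliques(G):
            weight = sum(G[a][b]["weight"] for a in members for b in members if a != b) / 2
            cliques.append((members, len(members), weight))
        cliques.sort(key=lambda c: (-c[1], c[2]))
        covered, merging_groups = set(), []
        for members, _, _ in cliques:
            if not (set(members) & covered):
                merging_groups.append(sorted(members))
                covered |= set(members)
        for c in range(n_clones):
            if c not in covered:
                merging_groups.append([c])
        merging_groups.sort(key=min)
        return merging_groups

Larger cliques are merged first, with smaller summed edge weight breaking ties, so the most mutually similar clone groups collapse before any pairwise merges.
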
--- src/calicost/hmm_NB_BB_phaseswitch.py | 35 ++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index d2683a4..bc9b2a6 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -1236,11 +1236,14 @@ def similarity_components_rdrbaf_neymanpearson( n_obs = X.shape[0] n_states = res["new_p_binom"].shape[0] n_clones = X.shape[2] + + logger.info("Computing similarity_components_rdrbaf_neymanpearson for (n_obs, n_states, n_clones) = ({n_obs}, {n_states}, {n_clones}).") + G = nx.Graph() G.add_nodes_from(np.arange(n_clones)) - # + lambd = np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean) - # + if tumor_prop is None: log_emission_rdr, log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( @@ -1312,10 +1315,11 @@ def similarity_components_rdrbaf_neymanpearson( ) reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states + all_test_statistics = [] + for c1 in range(n_clones): for c2 in range(c1 + 1, n_clones): - # unmergeable_bincount = 0 unique_pair_states = [ x for x in np.unique(reshaped_pred_cnv[np.array([c1, c2]), :], axis=1).T @@ -1327,6 +1331,7 @@ def similarity_components_rdrbaf_neymanpearson( (reshaped_pred_cnv[c1, :] == p[0]) & (reshaped_pred_cnv[c2, :] == p[1]) )[0] + if "m" in params and "p" in params: t_neymanpearson = eval_neymanpearson_rdrbaf( log_emission_rdr[:, :, c1], @@ -1351,8 +1356,12 @@ def similarity_components_rdrbaf_neymanpearson( res, p, ) - print(c1, c2, p, len(bidx), t_neymanpearson) + + # TODO + logger.info(f"{c1}, {c2}, {p}, {len(bidx)}, {t_neymanpearson}") + all_test_statistics.append([c1, c2, p, t_neymanpearson]) + if len(bidx) >= minlength: list_t_neymanpearson.append(t_neymanpearson) if ( @@ -1365,8 +1374,11 @@ def similarity_components_rdrbaf_neymanpearson( else 1e-3 ) G.add_weighted_edges_from([(c1, c2, max_v)]) - # maximal cliques + + logger.info("Computing Maximal cliques.") + cliques = [] + for x in nx.find_cliques(G): this_len = len(x) this_weights = ( @@ -1374,23 +1386,31 @@ def similarity_components_rdrbaf_neymanpearson( / 2 ) cliques.append((x, this_len, this_weights)) + cliques.sort(key=lambda x: (-x[1], x[2])) + covered_nodes = set() merging_groups = [] + for c in cliques: if len(set(c[0]) & covered_nodes) == 0: merging_groups.append(list(c[0])) covered_nodes = covered_nodes | set(c[0]) + for c in range(n_clones): if not (c in covered_nodes): merging_groups.append([c]) covered_nodes.add(c) + merging_groups.sort(key=lambda x: np.min(x)) - # clone assignment after merging + + # NB clone assignment after merging map_clone_id = {} + for i, x in enumerate(merging_groups): for z in x: map_clone_id[z] = i + new_assignment = np.array([map_clone_id[x] for x in res["new_assignment"]]) merged_res = copy.copy(res) merged_res["new_assignment"] = new_assignment @@ -1407,6 +1427,9 @@ def similarity_components_rdrbaf_neymanpearson( for c in merging_groups ] ) + + logger.info("Computed similarity_components_rdrbaf_neymanpearson.") + return merging_groups, merged_res From 79796c63258b99d9f9857f44b058cabedd8bb0eb Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 15:05:09 -0400 Subject: [PATCH 052/125] improved logging. 
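
The utils_IO.py changes below add start/finish logging around filter_de_genes_tri, which labels spots per sample before deciding which high-UMI differentially expressed genes to drop from the per-bin RDR counts: non-normal spots are split into "tumor" and "unsure" by 2-means clustering on a 4-component PCA of the log-normalized counts. A rough sketch of that labeling step under stated assumptions; the choice of the reference cluster (ref below) and the default "normal" label are assumptions, since that part of the function is not reproduced in full in this hunk.

    import numpy as np
    import scanpy as sc
    from sklearn.cluster import KMeans

    def label_spots_for_de_filtering(tmpadata):
        # median library size as the normalization target, as in the diff below
        med = np.median(np.sum(tmpadata.layers["count"], axis=1))
        sc.pp.normalize_total(tmpadata, target_sum=med)
        sc.pp.log1p(tmpadata)
        sc.pp.pca(tmpadata, n_comps=4)

        kmeans_labels = KMeans(n_clusters=2, random_state=0).fit_predict(tmpadata.obsm["X_pca"])
        normal = tmpadata.obs["normal_candidate"].to_numpy(dtype=bool)

        # assumption: the cluster holding most normal candidates is the reference
        ref = np.argmax([np.sum(normal[kmeans_labels == k]) for k in (0, 1)])

        clone = np.full(tmpadata.shape[0], "normal", dtype=object)
        clone[(kmeans_labels != ref) & (~normal)] = "tumor"
        clone[(kmeans_labels == ref) & (~normal)] = "unsure"
        tmpadata.obs["clone"] = clone
        return tmpadata
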
--- src/calicost/calicost_main.py | 1 + src/calicost/utils_IO.py | 45 +++++++++++++++++------------------ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 9e52821..12be38d 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -288,6 +288,7 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], ) + # TODO merging groups? logger.info( f"BAF clone merging after requiring minimum # spots: {merging_groups}" ) diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index d570a84..2f0ee47 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -1713,21 +1713,26 @@ def filter_de_genes_tri( df_bininfo : pd.DataFrame Contains columns ['CHR', 'START', 'END', 'INCLUDED_GENES', 'INCLUDED_SNP_IDS'], 'INCLUDED_GENES' contains space-delimited gene names. """ + + logger.info("Computing filter_de_genes_tri.") + adata = anndata.AnnData(exp_counts) adata.layers["count"] = exp_counts.values adata.obs["normal_candidate"] = normal_candidate - # + map_gene_adatavar = {} map_gene_umi = {} list_gene_umi = np.sum(adata.layers["count"], axis=0) + for i, x in enumerate(adata.var.index): map_gene_adatavar[x] = i map_gene_umi[x] = list_gene_umi[i] - # + if sample_list is None: sample_list = [None] - # + filtered_out_set = set() + for s, sname in enumerate(sample_list): if sname is None: index = np.arange(adata.shape[0]) @@ -1739,19 +1744,19 @@ def filter_de_genes_tri( < tmpadata.shape[1] * 10 ): continue - # + umi_threshold = np.percentile( np.sum(tmpadata.layers["count"], axis=0), quantile_threshold ) - # - # sc.pp.filter_cells(tmpadata, min_genes=200) + sc.pp.filter_genes(tmpadata, min_cells=10) med = np.median(np.sum(tmpadata.layers["count"], axis=1)) - # sc.pp.normalize_total(tmpadata, target_sum=1e4) + sc.pp.normalize_total(tmpadata, target_sum=med) sc.pp.log1p(tmpadata) - # new added + sc.pp.pca(tmpadata, n_comps=4) + kmeans = KMeans(n_clusters=2, random_state=0).fit(tmpadata.obsm["X_pca"]) kmeans_labels = kmeans.predict(tmpadata.obsm["X_pca"]) idx_kmeans_label = np.argmax( @@ -1761,23 +1766,13 @@ def filter_de_genes_tri( clone[ (kmeans_labels != idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) ] = "tumor" + ### third part ### clone[ (kmeans_labels == idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) ] = "unsure" tmpadata.obs["clone"] = clone - # end added - # sc.tl.rank_genes_groups(tmpadata, 'clone', groups=["tumor", "unsure"], reference="normal", method='wilcoxon') - # # DE and log fold change comparing tumor and normal - # genenames_t = np.array([ x[0] for x in tmpadata.uns["rank_genes_groups"]["names"] ]) - # logfc_t = np.array([ x[0] for x in tmpadata.uns["rank_genes_groups"]["logfoldchanges"] ]) - # geneumis_t = np.array([ map_gene_umi[x] for x in genenames_t]) - # # DE and log fold change comparing unsure and normal - # genenames_u = np.array([ x[1] for x in tmpadata.uns["rank_genes_groups"]["names"] ]) - # logfc_u = np.array([ x[1] for x in tmpadata.uns["rank_genes_groups"]["logfoldchanges"] ]) - # geneumis_u = np.array([ map_gene_umi[x] for x in genenames_u]) - # this_filtered_out_set = set(list(genenames_t[ (np.abs(logfc_t) > logfcthreshold) & (geneumis_t > umi_threshold) ])) | set(list(genenames_u[ (np.abs(logfc_u) > logfcthreshold) & (geneumis_u > umi_threshold) ])) - # + agg_counts = np.vstack( [ np.sum(tmpadata.layers["count"][tmpadata.obs["clone"] == c, :], axis=0) @@ -1810,10 +1805,12 @@ def filter_de_genes_tri( ) ) filtered_out_set 
= filtered_out_set | this_filtered_out_set - print(f"Filter out {len(filtered_out_set)} DE genes") - # - # remove genes that are in filtered_out_set + + logger.info(f"Filtered {len(filtered_out_set)} differentially expressed genes.") + + # NB remove genes that are in filtered_out_set new_single_X_rdr = np.zeros((df_bininfo.shape[0], adata.shape[0])) + for b, genestr in enumerate(df_bininfo.INCLUDED_GENES.values): # RDR (genes) involved_genes = set(genestr.split(" ")) - filtered_out_set @@ -1821,6 +1818,8 @@ def filter_de_genes_tri( adata.layers["count"][:, adata.var.index.isin(involved_genes)], axis=1 ) + logger.info("Computed filter_de_genes_tri.") + return new_single_X_rdr, filtered_out_set From 80f9c21ad16eaedda41545116f20d143be635140 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 15:10:02 -0400 Subject: [PATCH 053/125] update logging. --- src/calicost/hmrf.py | 2 +- src/calicost/utils_hmm.py | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 994f75c..baf00f6 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1967,7 +1967,7 @@ def hmrfmix_concatenate_pipeline( spatial_weight=1.0 / 6, tumorprop_threshold=0.5, ): - logger.info("Solving hmrfix_concatenate_pipeline.") + logger.info("Solving hmrfmix_concatenate_pipeline.") n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 95be54e..d16193a 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -1810,13 +1810,15 @@ def update_emission_params_bb_nophasing_uniqvalues( n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) - # initialization + + # NB initialization new_p_binom = ( copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 ) new_taus = copy.copy(taus) + if fix_BB_dispersion: for s in np.arange(len(unique_values)): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A @@ -1892,6 +1894,7 @@ def update_emission_params_bb_nophasing_uniqvalues( weights = [] features = [] state_posweights = [] + for s in np.arange(len(unique_values)): idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) @@ -1923,18 +1926,25 @@ def update_emission_params_bb_nophasing_uniqvalues( this_features[idx_row_posweight, :][:, idx_state_posweight] ) state_posweights.append(idx_state_posweight) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) + + + model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_taus[:, :] = res.params[-1] + if not (start_p_binom is None): res2 = model.fit( disp=0, @@ -1949,11 +1959,13 @@ def update_emission_params_bb_nophasing_uniqvalues( xtol=1e-4, ftol=1e-4, ) + if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) 
new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_taus[:, :] = res2.params[-1] @@ -1995,7 +2007,8 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) - # initialization + + # NB initialization new_p_binom = ( copy.copy(start_p_binom) if not start_p_binom is None From bc446230a8cac0368bab00368d1d56d4e9dc2c25 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 15:15:50 -0400 Subject: [PATCH 054/125] update logging --- src/calicost/utils_hmm.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index d16193a..3e4ecf5 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -475,6 +475,7 @@ def update_emission_params_nb_sitewise_uniqvalues( if not start_log_mu is None else np.zeros((n_states, n_spots)) ) + new_alphas = copy.copy(alphas) # expression signal by NB distribution @@ -550,6 +551,7 @@ def update_emission_params_nb_sitewise_uniqvalues( weights = [] features = [] state_posweights = [] + for s in range(n_spots): idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) @@ -585,21 +587,31 @@ def update_emission_params_nb_sitewise_uniqvalues( this_features[idx_row_posweight, :][:, idx_state_posweight] ) state_posweights.append(idx_state_posweight) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) + model = Weighted_NegativeBinomial( y, features, weights=weights, exposure=exposure ) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + + logger.info("Applying fit with default start params.") + + res = model.fit(disp=0, maxiter=1_500, xtol=1.e-4, ftol=1.e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_alphas[:, :] = res.params[-1] + if not (start_log_mu is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -614,8 +626,6 @@ def update_emission_params_nb_sitewise_uniqvalues( ftol=1e-4, ) - logger.info(f"") - nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) @@ -626,8 +636,10 @@ def update_emission_params_nb_sitewise_uniqvalues( l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_alphas[:, :] = res2.params[-1] + new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr From 64c691df06469c9b7ea8460fbddd7c37db35daa9 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Fri, 16 Aug 2024 15:25:49 -0400 Subject: [PATCH 055/125] improve logging --- src/calicost/utils_hmm.py | 102 +++++++++++++++++++++++++++++++++----- 1 file changed, 89 insertions(+), 13 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 3e4ecf5..0406cfb 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -679,13 +679,16 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( logger.info("Computing emission params for Negative Binomial Mix (sitewise, unique) for {n_spots} spots and {n_states} states.") - # initialization new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None else np.zeros((n_states, n_spots)) ) + new_alphas = copy.copy(alphas) + + + # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) @@ -809,12 +812,13 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( ) state_posweights.append(idx_state_posweight) tp.append(this_tp[idx_row_posweight]) - # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) + model = Weighted_NegativeBinomial_mix( y, features, @@ -823,14 +827,22 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( tumor_prop=tp, penalty=0, ) + + logger.info("Applying fit with default start params.") + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_alphas[:, :] = res.params[-1] + if not (start_log_mu is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -844,13 +856,22 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) + new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_alphas[:, :] = res2.params[-1] + new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr @@ -890,13 +911,14 @@ def update_emission_params_bb_sitewise_uniqvalues( logger.info("Computing emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") - # initialization new_p_binom = ( copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 ) + new_taus = copy.copy(taus) + if fix_BB_dispersion: for s in np.arange(len(unique_values)): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A @@ -1044,19 +1066,29 @@ def update_emission_params_bb_sitewise_uniqvalues( this_features[idx_row_posweight, :][:, idx_state_posweight] ) state_posweights.append(idx_state_posweight) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) + 
model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + + logger.info("Applying fit with default start params.") + + res = model.fit(disp=0, maxiter=1_500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_taus[:, :] = res.params[-1] + if not (start_p_binom is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -1070,13 +1102,21 @@ def update_emission_params_bb_sitewise_uniqvalues( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_taus[:, :] = res2.params[-1] + new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob @@ -1117,13 +1157,14 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( logger.info("Computing emission params for Beta Binomial Mix (sitewise, unique) for {n_spots} spots and {n_states} states.") - # initialization new_p_binom = ( copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 ) + new_taus = copy.copy(taus) + if fix_BB_dispersion: for s in np.arange(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A @@ -1303,17 +1344,26 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) + model = Weighted_BetaBinom_mix( y, features, weights=weights, exposure=exposure, tumor_prop=tp ) + + logger.info("Applying fit with default start params.") + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_taus[:, :] = res.params[-1] + if not (start_p_binom is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -1327,13 +1377,21 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_taus[:, :] = res2.params[-1] + new_p_binom[new_p_binom < 
min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob @@ -1428,18 +1486,20 @@ def update_emission_params_nb_nophasing_uniqvalues( Mean expression under diploid state. """ - logger.info("Computing emission params for Negative Binomial (no phasing, unique).") + logger.info("Computing emission params for Negative Binomial (no phasing, unique) with {n_spots} spots and {n_states} states.") n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) - # initialization + new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None else np.zeros((n_states, n_spots)) ) + new_alphas = copy.copy(alphas) + # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) @@ -1544,21 +1604,31 @@ def update_emission_params_nb_nophasing_uniqvalues( this_features[idx_row_posweight, :][:, idx_state_posweight] ) state_posweights.append(idx_state_posweight) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) + model = Weighted_NegativeBinomial( y, features, weights=weights, exposure=exposure ) + + logger.info("Applying fit with default start params.") + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_alphas[:, :] = res.params[-1] + if not (start_log_mu is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -1572,13 +1642,19 @@ def update_emission_params_nb_nophasing_uniqvalues( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_alphas[:, :] = res2.params[-1] + new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr @@ -1611,12 +1687,12 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ - logger.info("Computing emission params for Negative Binomial Mix (no phasing, unique).") + logger.info("Computing emission params for Negative Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) - # initialization + new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None From dddc8c9fd2f49d0af8d0b1dfba1ed96319737dfc Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Fri, 16 Aug 2024 15:31:43 -0400 Subject: [PATCH 056/125] improved logging --- src/calicost/utils_hmm.py | 72 ++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 13 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 0406cfb..e2e4f8e 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -686,9 +686,7 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( ) new_alphas = copy.copy(alphas) - - - + # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) @@ -1019,6 +1017,7 @@ def update_emission_params_bb_sitewise_uniqvalues( weights = [] features = [] state_posweights = [] + for s in np.arange(len(unique_values)): idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] this_exposure = np.tile( @@ -1698,7 +1697,9 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( if not start_log_mu is None else np.zeros((n_states, n_spots)) ) + new_alphas = copy.copy(alphas) + # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) @@ -1819,12 +1820,13 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( ) state_posweights.append(idx_state_posweight) tp.append(this_tp[idx_row_posweight]) - # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) + model = Weighted_NegativeBinomial_mix( y, features, @@ -1833,14 +1835,22 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( tumor_prop=tp, penalty=0, ) + + logger.info("Applying fit with default start params.") + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_alphas[:, :] = res.params[-1] + if not (start_log_mu is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -1854,13 +1864,21 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_alphas[:, :] = res2.params[-1] + new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr @@ -1893,7 +1911,7 @@ def update_emission_params_bb_nophasing_uniqvalues( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. 
""" - logger.info("Computing emission params for Beta Binomial (no phasing, unique).") + logger.info("Computing emission params for Beta Binomial (no phasing, unique) with {n_spots} spots and {n_states} states.") n_spots = len(unique_values) n_states = log_gamma.shape[0] @@ -1905,6 +1923,7 @@ def update_emission_params_bb_nophasing_uniqvalues( if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 ) + new_taus = copy.copy(taus) if fix_BB_dispersion: @@ -2020,9 +2039,10 @@ def update_emission_params_bb_nophasing_uniqvalues( weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - - model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) + + logger.info("Applying fit with default start params.") + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) for s, idx_state_posweight in enumerate(state_posweights): @@ -2032,8 +2052,10 @@ def update_emission_params_bb_nophasing_uniqvalues( if res.params[-1] > 0: new_taus[:, :] = res.params[-1] - + if not (start_p_binom is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -2047,8 +2069,13 @@ def update_emission_params_bb_nophasing_uniqvalues( xtol=1e-4, ftol=1e-4, ) + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) @@ -2090,7 +2117,7 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. 
""" - logger.info("Computing emission params for Beta Binomial Mix (no phasing, unique).") + logger.info("Computing emission params for Beta Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") n_spots = len(unique_values) n_states = log_gamma.shape[0] @@ -2102,7 +2129,9 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 ) + new_taus = copy.copy(taus) + if fix_BB_dispersion: for s in np.arange(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A @@ -2235,23 +2264,32 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( ) state_posweights.append(idx_state_posweight) tp.append(this_tp[idx_row_posweight]) - # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) + model = Weighted_BetaBinom_mix( y, features, weights=weights, exposure=exposure, tumor_prop=tp ) + + logger.info("Applying fit with default start params.") + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_taus[:, :] = res.params[-1] + if not (start_p_binom is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -2265,13 +2303,21 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_taus[:, :] = res2.params[-1] + new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob From e807de09574738b7b1bf53ac710b43d8bb52f6bd Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 15:38:31 -0400 Subject: [PATCH 057/125] fix logging --- src/calicost/hmrf.py | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index baf00f6..7e11b3e 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -801,34 +801,19 @@ def hmrf_pipeline( if "mp" in params: logger.info( - "Outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( - r, - res["total_llf"], - np.mean(np.abs(last_log_mu - res["new_log_mu"])), - np.mean(np.abs(last_p_binom - res["new_p_binom"])), - ) + f"Outer iteration {r}: total_llf = {res['total_llf']}, mean abs. diff. 
(mu, p) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}, {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) elif "m" in params: logger.info( - "Outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( - r, - res["total_llf"], - np.mean(np.abs(last_log_mu - res["new_log_mu"])), - ) + f"Outer iteration {r}: total_llf = {res['total_llf']}, mean abs. diff. (mu) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}" ) elif "p" in params: logger.info( - "Outer iteration {}: total_llf = {}, BetaBinom parameters mean abs. diff. = {}".format( - r, - res["total_llf"], - np.mean(np.abs(last_p_binom - res["new_p_binom"])), - ) + f"Outer iteration {r}: total_llf = {res['total_llf']}, BetaBinom parameters mean abs. diff. = {np.mean(np.abs(last_p_binom - res["new_p_binom"]))}" ) logger.info( - "Outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( - r, adjusted_rand_score(last_assignment, res["new_assignment"]) - ) + f"Outer iteration {r}: ARI between assignment = {adjusted_rand_score(last_assignment, res['new_assignment'])} (unity is a perfect assignment)" ) if ( From d1ae926da41e2aa3c7f3619f6f19d032c12fd646 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 15:41:08 -0400 Subject: [PATCH 058/125] edit logging --- src/calicost/hmrf.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 7e11b3e..78e8e42 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -2188,31 +2188,21 @@ def hmrfmix_concatenate_pipeline( if "mp" in params: logger.info( - "outer iteration {}: difference between parameters = {}, {}".format( - r, - np.mean(np.abs(last_log_mu - res["new_log_mu"])), - np.mean(np.abs(last_p_binom - res["new_p_binom"])), - ) + f"Outer iteration {r}: mean abs. diff. (mu, p) = {np.mean(np.abs(last_log_mu - res["new_log_mu"]))}, {np.mean(np.abs(last_p_binom - res["new_p_binom"]))}" ) elif "m" in params: logger.info( - "outer iteration {}: difference between NB parameters = {}".format( - r, np.mean(np.abs(last_log_mu - res["new_log_mu"])) - ) + f"Outer iteration {r}: mean abs. diff. between NB parameters = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}" ) elif "p" in params: logger.info( - "Outer iteration {}: BetaBinom parameters mean abs. diff. = {}".format( - r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) - ) + f"Outer iteration {r}: BetaBinom parameters mean abs. diff. = {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) logger.info( - "outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( - r, adjusted_rand_score(last_assignment, res["new_assignment"]) - ) + f"Outer iteration {r}: ARI between assignment = {adjusted_rand_score(last_assignment, res['new_assignment'])} (unity is a perfect assignment)" ) - # if np.all( last_assignment == res["new_assignment"] ): + if ( adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1 From 3501640613f77bf81c17dc4d71743837f93c78eb Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Fri, 16 Aug 2024 15:46:53 -0400 Subject: [PATCH 059/125] fix logging --- src/calicost/hmrf.py | 46 ++++++++++++-------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 78e8e42..8562e36 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1089,31 +1089,21 @@ def hmrf_concatenate_pipeline( if "mp" in params: logger.info( - "outer iteration {}: difference between parameters = {}, {}".format( - r, - np.mean(np.abs(last_log_mu - res["new_log_mu"])), - np.mean(np.abs(last_p_binom - res["new_p_binom"])), - ) + f"Outer iteration {r}: mean abs. diff. (mu, p) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}, {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) elif "m" in params: logger.info( - "outer iteration {}: difference between NB parameters = {}".format( - r, np.mean(np.abs(last_log_mu - res["new_log_mu"])) - ) + f"Outer iteration {r}: mean abs. diff. (mu) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}" ) elif "p" in params: logger.info( - "outer iteration {}: BetaBinom parameters mean abs. diff. = {}".format( - r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) - ) + f"Outer iteration {r}: mean abs. diff. (p) = {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) logger.info( - "outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( - r, adjusted_rand_score(last_assignment, res["new_assignment"]) - ) + f"Outer iteration {r}: ARI between assignment = {adjusted_rand_score(last_assignment, res['new_assignment'])} (unity is a perfect assignment)" ) - # if np.all( last_assignment == res["new_assignment"] ): + if ( adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1 @@ -1616,33 +1606,19 @@ def hmrfmix_pipeline( # update last parameter if "mp" in params: logger.info( - "Outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( - r, - res["total_llf"], - np.mean(np.abs(last_log_mu - res["new_log_mu"])), - np.mean(np.abs(last_p_binom - res["new_p_binom"])), - ) + f"Outer iteration {r}: total_llf = {res['total_llf']}, mean abs. diff. (mu, p) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}, {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) elif "m" in params: logger.info( - "Outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( - r, - res["total_llf"], - np.mean(np.abs(last_log_mu - res["new_log_mu"])), - ) + f"Outer iteration {r}: total_llf = {res['total_llf']}, mean abs. diff. (mu) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}" ) elif "p" in params: logger.info( - "Outer iteration {}: total_llf = {}, BetaBinom mean abs. diff. = {}".format( - r, - res["total_llf"], - np.mean(np.abs(last_p_binom - res["new_p_binom"])), - ) + f"Outer iteration {r}: total_llf = {res['total_llf']}, mean abs. diff. 
(p) = {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) + logger.info( - "Outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( - r, adjusted_rand_score(last_assignment, res["new_assignment"]) - ) + f"Outer iteration {r}: ARI between assignment = {adjusted_rand_score(last_assignment, res['new_assignment'])} (unity is a perfect assignment)" ) if ( @@ -1650,12 +1626,14 @@ def hmrfmix_pipeline( or len(np.unique(res["new_assignment"])) == 1 ): break + last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] last_alphas = res["new_alphas"] last_taus = res["new_taus"] last_assignment = res["new_assignment"] log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) + for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] this_persample_weight = np.bincount( From cfad5411090a5f70d4f9e4f8fda49d68b856fbb9 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 16:00:26 -0400 Subject: [PATCH 060/125] fix logging --- src/calicost/hmrf.py | 48 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 8562e36..8b2de54 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -140,6 +140,8 @@ def hmrf_reassignment_posterior( "Computed hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." ) + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -240,6 +242,8 @@ def aggr_hmrf_reassignment( "Computed aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." ) + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -354,6 +358,8 @@ def hmrf_reassignment_posterior_concatenate( "Computed hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." ) + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -493,6 +499,8 @@ def aggr_hmrf_reassignment_concatenate( "Computed aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." 
) + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -789,7 +797,8 @@ def hmrf_pipeline( res = np.load(f"{outdir}/round{r}_nstates{n_states}_{params}.npz") logger.info(f"Regrouping to pseudobulk for iteration {r}.") - + logger.info(f"Found a new clone assignment for {n_spots} spots:\n{np.unique(new_assignment, return_counts=True)}") + clone_index = [ np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) @@ -1044,6 +1053,8 @@ def hmrf_concatenate_pipeline( else: raise ValueError("Unknown mode for nodepotential!") + logger.info(f"Found a new clone assignment for {n_spots} spots:\n{np.unique(new_assignment, return_counts=True)}") + # NB handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment @@ -1233,6 +1244,9 @@ def aggr_hmrfmix_reassignment( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -1363,7 +1377,8 @@ def hmrfmix_reassignment_posterior( ) logger.info(f"Computed hmrfmix_reassignment_posterior.") - + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -1589,7 +1604,8 @@ def hmrfmix_pipeline( allres["num_iterations"] = r + 1 np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) - # regroup to pseudobulk + logger.info(f"Found a new clone assignment for {n_spots} spots:\n{np.unique(new_assignment, return_counts=True)}") + clone_index = [ np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) @@ -1781,7 +1797,9 @@ def hmrfmix_reassignment_posterior_concatenate( ) logger.info(f"Computed hmrfmix_reassignment_posterior_concatenate.") - + + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -1810,11 +1828,11 @@ def aggr_hmrfmix_reassignment_concatenate( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) - # + posterior = np.zeros((N, n_clones)) - # + for i in trange(N): idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] @@ -1887,6 +1905,9 @@ def aggr_hmrfmix_reassignment_concatenate( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -2118,6 +2139,8 @@ def hmrfmix_concatenate_pipeline( else: raise ValueError("Unknown mode for nodepotential!") + logger.info(f"Found a new clone assignment for {n_spots} spots:\n{np.unique(new_assignment, return_counts=True)}") + # NB handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment @@ -2129,11 +2152,11 @@ def hmrfmix_concatenate_pipeline( ) res["log_gamma"] = 
res["log_gamma"][:, concat_idx] res["pred_cnv"] = res["pred_cnv"][concat_idx] - # add to results + res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf - # append to allres + for k, v in res.items(): if k == "prev_assignment": allres[f"round{r-1}_assignment"] = v @@ -2141,6 +2164,7 @@ def hmrfmix_concatenate_pipeline( allres[f"round{r}_assignment"] = v else: allres[f"round{r}_{k}"] = v + allres["num_iterations"] = r + 1 logger.info( @@ -2155,6 +2179,7 @@ def hmrfmix_concatenate_pipeline( np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) ] + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( single_X, single_base_nb_mean, @@ -2248,6 +2273,9 @@ def clonelabel_posterior_withinteger( spatial_weight : float """ + + logger.info("Computing clonelabel_posterior_withinteger.") + N = single_X.shape[2] n_obs = single_X.shape[0] # clone IDs @@ -2403,4 +2431,6 @@ def clonelabel_posterior_withinteger( - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + logger.info("Computed clonelabel_posterior_withinteger.") + return df_posterior From a321ff611b43e6c86c06c142ae42ac91d5c07020 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 19:47:44 -0400 Subject: [PATCH 061/125] fix bugs --- src/calicost/hmrf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 8b2de54..6ff29d6 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -818,7 +818,7 @@ def hmrf_pipeline( ) elif "p" in params: logger.info( - f"Outer iteration {r}: total_llf = {res['total_llf']}, BetaBinom parameters mean abs. diff. = {np.mean(np.abs(last_p_binom - res["new_p_binom"]))}" + f"Outer iteration {r}: total_llf = {res['total_llf']}, BetaBinom parameters mean abs. diff. = {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) logger.info( @@ -2191,7 +2191,7 @@ def hmrfmix_concatenate_pipeline( if "mp" in params: logger.info( - f"Outer iteration {r}: mean abs. diff. (mu, p) = {np.mean(np.abs(last_log_mu - res["new_log_mu"]))}, {np.mean(np.abs(last_p_binom - res["new_p_binom"]))}" + f"Outer iteration {r}: mean abs. diff. (mu, p) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}, {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) elif "m" in params: logger.info( From d4db18d1a69f864b94419537fc1a76dd4fdf2e14 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 20:03:48 -0400 Subject: [PATCH 062/125] fix bugs --- src/calicost/hmm_NB_BB_phaseswitch.py | 104 ++++++++++++++------------ src/calicost/utils_hmm.py | 4 +- 2 files changed, 59 insertions(+), 49 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index bc9b2a6..aaae1d7 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -70,7 +70,9 @@ def compute_emission_probability_nb_betabinom( log_emission : array, shape (2*n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ - logger.info("Computing emission probability for negative binomial & beta binomial (sitewise).") + logger.info( + "Computing emission probability for negative binomial & beta binomial (sitewise)." 
+ ) n_obs = X.shape[0] n_comp = X.shape[1] @@ -113,7 +115,9 @@ def compute_emission_probability_nb_betabinom( ) ) - logger.info("Computed emission probability for negative binomial & beta binomial (sitewise).") + logger.info( + "Computed emission probability for negative binomial & beta binomial (sitewise)." + ) return log_emission_rdr, log_emission_baf @@ -158,7 +162,9 @@ def compute_emission_probability_nb_betabinom_mix( log_emission : array, shape (2*n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ - logger.info("Computing emission probability for *mixed* negative binomial & beta binomial (sitewise).") + logger.info( + "Computing emission probability for *mixed* negative binomial & beta binomial (sitewise)." + ) n_obs = X.shape[0] n_comp = X.shape[1] @@ -211,7 +217,9 @@ def compute_emission_probability_nb_betabinom_mix( mix_p_A * taus[i, s], ) - logger.info("Computed emission probability for *mixed* negative binomial & beta binomial (sitewise).") + logger.info( + "Computed emission probability for *mixed* negative binomial & beta binomial (sitewise)." + ) return log_emission_rdr, log_emission_baf @@ -381,7 +389,9 @@ def run_baum_welch_nb_bb( n_spots = X.shape[2] assert n_comp == 2 - logger.info("Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse (sitewise).") + logger.info( + "Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse (sitewise)." + ) log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T @@ -409,7 +419,7 @@ def run_baum_welch_nb_bb( else: log_transmat = np.zeros((1, 1)) - # NB a trick to speed up BetaBinom optimization: taking only unique values of + # NB a trick to speed up BetaBinom optimization: taking only unique values of # (B allele count, total SNP covering read count) unique_values_nb, mapping_matrices_nb = construct_unique_matrix( X[:, 0, :], base_nb_mean @@ -419,7 +429,9 @@ def run_baum_welch_nb_bb( ) for r in trange(max_iter, desc="EM algorithm (sitewise)"): - logger.info(f"Calculating E-step (sitewise) for iteration {r} of {max_iter}.") + logger.info( + f"Calculating E-step (sitewise) for iteration {r} of {max_iter}." + ) if tumor_prop is None: log_emission_rdr, log_emission_baf = ( @@ -465,7 +477,9 @@ def run_baum_welch_nb_bb( log_alpha, log_beta, log_transmat, log_emission ) - logger.info(f"Calculating M-step (sitewise) for iteration {r} of {max_iter}.") + logger.info( + f"Calculating M-step (sitewise) for iteration {r} of {max_iter}." 
+ ) if "s" in self.params: new_log_startprob = update_startprob_sitewise(lengths, log_gamma) @@ -540,23 +554,20 @@ def run_baum_welch_nb_bb( else: new_p_binom = p_binom new_taus = taus - # check convergence + logger.info( - "EM convergence metrics (sitewise)", - np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), - np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), - np.mean(np.abs(new_log_mu - log_mu)), - np.mean(np.abs(new_p_binom - p_binom)), + f"EM convergence metrics (sitewise): {np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob)))}, {np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat)))}, {np.mean(np.abs(new_log_mu - log_mu))}, {np.mean(np.abs(new_p_binom - p_binom))}" ) - - logger.info(np.hstack([new_log_mu, new_p_binom])) - + + # logger.info(np.hstack([new_log_mu, new_p_binom])) + if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol ): break + log_startprob = new_log_startprob log_transmat = new_log_transmat log_mu = new_log_mu @@ -566,6 +577,9 @@ def run_baum_welch_nb_bb( logger.info("Computed Baum-Welch (sitewise).") + logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") + logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") + return ( new_log_mu, new_alphas, @@ -860,15 +874,9 @@ def pipeline_baum_welch( if (init_p_binom is None) and ("p" in params): init_p_binom = tmp_p_binom - logger.info(f"Initial (mu, p):\n{np.hstack([init_log_mu, init_p_binom])}") + logger.info(f"Initial mu:\n{init_log_mu}") + logger.info(f"Initial p:\n{init_p_binom}") - # fit HMM-NB-BetaBinom - # new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat = hmmmodel.run_baum_welch_nb_bb(X, lengths, \ - # n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, tumor_prop, \ - # fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \ - # fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ - # is_diag=is_diag, init_log_mu=init_log_mu, init_p_binom=init_p_binom, init_alphas=init_alphas, init_taus=init_taus, \ - # max_iter=max_iter, tol=tol) hmmmodel = hmmclass(params=params, t=t) remain_kwargs = { k: v for k, v in kwargs.items() if k in ["lambd", "sample_length", "log_gamma"] @@ -1237,13 +1245,15 @@ def similarity_components_rdrbaf_neymanpearson( n_states = res["new_p_binom"].shape[0] n_clones = X.shape[2] - logger.info("Computing similarity_components_rdrbaf_neymanpearson for (n_obs, n_states, n_clones) = ({n_obs}, {n_states}, {n_clones}).") - + logger.info( + "Computing similarity_components_rdrbaf_neymanpearson for (n_obs, n_states, n_clones) = ({n_obs}, {n_states}, {n_clones})." 
+ ) + G = nx.Graph() G.add_nodes_from(np.arange(n_clones)) - + lambd = np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean) - + if tumor_prop is None: log_emission_rdr, log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( @@ -1315,9 +1325,9 @@ def similarity_components_rdrbaf_neymanpearson( ) reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states - + all_test_statistics = [] - + for c1 in range(n_clones): for c2 in range(c1 + 1, n_clones): unique_pair_states = [ @@ -1331,7 +1341,7 @@ def similarity_components_rdrbaf_neymanpearson( (reshaped_pred_cnv[c1, :] == p[0]) & (reshaped_pred_cnv[c2, :] == p[1]) )[0] - + if "m" in params and "p" in params: t_neymanpearson = eval_neymanpearson_rdrbaf( log_emission_rdr[:, :, c1], @@ -1359,9 +1369,9 @@ def similarity_components_rdrbaf_neymanpearson( # TODO logger.info(f"{c1}, {c2}, {p}, {len(bidx)}, {t_neymanpearson}") - + all_test_statistics.append([c1, c2, p, t_neymanpearson]) - + if len(bidx) >= minlength: list_t_neymanpearson.append(t_neymanpearson) if ( @@ -1376,9 +1386,9 @@ def similarity_components_rdrbaf_neymanpearson( G.add_weighted_edges_from([(c1, c2, max_v)]) logger.info("Computing Maximal cliques.") - + cliques = [] - + for x in nx.find_cliques(G): this_len = len(x) this_weights = ( @@ -1386,31 +1396,31 @@ def similarity_components_rdrbaf_neymanpearson( / 2 ) cliques.append((x, this_len, this_weights)) - + cliques.sort(key=lambda x: (-x[1], x[2])) - + covered_nodes = set() merging_groups = [] - + for c in cliques: if len(set(c[0]) & covered_nodes) == 0: merging_groups.append(list(c[0])) covered_nodes = covered_nodes | set(c[0]) - + for c in range(n_clones): if not (c in covered_nodes): merging_groups.append([c]) covered_nodes.add(c) - + merging_groups.sort(key=lambda x: np.min(x)) - + # NB clone assignment after merging map_clone_id = {} - + for i, x in enumerate(merging_groups): for z in x: map_clone_id[z] = i - + new_assignment = np.array([map_clone_id[x] for x in res["new_assignment"]]) merged_res = copy.copy(res) merged_res["new_assignment"] = new_assignment @@ -1429,7 +1439,7 @@ def similarity_components_rdrbaf_neymanpearson( ) logger.info("Computed similarity_components_rdrbaf_neymanpearson.") - + return merging_groups, merged_res @@ -1449,11 +1459,11 @@ def combine_similar_states_across_clones( n_states = res["new_p_binom"].shape[0] reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states - + all_test_statistics = compute_neymanpearson_stats( X, base_nb_mean, total_bb_RD, res, params, tumor_prop, hmmclass ) - + # NB make the pair of states consistent between clone c1 and clone c2 if their t_neymanpearson test statistics is small for c1 in range(n_clones): for c2 in range(c1 + 1, n_clones): diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index e2e4f8e..21ab295 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -907,7 +907,7 @@ def update_emission_params_bb_sitewise_uniqvalues( n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) - logger.info("Computing emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") + logger.info(f"Computing emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") new_p_binom = ( copy.copy(start_p_binom) @@ -1119,7 +1119,7 @@ def update_emission_params_bb_sitewise_uniqvalues( new_p_binom[new_p_binom < min_binom_prob] = 
min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob - logger.info("Computed emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") + logger.info(f"Computed emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") return new_p_binom, new_taus From 94c5a1575cfbd477452f41b720249b892bbee623 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 20:32:02 -0400 Subject: [PATCH 063/125] fix --- src/calicost/hmm_NB_BB_phaseswitch.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index aaae1d7..2a718cd 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -70,15 +70,15 @@ def compute_emission_probability_nb_betabinom( log_emission : array, shape (2*n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ - logger.info( - "Computing emission probability for negative binomial & beta binomial (sitewise)." - ) - n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] n_states = log_mu.shape[0] - # initialize log_emission + + logger.info( + "Computing emission probability for negative binomial & beta binomial (sitewise) with n_spots and n_states = {n_spots} and {n_states}." + ) + log_emission_rdr = np.zeros((2 * n_states, n_obs, n_spots)) log_emission_baf = np.zeros((2 * n_states, n_obs, n_spots)) for i in np.arange(n_states): From 776eeb0027975d6387d6746ec05535455d91648a Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 15:08:25 -0400 Subject: [PATCH 064/125] logging fixes --- src/calicost/utils_distribution_fitting.py | 2 +- src/calicost/utils_hmm.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index b343868..61d2835 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -261,7 +261,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) - logger.info(f"Starting Weighted_BetaBinom_mix optimization with @ {start_params}.") + logger.info(f"Starting Weighted_BetaBinom_mix optimization @ {start_params}.") start = time.time() diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 21ab295..427a777 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -629,7 +629,7 @@ def update_emission_params_nb_sitewise_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -858,7 +858,7 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, 
idx_state_posweight in enumerate(state_posweights): @@ -1105,7 +1105,7 @@ def update_emission_params_bb_sitewise_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -1380,7 +1380,7 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -1868,7 +1868,7 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -2073,7 +2073,7 @@ def update_emission_params_bb_nophasing_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -2307,7 +2307,7 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): From 4d5466a3ff34f1f253f4b334c0a1250d6220a4da Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 15:20:27 -0400 Subject: [PATCH 065/125] fix --- src/calicost/hmm_NB_BB_phaseswitch.py | 2 +- src/calicost/utils_hmm.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 2a718cd..6e263f9 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -76,7 +76,7 @@ def compute_emission_probability_nb_betabinom( n_states = log_mu.shape[0] logger.info( - "Computing emission probability for negative binomial & beta binomial (sitewise) with n_spots and n_states = {n_spots} and {n_states}." + f"Computing emission probability for negative binomial & beta binomial (sitewise) with n_spots and n_states = {n_spots} and {n_states}." 
) log_emission_rdr = np.zeros((2 * n_states, n_obs, n_spots)) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 427a777..54fe441 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -1686,7 +1686,7 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ - logger.info("Computing emission params for Negative Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") + logger.info(f"Computing emission params for Negative Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") n_spots = len(unique_values) n_states = log_gamma.shape[0] @@ -2117,7 +2117,7 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ - logger.info("Computing emission params for Beta Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") + logger.info(f"Computing emission params for Beta Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") n_spots = len(unique_values) n_states = log_gamma.shape[0] From 11f7c93833b11796290370a826b7f12454d1feb1 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 15:32:08 -0400 Subject: [PATCH 066/125] add Baum-Welch log lines. --- src/calicost/hmm_NB_BB_phaseswitch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 6e263f9..700c7c0 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -428,7 +428,8 @@ def run_baum_welch_nb_bb( X[:, 1, :], total_bb_RD ) - for r in trange(max_iter, desc="EM algorithm (sitewise)"): + for r in range(max_iter): + logger.info("-" * 250) logger.info( f"Calculating E-step (sitewise) for iteration {r} of {max_iter}." ) @@ -579,6 +580,8 @@ def run_baum_welch_nb_bb( logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") + + logger.info("-" * 250) return ( new_log_mu, From 9d709017a72be464e7a97d20131c18cdc2be9c6f Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 15:44:52 -0400 Subject: [PATCH 067/125] fix --- src/calicost/utils_hmm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 54fe441..2b5c4c0 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -629,7 +629,7 @@ def update_emission_params_nb_sitewise_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Negative Binomial with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -858,7 +858,7 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Negative Binomial Mix with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -1105,7 +1105,7 @@ def update_emission_params_bb_sitewise_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -1380,7 +1380,7 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial Mix custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -1868,7 +1868,7 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Negative Binomial Mix custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -2073,7 +2073,7 @@ def update_emission_params_bb_nophasing_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -2307,7 +2307,7 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start 
{nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial Mix custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): From c91591d9955d0789fb6ec2f11a16487b1371e6d6 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 17:14:57 -0400 Subject: [PATCH 068/125] fix logging. --- src/calicost/utils_distribution_fitting.py | 43 +++++++++++++++++----- src/calicost/utils_hmm.py | 34 +---------------- 2 files changed, 35 insertions(+), 42 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 61d2835..1ca6d9a 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -82,10 +82,15 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params + start_params_str = "existing" + else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) + start_params_str = "default" + else: + start_params_str = "input" - logger.info(f"Starting Weighted_NegativeBinomial optimization @ {start_params}.") + logger.info(f"Starting Weighted_NegativeBinomial optimization @ ({start_params_str}) {start_params}.") start = time.time() @@ -136,10 +141,14 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params + start_params_str = "existing" else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - - logger.info(f"Starting Weighted_NegativeBinomial_mix optimization @ {start_params}.") + start_params_str = "default" + else: + start_params_str = "input" + + logger.info(f"Starting Weighted_NegativeBinomial_mix optimization @ ({start_params_str}) {start_params}.") start = time.time() @@ -201,12 +210,16 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params + start_params_str = "existing" else: start_params = np.append( 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) + start_params_str = "default" + else: + start_params_str = "input" - logger.info(f"Starting Weighted_BetaBinomial optimization @ {start_params}.") + logger.info(f"Starting Weighted_BetaBinomial optimization @ ({start_params_str}) {start_params}.") start = time.time() @@ -256,12 +269,16 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params + start_params_str = "existing" else: start_params = np.append( 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) + start_params_str = "default" + else: + start_params_str = "input" - logger.info(f"Starting Weighted_BetaBinom_mix optimization @ {start_params}.") + logger.info(f"Starting Weighted_BetaBinom_mix optimization @ ({start_params_str}) {start_params}.") start = time.time() @@ -304,10 +321,14 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params + start_params_str = "existing" else: start_params = 0.1 * np.ones(self.nparams) - - logger.info(f"Starting Weighted_BetaBinom_fixdispersion optimization @ {start_params}.") + start_params_str = "default" + else: + start_params_str = "input" + + 
logger.info(f"Starting Weighted_BetaBinom_fixdispersion optimization @ ({start_params_str}) {start_params}.") start = time.time() @@ -356,10 +377,14 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params + start_params_str = "existing" else: start_params = 0.1 * np.ones(self.nparams) - - logger.info(f"Starting Weighted_BetaBinom_fixdispersion_mix optimization @ {start_params}.") + start_params_str = "default" + else: + start_params_str = "input" + + logger.info(f"Starting Weighted_BetaBinom_fixdispersion_mix optimization @ ({start_params_str}) {start_params}.") start = time.time() diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 2b5c4c0..b9ccb84 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -596,8 +596,6 @@ def update_emission_params_nb_sitewise_uniqvalues( model = Weighted_NegativeBinomial( y, features, weights=weights, exposure=exposure ) - - logger.info("Applying fit with default start params.") res = model.fit(disp=0, maxiter=1_500, xtol=1.e-4, ftol=1.e-4) @@ -610,8 +608,6 @@ def update_emission_params_nb_sitewise_uniqvalues( new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, @@ -825,8 +821,6 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( tumor_prop=tp, penalty=0, ) - - logger.info("Applying fit with default start params.") res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) @@ -839,8 +833,6 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, @@ -1072,8 +1064,6 @@ def update_emission_params_bb_sitewise_uniqvalues( features = scipy.linalg.block_diag(*features) model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) - - logger.info("Applying fit with default start params.") res = model.fit(disp=0, maxiter=1_500, xtol=1e-4, ftol=1e-4) @@ -1086,8 +1076,6 @@ def update_emission_params_bb_sitewise_uniqvalues( new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, @@ -1347,8 +1335,6 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( model = Weighted_BetaBinom_mix( y, features, weights=weights, exposure=exposure, tumor_prop=tp ) - - logger.info("Applying fit with default start params.") res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) @@ -1360,9 +1346,7 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( if res.params[-1] > 0: new_taus[:, :] = res.params[-1] - if not (start_p_binom is None): - logger.info("Applying fit with custom start params.") - + if not (start_p_binom is None): res2 = model.fit( disp=0, maxiter=1500, @@ -1612,8 +1596,6 @@ def update_emission_params_nb_nophasing_uniqvalues( model = Weighted_NegativeBinomial( y, features, weights=weights, exposure=exposure ) - - logger.info("Applying fit with default start params.") res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) @@ -1626,8 +1608,6 @@ def update_emission_params_nb_nophasing_uniqvalues( new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, @@ -1835,8 +1815,6 @@ def 
update_emission_params_nb_nophasing_uniqvalues_mix( tumor_prop=tp, penalty=0, ) - - logger.info("Applying fit with default start params.") res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) @@ -1849,8 +1827,6 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, @@ -2041,8 +2017,6 @@ def update_emission_params_bb_nophasing_uniqvalues( model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) - logger.info("Applying fit with default start params.") - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) for s, idx_state_posweight in enumerate(state_posweights): @@ -2054,8 +2028,6 @@ def update_emission_params_bb_nophasing_uniqvalues( new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, @@ -2275,8 +2247,6 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( y, features, weights=weights, exposure=exposure, tumor_prop=tp ) - logger.info("Applying fit with default start params.") - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) for s, idx_state_posweight in enumerate(state_posweights): @@ -2288,8 +2258,6 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, From 2003e04b856d60337e8948aae39974fdcc9f1bd8 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 17:19:24 -0400 Subject: [PATCH 069/125] log bandwidth --- src/calicost/utils_hmrf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calicost/utils_hmrf.py b/src/calicost/utils_hmrf.py index 13c6830..0619ceb 100644 --- a/src/calicost/utils_hmrf.py +++ b/src/calicost/utils_hmrf.py @@ -104,9 +104,11 @@ def choose_adjacency_by_readcounts( adjacency_mat.setdiag(1) adjacency_mat = adjacency_mat - smooth_mat adjacency_mat[adjacency_mat < 0] = 0 + if np.median(np.sum(adjacency_mat, axis=0).A.flatten()) >= 6: - print(f"bandwidth: {bandwidth}") + logger.info(f"Bandwidth={bandwidth}") break + return smooth_mat, adjacency_mat From 27603ecf64bb10e8cf246ef6f05974b95bf578ca Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 17:25:05 -0400 Subject: [PATCH 070/125] fix logging --- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmrf.py | 2 +- src/calicost/parse_input.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index caa94d0..701d63f 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -362,7 +362,7 @@ def run_baum_welch_nb_bb( use_defaults = (init_log_mu is None) and (init_p_binom is None) and (init_alphas is None) and (init_taus is None) - logger.info("Initialized Baum Welch NB logmean shift, BetaBinom prob and dispersion params inverse (use_defaults = {use_defaults}).") + logger.info(f"Initialized Baum Welch NB logmean shift, BetaBinom prob and dispersion params inverse (use_defaults = {use_defaults}).") # NB initialize start probability and emission probability log_startprob = np.log(np.ones(n_states) / n_states) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 6ff29d6..fbeac27 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -2117,7 +2117,7 @@ def hmrfmix_concatenate_pipeline( ) elif nodepotential == "weighted_sum": logger.info( - "Assigning HMRF clone for iteration {r} with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate." + f"Assigning HMRF clone for iteration {r} with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate." ) new_assignment, single_llf, total_llf = ( diff --git a/src/calicost/parse_input.py b/src/calicost/parse_input.py index 49221c5..f723e4a 100644 --- a/src/calicost/parse_input.py +++ b/src/calicost/parse_input.py @@ -264,7 +264,8 @@ def parse_visium(config): construct_adjacency_w=config["construct_adjacency_w"], ) n_pooled = np.median(np.sum(smooth_mat > 0, axis=0).A.flatten()) - print(f"Set up number of spots to pool in HMRF: {n_pooled}") + + logger.info(f"Set up number of spots to pool in HMRF: {n_pooled}") # If adjacency matrix is only constructed using gene expression similarity (e.g. scRNA-seq data) # Then, directly replace coords by the umap of gene expression, to avoid potential inconsistency in HMRF initialization From fafef1faa69e92e6bf35b4308ff793a593f051d2 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 18:43:25 -0400 Subject: [PATCH 071/125] fix --- src/calicost/hmm_NB_BB_nophasing.py | 2 +- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmm_NB_BB_phaseswitch.py | 1 - src/calicost/utils_hmm.py | 1 - src/calicost/utils_phase_switch.py | 1 - 5 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index b546989..f3e3251 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -363,7 +363,7 @@ def run_baum_welch_nb_bb( X[:, 1, :], total_bb_RD ) - for r in trange(max_iter, desc="EM algorithm"): + for r in range(max_iter): logger.info(f"Calculating E-step for iteration {r} of {max_iter}.") if tumor_prop is None: diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 701d63f..941008e 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -385,7 +385,7 @@ def run_baum_welch_nb_bb( X[:, 1, :], total_bb_RD ) - for r in trange(max_iter, desc="EM algorithm"): + for r in range(max_iter): logger.info(f"Calculating E-step (v2) for iteration {r} of {max_iter}.") if tumor_prop is None: diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 700c7c0..7492c47 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -6,7 +6,6 @@ from scipy.optimize import minimize from scipy.optimize import Bounds from sklearn.mixture import GaussianMixture -from tqdm import trange import statsmodels.api as sm from statsmodels.base.model import GenericLikelihoodModel import copy diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index b9ccb84..8eb96c0 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -3,7 +3,6 @@ import copy import scipy.special from numba import njit -from tqdm import trange from sklearn.mixture import GaussianMixture from calicost.utils_distribution_fitting import * diff --git a/src/calicost/utils_phase_switch.py b/src/calicost/utils_phase_switch.py index 2b30fa3..3b1007d 100644 --- a/src/calicost/utils_phase_switch.py +++ b/src/calicost/utils_phase_switch.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd from pathlib import Path -from tqdm import trange import scipy import scipy.special From ee1810e1965320f05b1fbfa976e372028c9cb1a8 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 18:46:31 -0400 Subject: [PATCH 072/125] fix logging --- src/calicost/hmm_NB_BB_nophasing.py | 2 +- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmm_NB_BB_phaseswitch.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index f3e3251..0a436b9 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -507,7 +507,7 @@ def run_baum_welch_nb_bb( p_binom = new_p_binom taus = new_taus - logger.info("Computed Baum-Welch (v2).") + logger.info("Computed Baum-Welch (v2) in {r+1} iterations.") return ( new_log_mu, diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 941008e..a360736 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -590,7 +590,7 @@ def run_baum_welch_nb_bb( p_binom = new_p_binom taus = new_taus - logger.info("Computed Baum-Welch (v2).") + logger.info("Computed Baum-Welch (v2) in {r+1} iterations.") logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 7492c47..b0b741c 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -575,7 +575,7 @@ def run_baum_welch_nb_bb( p_binom = new_p_binom taus = new_taus - logger.info("Computed Baum-Welch (sitewise).") + logger.info("Computed Baum-Welch (sitewise) in {r+1} iterations.") logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") @@ -1487,7 +1487,7 @@ def combine_similar_states_across_clones( ] = res["pred_cnv"][(c_keep * n_obs) : (c_keep * n_obs + n_obs)][ bidx ] - print( + logger.info( f"Merging states {[p1,p2]} in clone {c1} and clone {c2}. NP statistics = {t_neymanpearson}" ) return res From 03db264a41d1cae2b09df67c921c6c25270ff80c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 18:51:04 -0400 Subject: [PATCH 073/125] fix logging bug --- src/calicost/utils_hmm.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 8eb96c0..e953f90 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -1467,13 +1467,12 @@ def update_emission_params_nb_nophasing_uniqvalues( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ - - logger.info("Computing emission params for Negative Binomial (no phasing, unique) with {n_spots} spots and {n_states} states.") - n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) + logger.info("Computing emission params for Negative Binomial (no phasing, unique) with {n_spots} spots and {n_states} states.") + new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None @@ -1665,12 +1664,12 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. 
""" - logger.info(f"Computing emission params for Negative Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") - n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) + logger.info(f"Computing emission params for Negative Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") + new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None @@ -1886,11 +1885,11 @@ def update_emission_params_bb_nophasing_uniqvalues( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ - logger.info("Computing emission params for Beta Binomial (no phasing, unique) with {n_spots} spots and {n_states} states.") - n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) + + logger.info("Computing emission params for Beta Binomial (no phasing, unique) with {n_spots} spots and {n_states} states.") # NB initialization new_p_binom = ( @@ -2088,11 +2087,11 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ - logger.info(f"Computing emission params for Beta Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") - n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) + + logger.info(f"Computing emission params for Beta Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") # NB initialization new_p_binom = ( From cef4eb2e65163c5038ab5de20081fdc9a7f48f76 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 18:59:07 -0400 Subject: [PATCH 074/125] add TODO for gammas in phasing beta binomial --- src/calicost/hmm_NB_BB_phaseswitch.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index b0b741c..b4474b1 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -80,6 +80,7 @@ def compute_emission_probability_nb_betabinom( log_emission_rdr = np.zeros((2 * n_states, n_obs, n_spots)) log_emission_baf = np.zeros((2 * n_states, n_obs, n_spots)) + for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution @@ -94,22 +95,28 @@ def compute_emission_probability_nb_betabinom( log_emission_rdr[i + n_states, idx_nonzero_rdr, s] = ( log_emission_rdr[i, idx_nonzero_rdr, s] ) + # AF from BetaBinom distribution idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] + if len(idx_nonzero_baf) > 0: log_emission_baf[i, idx_nonzero_baf, s] = ( scipy.stats.betabinom.logpmf( X[idx_nonzero_baf, 1, s], total_bb_RD[idx_nonzero_baf, s], p_binom[i, s] * taus[i, s], - (1 - p_binom[i, s]) * taus[i, s], + (1. - p_binom[i, s]) * taus[i, s], ) ) + + # TODO + # log_emission_baf[i, idx_nonzero_baf, s] - scipy.special.loggamma(X[idx_nonzero_baf, 1, s] + p_binom[i, s] * taus[i, s]) - scipy.special.loggamma(total_bb_RD[idx_nonzero_baf, s] - X[idx_nonzero_baf, 1, s] + (1. - p_binom[i, s]) * taus[i, s]) + scipy.special.loggamma(X[idx_nonzero_baf, 1, s] + (1. 
- p_binom[i, s]) * taus[i, s]) + scipy.special.loggamma(total_bb_RD[idx_nonzero_baf, s] - X[idx_nonzero_baf, 1, s] + p_binom[i, s] * taus[i, s]) + log_emission_baf[i + n_states, idx_nonzero_baf, s] = ( scipy.stats.betabinom.logpmf( X[idx_nonzero_baf, 1, s], total_bb_RD[idx_nonzero_baf, s], - (1 - p_binom[i, s]) * taus[i, s], + (1. - p_binom[i, s]) * taus[i, s], p_binom[i, s] * taus[i, s], ) ) From fbbb9b5bb4dfe9bcd969cb51f31b7d056d611652 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 19:01:31 -0400 Subject: [PATCH 075/125] fix --- src/calicost/hmm_NB_BB_phaseswitch.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index b4474b1..0d08274 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -75,7 +75,7 @@ def compute_emission_probability_nb_betabinom( n_states = log_mu.shape[0] logger.info( - f"Computing emission probability for negative binomial & beta binomial (sitewise) with n_spots and n_states = {n_spots} and {n_states}." + f"Computing emission probability for negative binomial & beta binomial (sitewise, phaseswitch) with n_spots and n_states = {n_spots} and {n_states}." ) log_emission_rdr = np.zeros((2 * n_states, n_obs, n_spots)) @@ -110,7 +110,8 @@ def compute_emission_probability_nb_betabinom( ) # TODO - # log_emission_baf[i, idx_nonzero_baf, s] - scipy.special.loggamma(X[idx_nonzero_baf, 1, s] + p_binom[i, s] * taus[i, s]) - scipy.special.loggamma(total_bb_RD[idx_nonzero_baf, s] - X[idx_nonzero_baf, 1, s] + (1. - p_binom[i, s]) * taus[i, s]) + scipy.special.loggamma(X[idx_nonzero_baf, 1, s] + (1. - p_binom[i, s]) * taus[i, s]) + scipy.special.loggamma(total_bb_RD[idx_nonzero_baf, s] - X[idx_nonzero_baf, 1, s] + p_binom[i, s] * taus[i, s]) + # log_emission_baf[i, idx_nonzero_baf, s] - scipy.special.loggamma(X[idx_nonzero_baf, 1, s] + p_binom[i, s] * taus[i, s]) - scipy.special.loggamma(total_bb_RD[idx_nonzero_baf, s] - X[idx_nonzero_baf, 1, s] + (1. - p_binom[i, s]) * taus[i, s]) + # + scipy.special.loggamma(X[idx_nonzero_baf, 1, s] + (1. - p_binom[i, s]) * taus[i, s]) + scipy.special.loggamma(total_bb_RD[idx_nonzero_baf, s] - X[idx_nonzero_baf, 1, s] + p_binom[i, s] * taus[i, s]) log_emission_baf[i + n_states, idx_nonzero_baf, s] = ( scipy.stats.betabinom.logpmf( @@ -122,7 +123,7 @@ def compute_emission_probability_nb_betabinom( ) logger.info( - "Computed emission probability for negative binomial & beta binomial (sitewise)." + "Computed emission probability for negative binomial & beta binomial (sitewise, phaseswitch)." ) return log_emission_rdr, log_emission_baf @@ -169,7 +170,7 @@ def compute_emission_probability_nb_betabinom_mix( Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ logger.info( - "Computing emission probability for *mixed* negative binomial & beta binomial (sitewise)." + "Computing emission probability for *mixed* negative binomial & beta binomial (sitewise, phaseswitch)." ) n_obs = X.shape[0] @@ -224,7 +225,7 @@ def compute_emission_probability_nb_betabinom_mix( ) logger.info( - "Computed emission probability for *mixed* negative binomial & beta binomial (sitewise)." + "Computed emission probability for *mixed* negative binomial & beta binomial (sitewise, phaseswitch)." ) return log_emission_rdr, log_emission_baf From 3409014320d83f97e2388552418cb84e40d2c09b Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 19:04:33 -0400 Subject: [PATCH 076/125] fix --- src/calicost/calicost_main.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 12be38d..0eb2738 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -137,6 +137,8 @@ def main(configuration_file): for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c + logger.info(f"Writing initial assignment to {file_path}") + np.savez( str(file_path), **{"num_iterations": 0, "round-1_assignment": initial_assignment}, @@ -464,7 +466,7 @@ def main(configuration_file): single_base_nb_mean = copy_single_base_nb_mean n_obs = single_X.shape[0] - logger.info(f"Writing {outdir}/binned_data.npz") + logger.info(f"Writing lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, single_tumor_prop to {outdir}/binned_data.npz") np.savez( f"{outdir}/binned_data.npz", @@ -519,6 +521,8 @@ def main(configuration_file): for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c + logger.info(f"Writing initial assignment to {file_path}") + np.savez( str(file_path), **{ @@ -987,6 +991,7 @@ def main(configuration_file): return_posterior=True, ) ) + res_combine["total_llf"] = total_llf res_combine["new_assignment"] = new_assignment @@ -996,7 +1001,7 @@ def main(configuration_file): ) logger.info( - f"Writing {outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz" + f"Writing likelihood and new clone assignment to {outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz" ) np.savez( @@ -1004,7 +1009,7 @@ def main(configuration_file): **res_combine, ) - logger.info(f"Writing {outdir}/posterior_clone_probability.npy") + logger.info(f"Writing posterior to {outdir}/posterior_clone_probability.npy") np.save(f"{outdir}/posterior_clone_probability.npy", posterior) From f3c06bc10ad4a60361d0f43265737d966a543f44 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 19:08:13 -0400 Subject: [PATCH 077/125] fix --- src/calicost/hmm_NB_BB_phaseswitch.py | 6 ++++-- src/calicost/hmrf.py | 14 ++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 0d08274..0aceb3f 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -1008,9 +1008,11 @@ def pipeline_baum_welch( pred = np.argmax(log_gamma, axis=0) pred_cnv = pred % n_states - # save results if not output_prefix is None: - tmp = np.log10(1 - t) + tmp = np.log10(1. 
- t) + + logger.info(f"Writing new parameters to {output_prefix}_nstates{n_states}_{params}_{tmp:.0f}_seed{random_state}.npz") + np.savez( f"{output_prefix}_nstates{n_states}_{params}_{tmp:.0f}_seed{random_state}.npz", new_log_mu=new_log_mu, diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index fbeac27..02a0147 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -785,7 +785,7 @@ def hmrf_pipeline( res["total_llf"] = total_llf logger.info( - f"Writing HMM iteration {r} to {outdir}/round{r}_nstates{n_states}_{params}.npz" + f"Writing likelihood, previous and new assignment to HMM iteration {r} to {outdir}/round{r}_nstates{n_states}_{params}.npz" ) np.savez(f"{outdir}/round{r}_nstates{n_states}_{params}.npz", **res) @@ -1071,7 +1071,6 @@ def hmrf_concatenate_pipeline( res["new_assignment"] = new_assignment res["total_llf"] = total_llf - # append to allres for k, v in res.items(): if k == "prev_assignment": allres[f"round{r-1}_assignment"] = v @@ -1083,7 +1082,7 @@ def hmrf_concatenate_pipeline( allres["num_iterations"] = r + 1 logger.info( - f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" + f"Writing assignments for HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" ) np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) @@ -1094,6 +1093,7 @@ def hmrf_concatenate_pipeline( np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) ] + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, clone_index ) @@ -1588,12 +1588,11 @@ def hmrfmix_pipeline( remaining_clones = np.sort(np.unique(new_assignment)) re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) - # + res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf - # append to allres for k, v in res.items(): if k == "prev_assignment": allres[f"round{r-1}_assignment"] = v @@ -1602,6 +1601,9 @@ def hmrfmix_pipeline( else: allres[f"round{r}_{k}"] = v allres["num_iterations"] = r + 1 + + logger.info("Writing assignments to {outdir}/{prefix}_nstates{n_states}_{params}.npz") + np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) logger.info(f"Found a new clone assignment for {n_spots} spots:\n{np.unique(new_assignment, return_counts=True)}") @@ -2168,7 +2170,7 @@ def hmrfmix_concatenate_pipeline( allres["num_iterations"] = r + 1 logger.info( - f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" + f"Writing assignments for HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" ) np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) From 48a34a4193877313f881e8f07acd18c29ae5d185 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 19:15:24 -0400 Subject: [PATCH 078/125] fix --- src/calicost/hmrf.py | 45 ++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 02a0147..d0e5c94 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -64,7 +64,7 @@ def hmrf_reassignment_posterior( f"Computing hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass} for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." 
) - for i in trange(N, desc="hmrf_reassignment_posterior"): + for i in range(N): idx = smooth_mat[i, :].nonzero()[1] for c in range(n_clones): tmp_log_emission_rdr, tmp_log_emission_baf = ( @@ -137,7 +137,7 @@ def hmrf_reassignment_posterior( ) logger.info( - "Computed hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." + f"Computed hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." ) logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") @@ -181,7 +181,7 @@ def aggr_hmrf_reassignment( "Computing aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." ) - for i in trange(N, desc="aggr_hmrf_reassignment"): + for i in range(N): idx = smooth_mat[i, :].nonzero()[1] for c in range(n_clones): @@ -239,7 +239,7 @@ def aggr_hmrf_reassignment( ) logger.info( - "Computed aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." + f"Computed aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." ) logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") @@ -280,7 +280,7 @@ def hmrf_reassignment_posterior_concatenate( "Computing hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." ) - for i in trange(N, desc="hmrf_reassignment_posterior_concatenate"): + for i in range(N): idx = smooth_mat[i, :].nonzero()[1] tmp_log_emission_rdr, tmp_log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( @@ -355,7 +355,7 @@ def hmrf_reassignment_posterior_concatenate( ) logger.info( - "Computed hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + f"Computed hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." ) logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") @@ -438,9 +438,9 @@ def aggr_hmrf_reassignment_concatenate( posterior = np.zeros((N, n_clones)) - for i in trange(N, desc="aggr_hmrf_reassignment_concatenate"): + for i in range(N): idx = smooth_mat[i, :].nonzero()[1] - # idx = np.append(idx, np.array([i])) + tmp_log_emission_rdr, tmp_log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( np.sum(single_X[:, :, idx], axis=2, keepdims=True), @@ -496,7 +496,7 @@ def aggr_hmrf_reassignment_concatenate( ) logger.info( - "Computed aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + f"Computed aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." ) logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") @@ -1160,18 +1160,22 @@ def aggr_hmrfmix_reassignment( hmmclass=hmm_sitewise, return_posterior=False, ): + logger.info( + f"Computing aggr_hmrfmix_reassignment with compute_emission_probability_nb_betabinom of {hmmclass}." 
+ ) + N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) - # + posterior = np.zeros((N, n_clones)) - # - for i in trange(N): + + for i in range(N): idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] for c in range(n_clones): @@ -1246,7 +1250,11 @@ def aggr_hmrfmix_reassignment( ) logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") - + + logger.info( + f"Computed aggr_hmrfmix_reassignment with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -1424,6 +1432,11 @@ def hmrfmix_pipeline( ): n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) + + logger.info( + f"Computing hmrfmix_pipeline for (N, n_obs, n_clones) = ({n_spots}, {n_obs}, {n_clones})." + ) + # spot adjacency matric assert not (coords is None and adjacency_mat is None) if adjacency_mat is None: @@ -1664,6 +1677,10 @@ def hmrfmix_pipeline( :, sidx ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) + logger.info( + f"Computed hmrfmix_pipeline for (N, n_obs, n_clones) = ({n_spots}, {n_obs}, {n_clones})." + ) + def hmrfmix_reassignment_posterior_concatenate( single_X, From c58ea896f92607b8840a6063e6c3c4f94ed2cc3d Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 19:17:18 -0400 Subject: [PATCH 079/125] add hmrf logging --- src/calicost/hmrf.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index d0e5c94..5963aa2 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1845,6 +1845,11 @@ def aggr_hmrfmix_reassignment_concatenate( n_obs = single_X.shape[0] n_clones = int(len(pred) / n_obs) n_states = res["new_p_binom"].shape[0] + + logger.info( + f"Computing aggr_hmrfmix_reassignment_concatenate for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." + ) + single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) @@ -1925,6 +1930,10 @@ def aggr_hmrfmix_reassignment_concatenate( ) ) + logger.info( + f"Computed aggr_hmrfmix_reassignment_concatenate for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." + ) + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") if return_posterior: From c7d09d9f9820b51e37ccc0475934b795e73744c7 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 19:49:28 -0400 Subject: [PATCH 080/125] add cpas --- src/calicost/calicost_main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 0eb2738..4975141 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -333,7 +333,7 @@ def main(configuration_file): ) logger.info( - "Preparing refinement of initial, merged clones using BAF & RDR ****" + "**** Preparing refinement of initial, merged clones using BAF & RDR ****" ) if not config["bafonly"]: @@ -481,7 +481,7 @@ def main(configuration_file): ) logger.info( - f"**** Refining initial, merged clones (N={n_baf_clones}) using BAF & RDR ****" + f"**** REFINING INITIAL, MERGED CLONES (N={n_baf_clones}) USING BAF & RDR ****" ) for bafc in range(n_baf_clones): From fbfe56b1df470eff18b643952498f48e68efb53c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 20:05:00 -0400 Subject: [PATCH 081/125] log phasing baum welch. --- src/calicost/parse_input.py | 24 ++++++++---------------- src/calicost/phasing.py | 20 +++++++++++++++++++- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/calicost/parse_input.py b/src/calicost/parse_input.py index f723e4a..16af14a 100644 --- a/src/calicost/parse_input.py +++ b/src/calicost/parse_input.py @@ -160,6 +160,7 @@ def parse_visium(config): logphase_shift=config["logphase_shift"], geneticmap_file=config["geneticmap_file"], ) + # infer an initial phase using pseudobulk if not Path(f"{config['output_dir']}/initial_phase.npz").exists(): initial_clone_for_phasing = perform_partition( @@ -170,6 +171,7 @@ def parse_visium(config): single_tumor_prop=single_tumor_prop, threshold=config["tumorprop_threshold"], ) + phase_indicator, refined_lengths = initial_phase_given_partition( single_X, lengths, @@ -190,11 +192,15 @@ def parse_visium(config): 1e-3, threshold=config["tumorprop_threshold"], ) + + logger.info(f"Writing initial pase to {config['output_dir']}/initial_phase.npz") + np.savez( f"{config['output_dir']}/initial_phase.npz", phase_indicator=phase_indicator, refined_lengths=refined_lengths, ) + # map phase indicator to individual snps df_gene_snp["phase"] = np.where( df_gene_snp.snp_id.isnull(), @@ -228,21 +234,6 @@ def parse_visium(config): logphase_shift=config["logphase_shift"], geneticmap_file=config["geneticmap_file"], ) - # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps = perform_binning_new(lengths, single_X, \ - # single_base_nb_mean, single_total_bb_RD, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps, phase_indicator, refined_lengths, config["binsize"], config["rdrbinsize"], config["nu"], config["logphase_shift"], secondary_min_umi=secondary_min_umi) - - # # remove bins where normal spots have imbalanced SNPs - # if not config["tumorprop_file"] is None: - # for prop_threshold in np.arange(0, 0.6, 0.05): - # normal_candidate = (single_tumor_prop <= prop_threshold) - # if np.sum(single_X[:, 0, (normal_candidate==True)]) > single_X.shape[0] * 200: - # break - # index_normal = np.where(normal_candidate)[0] - # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \ - # single_X, single_base_nb_mean, single_total_bb_RD, config["nu"], config["logphase_shift"], index_normal, config['geneticmap_file']) - # assert np.sum(lengths) == single_X.shape[0] - # 
assert single_X.shape[0] == single_total_bb_RD.shape[0] - # assert single_X.shape[0] == len(log_sitewise_transmat) # expression count dataframe exp_counts = pd.DataFrame.sparse.from_spmatrix( @@ -263,9 +254,10 @@ def parse_visium(config): maxspots_pooling=config["maxspots_pooling"], construct_adjacency_w=config["construct_adjacency_w"], ) + n_pooled = np.median(np.sum(smooth_mat > 0, axis=0).A.flatten()) - logger.info(f"Set up number of spots to pool in HMRF: {n_pooled}") + logger.info(f"Set up number of spots to pool for HMRF: {n_pooled}") # If adjacency matrix is only constructed using gene expression similarity (e.g. scRNA-seq data) # Then, directly replace coords by the umap of gene expression, to avoid potential inconsistency in HMRF initialization diff --git a/src/calicost/phasing.py b/src/calicost/phasing.py index e4c9447..385a8be 100644 --- a/src/calicost/phasing.py +++ b/src/calicost/phasing.py @@ -102,11 +102,19 @@ def initial_phase_given_partition( threshold, min_snpumi=2e3, ): + + n_obs, _, n_spots = single_X.shape + + logger.info(f"Computing initial_phase_given_partition for (n_states, n_obs, n_spots) = ({n_states}, {n_obs}, {n_spots}).") + + # TODO HARDCODE EPS_BAF = 0.05 + if single_tumor_prop is None: X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index ) + tumor_prop = None else: X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( @@ -121,6 +129,7 @@ def initial_phase_given_partition( # pseudobulk HMM for phase_prob baf_profiles = np.zeros((X.shape[2], X.shape[0])) pred_cnv = np.zeros((X.shape[2], X.shape[0])) + for i in range(X.shape[2]): if np.sum(total_bb_RD[:, i]) < min_snpumi: baf_profiles[i, :] = 0.5 @@ -150,7 +159,7 @@ def initial_phase_given_partition( max_iter=max_iter, tol=tol, ) - # + pred = np.argmax(res["log_gamma"], axis=0) this_baf_profiles = np.where( pred < n_states, @@ -180,10 +189,12 @@ def initial_phase_given_partition( ) @ baf_profiles ) + adj_baf_profiles = np.where(baf_profiles < 0.5, baf_profiles, 1 - baf_profiles) phase_indicator = population_baf < 0.5 refined_lengths = [] cumlen = 0 + for le in lengths: s = 0 for i in range(le): @@ -199,14 +210,19 @@ def initial_phase_given_partition( refined_lengths.append(le - s) cumlen += le refined_lengths = np.array(refined_lengths) + + logger.info(f"Computed initial_phase_given_partition.") + return phase_indicator, refined_lengths def perform_partition(coords, sample_ids, x_part, y_part, single_tumor_prop, threshold): initial_clone_index = [] + for s in range(np.max(sample_ids) + 1): index = np.where(sample_ids == s)[0] assert len(index) > 0 + if single_tumor_prop is None: tmp_clone_index = fixed_rectangle_initialization( coords[index, :], x_part, y_part @@ -219,6 +235,8 @@ def perform_partition(coords, sample_ids, x_part, y_part, single_tumor_prop, thr single_tumor_prop[index], threshold=threshold, ) + for x in tmp_clone_index: initial_clone_index.append(index[x]) + return initial_clone_index From 8969c351dfe2bec34566735b12ed31daadb7bfd0 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 20:13:51 -0400 Subject: [PATCH 082/125] fix --- src/calicost/calicost_main.py | 2 +- src/calicost/phasing.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 4975141..6f7473d 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -84,7 +84,7 @@ def main(configuration_file): exp_counts, ) = run_parse_n_load(config) - logger.info(f"**** Estimating initial clones using BAF only ****") + logger.info(f"**** ESTIMATING INITIAL CLONES USING BAF ONLY ****") # NB setting transcript & baseline count to 0 so the emission probability will be ignored. copy_single_X_rdr = copy.copy(single_X[:, 0, :]) diff --git a/src/calicost/phasing.py b/src/calicost/phasing.py index 385a8be..745f4eb 100644 --- a/src/calicost/phasing.py +++ b/src/calicost/phasing.py @@ -104,7 +104,8 @@ def initial_phase_given_partition( ): n_obs, _, n_spots = single_X.shape - + + logger.info(f"**** COMPUTING INITIAL PHASE ****") logger.info(f"Computing initial_phase_given_partition for (n_states, n_obs, n_spots) = ({n_states}, {n_obs}, {n_spots}).") # TODO HARDCODE From 726bc11b118e4960fa03aa4d612f912c8cabe27c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 20:39:41 -0400 Subject: [PATCH 083/125] log initial alpha and tau --- src/calicost/hmm_NB_BB_nophasing_v2.py | 3 +++ src/calicost/hmm_NB_BB_phaseswitch.py | 12 ++++++++---- src/calicost/utils_IO.py | 2 +- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index a360736..e851014 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -361,6 +361,9 @@ def run_baum_welch_nb_bb( taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus use_defaults = (init_log_mu is None) and (init_p_binom is None) and (init_alphas is None) and (init_taus is None) + + logger.info(f"Initial alphas:\n{alphas}") + logger.info(f"Initial taus:\n{taus}") logger.info(f"Initialized Baum Welch NB logmean shift, BetaBinom prob and dispersion params inverse (use_defaults = {use_defaults}).") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 0aceb3f..0293cf5 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -394,12 +394,9 @@ def run_baum_welch_nb_bb( n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] + assert n_comp == 2 - logger.info( - "Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse (sitewise)." - ) - log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None @@ -417,6 +414,13 @@ def run_baum_welch_nb_bb( ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus + logger.info(f"Initial alphas:\n{alphas}") + logger.info(f"Initial taus:\n{taus}") + + logger.info( + "Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse (sitewise)." 
+ ) + # NB initialize start probability and emission probability log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index 2f0ee47..6e38ff8 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -1522,7 +1522,7 @@ def bin_selection_basedon_normal( min_betabinom_tau=30, ): """ - Filter out bins that potential contain somatic mutations based on BAF of normal spots. + Filter out bins that potentially contain somatic mutations based on BAF of normal spots. """ # pool B allele counts for each bin across all normal spots tmpX = np.sum(single_X[:, 1, index_normal], axis=1) From 6f85e89e0bee44a088111a9a6cf82a4ba923909d Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 20:53:22 -0400 Subject: [PATCH 084/125] fix --- src/calicost/calicost_main.py | 2 +- src/calicost/hmrf.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 6f7473d..1ef0c62 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -485,7 +485,7 @@ def main(configuration_file): ) for bafc in range(n_baf_clones): - logger.info(f"Refining BAF clone {bafc}.") + logger.info(f"**** Refining BAF clone {bafc} ****") prefix = f"clone{bafc}" idx_spots = np.where(merged_baf_assignment == bafc)[0] diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 5963aa2..0a5b853 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1815,8 +1815,9 @@ def hmrfmix_reassignment_posterior_concatenate( ) ) + unique_assignment, cnts = np.unique(new_assignment, return_counts=True) + logger.info(f"Computed hmrfmix_reassignment_posterior_concatenate.") - logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") if return_posterior: From 160ca9c694976846fe0e39c5c34d841d4f762b6f Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 21:03:12 -0400 Subject: [PATCH 085/125] fix --- src/calicost/calicost_main.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 1ef0c62..d1619af 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -596,7 +596,7 @@ def main(configuration_file): tumorprop_threshold=config["tumorprop_threshold"], ) - logger.info(f"Combining results across clones.") + logger.info(f"**** REFINED CLONES BY RDR. COMBINING RESULTS ACROSS CLONES. ****") res_combine = {"prev_assignment": np.zeros(single_X.shape[2], dtype=int)} offset_clone = 0 @@ -672,7 +672,7 @@ def main(configuration_file): tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) logger.info( - f"Merging BAF+RDR clones based on Neyman-Pearson Likelihood ratio." + f"**** MERGING BAF+RDR CLONES BASED ON NEYMAN-PEARSON LIKELIHOOD RATIO ****" ) merging_groups, merged_res = ( @@ -750,7 +750,7 @@ def main(configuration_file): ) logger.info( - f"Running Baum-Welch with refined & merged BAF+RDR clones." 
+ f"**** EVALUATING BAUM-WELCH WITH REFINED & MERGED BAF+RDR CLONES ****" ) merged_res = pipeline_baum_welch( @@ -1000,6 +1000,10 @@ def main(configuration_file): res_combine, posterior, single_tumor_prop ) + logger.info( + f"**** EVALUATED BAUM-WELCH WITH REFINED & MERGED BAF+RDR CLONES ****" + ) + logger.info( f"Writing likelihood and new clone assignment to {outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz" ) From 4a8103f76cd7d78ae465ae869eecce0293cc1b38 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 21:13:04 -0400 Subject: [PATCH 086/125] fix --- src/calicost/utils_IO.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index 6e38ff8..801049c 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -114,8 +114,8 @@ def load_data( ).A.flatten() genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] - print(adata) - print( + logger.info(adata) + logger.info( "median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) @@ -130,7 +130,7 @@ def load_data( [(not x in filter_gene_list) for x in adata.var.index] ) adata = adata[:, indicator_filter] - print( + logger.info( "median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) @@ -172,13 +172,13 @@ def load_data( clf = LocalOutlierFactor(n_neighbors=200) label = clf.fit_predict(np.sum(adata.layers["count"], axis=0).reshape(-1, 1)) adata.layers["count"][:, np.where(label == -1)[0]] = 0 - print("filter out {} outlier genes.".format(np.sum(label == -1))) + logger.info("filter out {} outlier genes.".format(np.sum(label == -1))) if not normalidx_file is None: normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:, 0].values adata.obs["tumor_annotation"] = "tumor" adata.obs["tumor_annotation"][adata.obs.index.isin(normal_barcodes)] = "normal" - print(adata.obs["tumor_annotation"].value_counts()) + logger.info(adata.obs["tumor_annotation"].value_counts()) return adata, cell_snp_Aallele.A, cell_snp_Ballele.A, unique_snp_ids @@ -374,8 +374,8 @@ def load_joint_data( ).A.flatten() genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] - print(adata) - print( + logger.info(adata) + logger.info( "median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) @@ -388,7 +388,7 @@ def load_joint_data( [(not x in filter_gene_list) for x in adata.var.index] ) adata = adata[:, indicator_filter] - print( + logger.info( "median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) @@ -430,13 +430,13 @@ def load_joint_data( clf = LocalOutlierFactor(n_neighbors=200) label = clf.fit_predict(np.sum(adata.layers["count"], axis=0).reshape(-1, 1)) adata.layers["count"][:, np.where(label == -1)[0]] = 0 - print("filter out {} outlier genes.".format(np.sum(label == -1))) + logger.info("filter out {} outlier genes.".format(np.sum(label == -1))) if not normalidx_file is None: normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:, 0].values adata.obs["tumor_annotation"] = "tumor" adata.obs["tumor_annotation"][adata.obs.index.isin(normal_barcodes)] = "normal" - print(adata.obs["tumor_annotation"].value_counts()) + logger.info(adata.obs["tumor_annotation"].value_counts()) return ( adata, @@ -548,8 +548,8 @@ def 
filter_genes_barcodes_hatchetblock( ).A.flatten() genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] - print(adata) - print( + logger.info(adata) + logger.info( "median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) @@ -562,7 +562,7 @@ def filter_genes_barcodes_hatchetblock( [(not x in filter_gene_list) for x in adata.var.index] ) adata = adata[:, indicator_filter] - print( + logger.info( "median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) @@ -1680,7 +1680,7 @@ def filter_de_genes( ) ) filtered_out_set = filtered_out_set | this_filtered_out_set - print(f"Filter out {len(filtered_out_set)} DE genes") + logger.info(f"Filter out {len(filtered_out_set)} DE genes") # new_single_X_rdr = np.zeros((len(x_gene_list), adata.shape[0])) for i, x in enumerate(x_gene_list): From 575ca764c0917b1aba4ff415771904b5046343be Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 21:31:54 -0400 Subject: [PATCH 087/125] fix --- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmm_NB_BB_phaseswitch.py | 2 +- src/calicost/hmrf.py | 25 +++++++++++++------------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index e851014..b3ed727 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -593,7 +593,7 @@ def run_baum_welch_nb_bb( p_binom = new_p_binom taus = new_taus - logger.info("Computed Baum-Welch (v2) in {r+1} iterations.") + logger.info(f"Computed Baum-Welch (v2) in {r+1} iterations.") logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 0293cf5..b32191b 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -587,7 +587,7 @@ def run_baum_welch_nb_bb( p_binom = new_p_binom taus = new_taus - logger.info("Computed Baum-Welch (sitewise) in {r+1} iterations.") + logger.info(f"Computed Baum-Welch (sitewise) in {r+1} iterations.") logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 0a5b853..b6be6ca 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -66,6 +66,7 @@ def hmrf_reassignment_posterior( for i in range(N): idx = smooth_mat[i, :].nonzero()[1] + for c in range(n_clones): tmp_log_emission_rdr, tmp_log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( @@ -119,7 +120,7 @@ def hmrf_reassignment_posterior( if new_assignment[j] >= 0: w_edge[new_assignment[j]] += adjacency_mat[i, j] new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) - # + posterior[i, :] = np.exp( w_node + spatial_weight * w_edge @@ -128,6 +129,7 @@ def hmrf_reassignment_posterior( # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) + for i in range(N): total_llf += np.sum( spatial_weight @@ -178,7 +180,7 @@ def aggr_hmrf_reassignment( posterior = np.zeros((N, n_clones)) logger.info( - "Computing aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." 
+ "Computing aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass} for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." ) for i in range(N): @@ -277,7 +279,7 @@ def hmrf_reassignment_posterior_concatenate( posterior = np.zeros((N, n_clones)) logger.info( - "Computing hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + "Computing hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass} for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." ) for i in range(N): @@ -424,11 +426,6 @@ def aggr_hmrf_reassignment_concatenate( total_llf : float The HMRF objective, which is the sum of log likelihood under the optimal labels plus the sum of edge potentials. """ - - logger.info( - "Computing aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." - ) - N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = int(len(pred) / n_obs) @@ -436,6 +433,10 @@ def aggr_hmrf_reassignment_concatenate( single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) + logger.info( + "Computing aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass} for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." + ) + posterior = np.zeros((N, n_clones)) for i in range(N): @@ -1160,16 +1161,16 @@ def aggr_hmrfmix_reassignment( hmmclass=hmm_sitewise, return_posterior=False, ): - logger.info( - f"Computing aggr_hmrfmix_reassignment with compute_emission_probability_nb_betabinom of {hmmclass}." - ) - N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) + + logger.info( + f"Computing aggr_hmrfmix_reassignment with compute_emission_probability_nb_betabinom of {hmmclass} for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." + ) lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) From 75a913279074a8657c96b0d67fdf04f05933008e Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 19 Aug 2024 06:44:04 -0400 Subject: [PATCH 088/125] fix --- src/calicost/hmrf.py | 6 +++--- src/calicost/utils_IO.py | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index b6be6ca..425c8c7 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -661,7 +661,7 @@ def hmrf_pipeline( sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) - logger.info("Merging pseudobulk by clone index") + logger.info("Merging pseudobulk based on clone index") X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index @@ -907,7 +907,7 @@ def hmrf_concatenate_pipeline( log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) - logger.info("Merging pseudobulk by clone index") + logger.info("Merging pseudobulk based on clone index") X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index @@ -2000,7 +2000,7 @@ def hmrfmix_concatenate_pipeline( sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * (-np.log(n_clones)) - logger.info("Merging pseudobulk by clone index") + logger.info("Merging pseudobulk based on clone index") X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( single_X, diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index 801049c..b3a6e4a 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -199,8 +199,8 @@ def load_joint_data( columns=dict(zip(df_meta.columns[:3], ["bam", "sample_id", "spaceranger_dir"])), inplace=True, ) - logger.info(f"Input spaceranger file list {input_filelist} contains:") - logger.info(df_meta) + logger.info(f"Input spaceranger file list {input_filelist} contains:\n{df_meta}") + df_barcode = pd.read_csv( f"{snp_dir}/barcodes.txt", header=None, names=["combined_barcode"] ) @@ -376,7 +376,7 @@ def load_joint_data( adata = adata[:, indicator] logger.info(adata) logger.info( - "median UMI after filtering out genes < 0.5% of cells = {}".format( + "Median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) ) @@ -389,7 +389,7 @@ def load_joint_data( ) adata = adata[:, indicator_filter] logger.info( - "median UMI after filtering out genes in filtergenelist_file = {}".format( + "Median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) ) @@ -430,7 +430,8 @@ def load_joint_data( clf = LocalOutlierFactor(n_neighbors=200) label = clf.fit_predict(np.sum(adata.layers["count"], axis=0).reshape(-1, 1)) adata.layers["count"][:, np.where(label == -1)[0]] = 0 - logger.info("filter out {} outlier genes.".format(np.sum(label == -1))) + + logger.info("Filter out {} outlier genes.".format(np.sum(label == -1))) if not normalidx_file is None: normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:, 0].values From 35ee3427e77da8851208772f4295132336940616 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 19 Aug 2024 07:49:05 -0400 Subject: [PATCH 089/125] fix --- src/calicost/hmm_NB_BB_nophasing_v2.py | 1 + src/calicost/hmrf.py | 2 +- src/calicost/utils_hmm.py | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index b3ed727..eb6bfd5 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -389,6 +389,7 @@ def run_baum_welch_nb_bb( ) for r in range(max_iter): + logger.info("-" * 250) logger.info(f"Calculating E-step (v2) for iteration {r} of {max_iter}.") if tumor_prop is None: diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 425c8c7..9e3263c 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -2198,7 +2198,7 @@ def hmrfmix_concatenate_pipeline( allres["num_iterations"] = r + 1 logger.info( - f"Writing assignments for HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" + f"Writing round ({r}, {k}) assignments for HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" ) np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index e953f90..a731e19 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -1363,7 +1363,7 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for Beta Binomial Mix custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial Mix with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -1842,7 +1842,7 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for Negative Binomial Mix custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Negative Binomial Mix with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -2043,7 +2043,7 @@ def update_emission_params_bb_nophasing_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for Beta Binomial custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -2273,7 +2273,7 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for Beta Binomial Mix custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial Mix with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): From 0b8720a2519e366046ec05bc057ed3d486a065fc Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Tue, 20 Aug 2024 10:19:22 -0400 Subject: [PATCH 090/125] write Weighted Beta Binom chain file --- src/calicost/utils_distribution_fitting.py | 84 +++++++++++++++++----- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 1ca6d9a..6db5a15 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -1,22 +1,23 @@ +import contextlib import functools import inspect import logging +import os +import sys +import time import numpy as np import scipy -import time -from scipy import linalg, special -from scipy.special import logsumexp, loggamma import scipy.integrate import scipy.stats +import statsmodels +import statsmodels.api as sm from numba import jit, njit +from scipy import linalg, special +from scipy.special import loggamma, logsumexp from sklearn import cluster from sklearn.utils import check_random_state -import statsmodels -import statsmodels.api as sm from statsmodels.base.model import GenericLikelihoodModel -import os - logger = logging.getLogger(__name__) @@ -40,6 +41,20 @@ def convert_params(mean, std): return n, p +@contextlib.contextmanager +def save_stdout(fpath): + original = sys.stdout + + with open(fpath, "w") as ff: + sys.stdout = ff + try: + yield + + # NB teardown + finally: + sys.stdout = original + + class Weighted_NegativeBinomial(GenericLikelihoodModel): """ Negative Binomial model endog ~ NB(exposure * exp(exog @ params[:-1]), params[-1]), where exog is the design matrix, and params[-1] is 1 / overdispersion. @@ -170,7 +185,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): return result - + class Weighted_BetaBinom(GenericLikelihoodModel): """ Beta-binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. @@ -190,6 +205,8 @@ class Weighted_BetaBinom(GenericLikelihoodModel): exposure : array, (n_samples,) Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
""" + ninstance = 0 + def __init__(self, endog, exog, weights, exposure, **kwds): super(Weighted_BetaBinom, self).__init__(endog, exog, **kwds) @@ -198,12 +215,25 @@ def __init__(self, endog, exog, weights, exposure, **kwds): self.weights = weights self.exposure = exposure + # NB update the instance count + Weighted_BetaBinom.ninstance += 1 + + def nloglikeobs(self, params): a = (self.exog @ params[:-1]) * params[-1] b = (1 - self.exog @ params[:-1]) * params[-1] return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + def callback(self, params): + nloglike = self.nloglikeobs(params) + + print(params, nloglike) + + @classmethod + def get_ninstance(cls): + return cls.ninstance + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): self.exog_names.append("tau") @@ -223,16 +253,33 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start = time.time() - result = super(Weighted_BetaBinom, self).fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=None, - full_output=True, - retall=False, - **kwds - ) + # NB kwds = {'xtol': 0.0001, 'ftol': 0.0001, disp: False} + kwds.pop("disp", None) + + with save_stdout("weighted_betabinom_chain.tmp"): + result = super(Weighted_BetaBinom, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + skip_hessian=True, + callback=self.callback, + full_output=True, + retall=True, + disp=False, + **kwds + ) + + with open("weighted_betabinom_chain.tmp") as fin: + with open("weighted_betabinom_chain.txt", "w") as fout: + fout.write(f"# Weighted_BetaBinom {Weighted_BetaBinom.get_ninstance()} @ {time.asctime()}:\n") + fout.write(f"start_type={start_params_str}, shape={self.endog.shape[0]}" + ", ".join(f"{key}: {value}" for key, value in result.mle_retvals.items())) + + for line in fin: + fout.write(line) + + os.remove("weighted_betabinom_chain.tmp") + + breakpoint() # NB specific to nm (Nelder-Mead) optimization. niter = result.mle_retvals["iterations"] @@ -241,6 +288,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): return result + class Weighted_BetaBinom_mix(GenericLikelihoodModel): def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds): super(Weighted_BetaBinom_mix, self).__init__(endog, exog, **kwds) From 137e4bbc26f92d42c8fdf73b95af57dd7e96d21c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Tue, 20 Aug 2024 11:30:00 -0400 Subject: [PATCH 091/125] fix --- src/calicost/utils_distribution_fitting.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 6db5a15..be6ccc9 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -221,14 +221,14 @@ def __init__(self, endog, exog, weights, exposure, **kwds): def nloglikeobs(self, params): a = (self.exog @ params[:-1]) * params[-1] - b = (1 - self.exog @ params[:-1]) * params[-1] + b = (1. 
- self.exog @ params[:-1]) * params[-1] return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) def callback(self, params): nloglike = self.nloglikeobs(params) - print(params, nloglike) + print(params, nloglike, ";") @classmethod def get_ninstance(cls): @@ -269,17 +269,20 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): **kwds ) + ninst = Weighted_BetaBinom.get_ninstance() + + # TODO mkdir chains with open("weighted_betabinom_chain.tmp") as fin: - with open("weighted_betabinom_chain.txt", "w") as fout: - fout.write(f"# Weighted_BetaBinom {Weighted_BetaBinom.get_ninstance()} @ {time.asctime()}:\n") - fout.write(f"start_type={start_params_str}, shape={self.endog.shape[0]}" + ", ".join(f"{key}: {value}" for key, value in result.mle_retvals.items())) + with open(f"chains/weighted_betabinom_chain_{ninst}.txt", "w") as fout: + fout.write(f"# Weighted_BetaBinom {ninst} @ {time.asctime()}\n") + fout.write(f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," + ",".join(f"{key}:{value}" for key, value in result.mle_retvals.items()) + "\n") for line in fin: fout.write(line) os.remove("weighted_betabinom_chain.tmp") - breakpoint() + # breakpoint() # NB specific to nm (Nelder-Mead) optimization. niter = result.mle_retvals["iterations"] From 3d46a85b0c01a018769cf6074e455b77b28bce6f Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Tue, 20 Aug 2024 11:43:37 -0400 Subject: [PATCH 092/125] finishing touches --- src/calicost/utils_distribution_fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index be6ccc9..ea593a4 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -228,7 +228,7 @@ def nloglikeobs(self, params): def callback(self, params): nloglike = self.nloglikeobs(params) - print(params, nloglike, ";") + print(f"{params} {nloglike};") @classmethod def get_ninstance(cls): From 9ac416b7da1368eff1567d4c3289f4378fe05cc6 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 25 Aug 2024 22:28:05 -0400 Subject: [PATCH 093/125] ABC for emission models --- src/calicost/utils_distribution_fitting.py | 376 ++++++--------------- 1 file changed, 112 insertions(+), 264 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index ea593a4..6551ee1 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -18,6 +18,7 @@ from sklearn import cluster from sklearn.utils import check_random_state from statsmodels.base.model import GenericLikelihoodModel +from abc import ABC, abstractmethod logger = logging.getLogger(__name__) @@ -54,92 +55,132 @@ def save_stdout(fpath): finally: sys.stdout = original - -class Weighted_NegativeBinomial(GenericLikelihoodModel): +class WeightedModel(GenericLikelihoodModel, ABC): """ - Negative Binomial model endog ~ NB(exposure * exp(exog @ params[:-1]), params[-1]), where exog is the design matrix, and params[-1] is 1 / overdispersion. - This function fits the NB params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) - - Attributes - ---------- - endog : array, (n_samples,) - Y values. - - exog : array, (n_samples, n_features) - Design matrix. - - weights : array, (n_samples,) - Sample weights. - - exposure : array, (n_samples,) + An ABC for defined emission models. 
+ + Attributes + ---------- + endog : array, (n_samples,) + Y values. + exog : array, (n_samples, n_features) + Design matrix. + + weights : array, (n_samples,) + Sample weights. + exposure : array, (n_samples,) Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ - def __init__(self, endog, exog, weights, exposure, seed=0, **kwds): - super(Weighted_NegativeBinomial, self).__init__(endog, exog, **kwds) - - logger.info(f"Initializing Weighted_NegativeBinomial model for endog.shape = {endog.shape}.") - + def __init__(self, endog, exog, weights, exposure, tumor_prop=None, seed=0, **kwds): + super(WeightedModel, self).__init__(endog, exog, **kwds) + self.weights = weights self.exposure = exposure + + # NB Weight_BetaBinomial does not specify seed self.seed = seed + self.tumor_prop = tumor_prop - def nloglikeobs(self, params): - nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure - nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) + self.__post_init__() + + logger.info(f"Initializing {self.__class__.__name__} model for endog.shape = {endog.shape}.") - n, p = convert_params(nb_mean, nb_std) + @abstractmethod + def nloglikeobs(self, params): + pass - return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) + @abstractmethod + def get_default_start_params(self): + pass + + @abstractmethod + def get_ext_param_name(self): + pass + + @classmethod + @abstractmethod + def get_ninstance(cls): + pass + + @abstractmethod + def __post_init__(self): + # NB will increment the instance count for each derived class. + pass + + def callback(self, params): + nloglike = self.nloglikeobs(params) + print(f"{params} {nloglike};") + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): - self.exog_names.append("alpha") + ext_param_name = self.get_ext_param_name() + self.exog_names.append(ext_param_name) + if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params start_params_str = "existing" - else: - start_params = np.append(0.1 * np.ones(self.nparams), 0.01) + start_params = self.default_start_params() start_params_str = "default" else: start_params_str = "input" - logger.info(f"Starting Weighted_NegativeBinomial optimization @ ({start_params_str}) {start_params}.") - + logger.info(f"Starting {self.__class__.__name__} optimization @ ({start_params_str}) {start_params}.") + start = time.time() - # NB see https://www.statsmodels.org/dev/dev/generated/statsmodels.base.model.LikelihoodModelResults.html - result = super(Weighted_NegativeBinomial, self).fit( + result = super(Weighted_Model, self).fit( start_params=start_params, maxiter=maxiter, maxfun=maxfun, skip_hessian=True, - callback=None, + callback=self.callback, full_output=True, - retall=False, + retall=True, + disp=False, **kwds ) - # NB specific to nm (Nelder-Mead) optimization. + # NB specific to nm (Nelder-Mead) optimization. niter = result.mle_retvals["iterations"] - logger.info(f"Finished Weighted_NegativeBinomial optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + logger.info(f"Finished {self.__class__.__name__} optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") return result +class Weighted_NegativeBinomial(WeightedModel): + """ + Negative Binomial model endog ~ NB(exposure * exp(exog @ params[:-1]), params[-1]), where exog is the design matrix, and params[-1] is 1 / overdispersion. 
+ This function fits the NB params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) + """ + ninstance = 0 + + def nloglikeobs(self, params): + nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure + nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) + + n, p = convert_params(nb_mean, nb_std) -class Weighted_NegativeBinomial_mix(GenericLikelihoodModel): - def __init__(self, endog, exog, weights, exposure, tumor_prop, seed=0, **kwds): - super(Weighted_NegativeBinomial_mix, self).__init__(endog, exog, **kwds) + return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) - logger.info(f"Initializing Weighted_NegativeBinomial_mix model for endog.shape = {endog.shape}.") + def get_default_start_params(self): + return np.append(0.1 * np.ones(self.exog.shape[1]), 0.01) - self.weights = weights - self.exposure = exposure - self.seed = seed - self.tumor_prop = tumor_prop + def get_ext_param_name(): + return "alpha" + def __post_init__(self): + Weighted_NegativeBinomial.ninstance += 1 + + @classmethod + def get_ninstance(cls): + return cls.ninstance + +class Weighted_NegativeBinomial_mix(WeightedModel): + ninstance = 0 + def nloglikeobs(self, params): nb_mean = self.exposure * ( self.tumor_prop * np.exp(self.exog @ params[:-1]) + 1 - self.tumor_prop @@ -149,159 +190,38 @@ def nloglikeobs(self, params): n, p = convert_params(nb_mean, nb_std) return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) + + def get_default_start_params(self): + return np.append(0.1 * np.ones(self.nparams), 0.01) - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): - self.exog_names.append("alpha") - - if start_params is None: - if hasattr(self, "start_params"): - start_params = self.start_params - start_params_str = "existing" - else: - start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - start_params_str = "default" - else: - start_params_str = "input" - - logger.info(f"Starting Weighted_NegativeBinomial_mix optimization @ ({start_params_str}) {start_params}.") - - start = time.time() - - result = super(Weighted_NegativeBinomial_mix, self).fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=None, - full_output=True, - retall=False, - **kwds - ) - - # NB specific to nm (Nelder-Mead) optimization. - niter = result.mle_retvals["iterations"] - - logger.info(f"Finished Weighted_NegativeBinomial_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") - - return result + def get_ext_param_name(self): + return "alpha" - -class Weighted_BetaBinom(GenericLikelihoodModel): + def __post_init__(self): + assert self.tumor_prop is not None, "Tumor proportion must be defined." + +class Weighted_BetaBinom(WeightedModel): """ Beta-binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. This function fits the BetaBin params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) - - Attributes - ---------- - endog : array, (n_samples,) - Y values. - - exog : array, (n_samples, n_features) - Design matrix. - - weights : array, (n_samples,) - Sample weights. - - exposure : array, (n_samples,) - Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
""" ninstance = 0 - def __init__(self, endog, exog, weights, exposure, **kwds): - super(Weighted_BetaBinom, self).__init__(endog, exog, **kwds) - - logger.info(f"Initializing Weighted_BetaBinomial model for endog.shape = {endog.shape}.") - - self.weights = weights - self.exposure = exposure - - # NB update the instance count - Weighted_BetaBinom.ninstance += 1 - - def nloglikeobs(self, params): a = (self.exog @ params[:-1]) * params[-1] b = (1. - self.exog @ params[:-1]) * params[-1] return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) - def callback(self, params): - nloglike = self.nloglikeobs(params) - - print(f"{params} {nloglike};") - - @classmethod - def get_ninstance(cls): - return cls.ninstance - - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): - self.exog_names.append("tau") - - if start_params is None: - if hasattr(self, "start_params"): - start_params = self.start_params - start_params_str = "existing" - else: - start_params = np.append( - 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 - ) - start_params_str = "default" - else: - start_params_str = "input" - - logger.info(f"Starting Weighted_BetaBinomial optimization @ ({start_params_str}) {start_params}.") - - start = time.time() - - # NB kwds = {'xtol': 0.0001, 'ftol': 0.0001, disp: False} - kwds.pop("disp", None) - - with save_stdout("weighted_betabinom_chain.tmp"): - result = super(Weighted_BetaBinom, self).fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=self.callback, - full_output=True, - retall=True, - disp=False, - **kwds - ) - - ninst = Weighted_BetaBinom.get_ninstance() - - # TODO mkdir chains - with open("weighted_betabinom_chain.tmp") as fin: - with open(f"chains/weighted_betabinom_chain_{ninst}.txt", "w") as fout: - fout.write(f"# Weighted_BetaBinom {ninst} @ {time.asctime()}\n") - fout.write(f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," + ",".join(f"{key}:{value}" for key, value in result.mle_retvals.items()) + "\n") - - for line in fin: - fout.write(line) - - os.remove("weighted_betabinom_chain.tmp") - - # breakpoint() - - # NB specific to nm (Nelder-Mead) optimization. 
- niter = result.mle_retvals["iterations"] - - logger.info(f"Finished Weighted_BetaBinomial optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") - - return result + def get_default_start_params(self): + return np.append( + 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 + ) + def get_ext_param_name(): + return "tau" -class Weighted_BetaBinom_mix(GenericLikelihoodModel): - def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds): - super(Weighted_BetaBinom_mix, self).__init__(endog, exog, **kwds) - - logger.info(f"Initializing Weighted_BetaBinom_mix model for endog.shape = {endog.shape}.") - - self.weights = weights - self.exposure = exposure - self.tumor_prop = tumor_prop - +class Weighted_BetaBinom_mix(WeightedModel_mix): def nloglikeobs(self, params): a = ( self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop) @@ -314,44 +234,17 @@ def nloglikeobs(self, params): return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): - self.exog_names.append("tau") - - if start_params is None: - if hasattr(self, "start_params"): - start_params = self.start_params - start_params_str = "existing" - else: - start_params = np.append( - 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 - ) - start_params_str = "default" - else: - start_params_str = "input" - - logger.info(f"Starting Weighted_BetaBinom_mix optimization @ ({start_params_str}) {start_params}.") - - start = time.time() - - result = super(Weighted_BetaBinom_mix, self).fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=None, - full_output=True, - retall=False, - **kwds + def get_default_start_params(self): + return np.append( + 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) + + def get_ext_param_name(): + return "tau" - # NB specific to nm (Nelder-Mead) optimization. - niter = result.mle_retvals["iterations"] - - logger.info(f"Finished Weighted_BetaBinom_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") - - return result - - + def __post_init__(self): + assert self.tumor_prop is not None, "Tumor proportion must be defined." + class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): def __init__(self, endog, exog, tau, weights, exposure, **kwds): super(Weighted_BetaBinom_fixdispersion, self).__init__(endog, exog, **kwds) @@ -456,48 +349,3 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): logger.info(f"Finished Weighted_BetaBinom_fixdispersion_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") return result - -# DEPRECATE -class BAF_Binom(GenericLikelihoodModel): - """ - Binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. - This function fits the BetaBin params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) - - Attributes - ---------- - endog : array, (n_samples,) - Y values. - - exog : array, (n_samples, n_features) - Design matrix. - - weights : array, (n_samples,) - Sample weights. - - exposure : array, (n_samples,) - Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
- """ - def __init__(self, endog, exog, weights, exposure, offset, scaling, **kwds): - super(BAF_Binom, self).__init__(endog, exog, **kwds) - - self.weights = weights - self.exposure = exposure - self.offset = offset - self.scaling = scaling - - def nloglikeobs(self, params): - linear_term = self.exog @ params - p = self.scaling / (1 + np.exp(-linear_term + self.offset)) - - return -scipy.stats.binom.logpmf(self.endog, self.exposure, p).dot(self.weights) - - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): - if start_params is None: - if hasattr(self, "start_params"): - start_params = self.start_params - else: - start_params = 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams) - - return super(BAF_Binom, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds - ) From 2bd629dd7917b6226e6399a6df5a4b50b4b60063 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:17:44 -0400 Subject: [PATCH 094/125] abc for emission models --- src/calicost/utils_distribution_fitting.py | 262 ++++++++++----------- 1 file changed, 128 insertions(+), 134 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 6551ee1..1675f91 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -5,6 +5,7 @@ import os import sys import time +from abc import ABC, abstractmethod import numpy as np import scipy @@ -18,7 +19,6 @@ from sklearn import cluster from sklearn.utils import check_random_state from statsmodels.base.model import GenericLikelihoodModel -from abc import ABC, abstractmethod logger = logging.getLogger(__name__) @@ -30,6 +30,7 @@ os.environ["OPENBLAS_NUM_THREADS"] = num_threads os.environ["OMP_NUM_THREADS"] = num_threads + def convert_params(mean, std): """ Convert mean/dispersion parameterization of a negative binomial to the ones scipy supports @@ -45,46 +46,49 @@ def convert_params(mean, std): @contextlib.contextmanager def save_stdout(fpath): original = sys.stdout - + with open(fpath, "w") as ff: sys.stdout = ff + try: yield - # NB teardown finally: sys.stdout = original + class WeightedModel(GenericLikelihoodModel, ABC): """ An ABC for defined emission models. - Attributes - ---------- - endog : array, (n_samples,) - Y values. - exog : array, (n_samples, n_features) - Design matrix. - - weights : array, (n_samples,) - Sample weights. - exposure : array, (n_samples,) + Attributes ---------- + endog : array, (n_samples,) Y values. + exog : array, (n_samples, n_features) + Design matrix. + weights : array, (n_samples,) + Sample weights. + exposure : array, (n_samples,) Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ - def __init__(self, endog, exog, weights, exposure, tumor_prop=None, seed=0, **kwds): - super(WeightedModel, self).__init__(endog, exog, **kwds) - + + def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): + super().__init__(endog, exog, **kwargs) + + # NB unpack a single additional positional argument as tumor_proportion. 
+ self.tumor_prop = args if len(args) == 1 else None + self.weights = weights self.exposure = exposure # NB Weight_BetaBinomial does not specify seed self.seed = seed - self.tumor_prop = tumor_prop self.__post_init__() - - logger.info(f"Initializing {self.__class__.__name__} model for endog.shape = {endog.shape}.") - + + logger.info( + f"Initializing {self.__class__.__name__} model for endog.shape = {endog.shape}." + ) + @abstractmethod def nloglikeobs(self, params): pass @@ -97,26 +101,23 @@ def get_default_start_params(self): def get_ext_param_name(self): pass - @classmethod @abstractmethod - def get_ninstance(cls): + def get_ninstance(self): pass - + @abstractmethod def __post_init__(self): # NB will increment the instance count for each derived class. pass - - def callback(self, params): - nloglike = self.nloglikeobs(params) - print(f"{params} {nloglike};") - - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): + def __callback__(self, params): + print(f"{params} {self.nloglikeobs(params)};") + + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): ext_param_name = self.get_ext_param_name() self.exog_names.append(ext_param_name) - + if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params @@ -127,40 +128,49 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): else: start_params_str = "input" - logger.info(f"Starting {self.__class__.__name__} optimization @ ({start_params_str}) {start_params}.") - + logger.info( + f"Starting {self.__class__.__name__} optimization @ ({start_params_str}) {start_params}." + ) + start = time.time() - result = super(Weighted_Model, self).fit( + result = super().fit( start_params=start_params, maxiter=maxiter, maxfun=maxfun, skip_hessian=True, - callback=self.callback, + callback=self.__callback__, full_output=True, retall=True, disp=False, - **kwds + **kwargs, ) - # NB specific to nm (Nelder-Mead) optimization. + # NB specific to nm (Nelder-Mead) optimization. niter = result.mle_retvals["iterations"] - logger.info(f"Finished {self.__class__.__name__} optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + logger.info( + f"Finished {self.__class__.__name__} optimization in {time.time() - start:.2f} seconds, with {niter} iterations." + ) return result + class Weighted_NegativeBinomial(WeightedModel): """ - Negative Binomial model endog ~ NB(exposure * exp(exog @ params[:-1]), params[-1]), where exog is the design matrix, and params[-1] is 1 / overdispersion. - This function fits the NB params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) + Negative Binomial model endog ~ NB(exposure * exp(exog @ params[:-1]), params[-1]), + where exog is the design matrix, and params[-1] is 1 / overdispersion. 
This function + fits the NB params when samples are weighted by weights: + + max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ + ninstance = 0 - + def nloglikeobs(self, params): nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) - + n, p = convert_params(nb_mean, nb_std) return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) @@ -172,25 +182,27 @@ def get_ext_param_name(): return "alpha" def __post_init__(self): - Weighted_NegativeBinomial.ninstance += 1 - + pass + @classmethod def get_ninstance(cls): return cls.ninstance - + + class Weighted_NegativeBinomial_mix(WeightedModel): ninstance = 0 - + def nloglikeobs(self, params): nb_mean = self.exposure * ( self.tumor_prop * np.exp(self.exog @ params[:-1]) + 1 - self.tumor_prop ) + nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) - + def get_default_start_params(self): return np.append(0.1 * np.ones(self.nparams), 0.01) @@ -199,29 +211,37 @@ def get_ext_param_name(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." - + + class Weighted_BetaBinom(WeightedModel): """ - Beta-binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. - This function fits the BetaBin params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) + Beta-binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), + where p = exog @ params[:-1] and tau = params[-1]. This function fits the + BetaBin params when samples are weighted by weights: + + max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ + ninstance = 0 - + def nloglikeobs(self, params): a = (self.exog @ params[:-1]) * params[-1] - b = (1. - self.exog @ params[:-1]) * params[-1] + b = (1.0 - self.exog @ params[:-1]) * params[-1] - return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot( + self.weights + ) def get_default_start_params(self): - return np.append( - 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 - ) + return np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) def get_ext_param_name(): return "tau" - + + class Weighted_BetaBinom_mix(WeightedModel_mix): + ninstance = 0 + def nloglikeobs(self, params): a = ( self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop) @@ -232,79 +252,78 @@ def nloglikeobs(self, params): + 0.5 * (1 - self.tumor_prop) ) * params[-1] - return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot( + self.weights + ) def get_default_start_params(self): - return np.append( - 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 - ) - + return np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) + def get_ext_param_name(): return "tau" - def __post_init__(self): + def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." 
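A worked illustration of the mixture adjustment in Weighted_BetaBinom_mix.nloglikeobs above: the clone BAF p = exog @ params[:-1] is pulled toward 0.5 in proportion to the normal fraction (1 - tumor_prop), then converted to Beta-Binomial (a, b) with dispersion tau = params[-1]; writing b = (1 - p_eff) * tau is algebraically the same as the expression in the method. All numbers below are made up for illustration.

    import numpy as np
    import scipy.stats

    # toy pseudobulk: endog = B-allele counts, exposure = SNP-covering UMIs per bin
    endog = np.array([12, 30, 25, 8])
    exposure = np.array([40, 60, 50, 30])
    weights = np.ones(4)
    tumor_prop = np.full(4, 0.7)
    exog = np.ones((4, 1))           # single-state design matrix
    params = np.array([0.2, 30.0])   # [p_binom, tau]

    # effective BAF mixes the clone BAF with the 0.5 contributed by normal cells
    p_eff = (exog @ params[:-1]) * tumor_prop + 0.5 * (1.0 - tumor_prop)
    a = p_eff * params[-1]
    b = (1.0 - p_eff) * params[-1]

    nll = -scipy.stats.betabinom.logpmf(endog, exposure, a, b).dot(weights)

At tumor_prop = 1 this reduces to the pure Weighted_BetaBinom emission, and at tumor_prop = 0 every bin is modelled as balanced at BAF 0.5.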
- -class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): - def __init__(self, endog, exog, tau, weights, exposure, **kwds): - super(Weighted_BetaBinom_fixdispersion, self).__init__(endog, exog, **kwds) - logger.info(f"Initializing Weighted_BetaBinom_fixdispersion model for endog.shape = {endog.shape}.") + +class Weighted_BetaBinom_fixdispersion(WeightedModel): + ninstance = 0 + + # NB custom __init__ required to handle tau. + def __init__(self, endog, exog, tau, weights, exposure, *args, seed=0, **kwargs): + super().__init__(endog, exog, **kwargs) + + # NB unpack a single additional positional argument as tumor_proportion. + self.tumor_prop = args if len(args) == 1 else None self.tau = tau self.weights = weights self.exposure = exposure - def nloglikeobs(self, params): - a = (self.exog @ params) * self.tau - b = (1 - self.exog @ params) * self.tau + # NB Weighted_BetaBinom_fixdispersion does not specify seed previously. + self.seed = seed - return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + self.__post_init__() - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): - if start_params is None: - if hasattr(self, "start_params"): - start_params = self.start_params - start_params_str = "existing" - else: - start_params = 0.1 * np.ones(self.nparams) - start_params_str = "default" - else: - start_params_str = "input" - - logger.info(f"Starting Weighted_BetaBinom_fixdispersion optimization @ ({start_params_str}) {start_params}.") + logger.info( + f"Initializing {self.__class__.__name__} model for endog.shape = {endog.shape}." + ) - start = time.time() + def nloglikeobs(self, params): + a = (self.exog @ params) * self.tau + b = (1 - self.exog @ params) * self.tau - result = super(Weighted_BetaBinom_fixdispersion, self).fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=None, - full_output=True, - retall=False, - **kwds + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot( + self.weights ) - # NB specific to nm (Nelder-Mead) optimization. - niter = result.mle_retvals["iterations"] - - logger.info(f"Finished Weighted_BetaBinom_fixdispersion optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + def get_default_start_params(self): + return 0.1 * np.ones(self.nparams) - return result + def __post_init__(self): + pass -class Weighted_BetaBinom_fixdispersion_mix(GenericLikelihoodModel): - def __init__(self, endog, exog, tau, weights, exposure, tumor_prop, **kwds): - super(Weighted_BetaBinom_fixdispersion_mix, self).__init__(endog, exog, **kwds) +class Weighted_BetaBinom_fixdispersion_mix(WeightedModel): + # NB custom __init__ required to handle tau. + def __init__(self, endog, exog, tau, weights, exposure, *args, seed=0, **kwargs): + super().__init__(endog, exog, **kwargs) - logger.info(f"Initializing Weighted_BetaBinom_fixdispersion_mix model for endog.shape = {endog.shape}.") + # NB unpack a single additional positional argument as tumor_proportion. + self.tumor_prop = args if len(args) == 1 else None self.tau = tau self.weights = weights self.exposure = exposure - self.tumor_prop = tumor_prop + + # NB Weighted_BetaBinom_fixdispersion does not specify seed previously. + self.seed = seed + + self.__post_init__() + + logger.info( + f"Initializing {self.__class__.__name__} model for endog.shape = {endog.shape}." 
+ ) def nloglikeobs(self, params): a = ( @@ -315,37 +334,12 @@ def nloglikeobs(self, params): (1 - self.exog @ params) * self.tumor_prop + 0.5 * (1 - self.tumor_prop) ) * self.tau - return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) - - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): - if start_params is None: - if hasattr(self, "start_params"): - start_params = self.start_params - start_params_str = "existing" - else: - start_params = 0.1 * np.ones(self.nparams) - start_params_str = "default" - else: - start_params_str = "input" - - logger.info(f"Starting Weighted_BetaBinom_fixdispersion_mix optimization @ ({start_params_str}) {start_params}.") - - start = time.time() - - result = super(Weighted_BetaBinom_fixdispersion_mix, self).fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=None, - full_output=True, - retall=False, - **kwds + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot( + self.weights ) - # NB specific to nm (Nelder-Mead) optimization. - niter = result.mle_retvals["iterations"] - - logger.info(f"Finished Weighted_BetaBinom_fixdispersion_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + def get_default_start_params(self): + return 0.1 * np.ones(self.nparams) - return result + def __post_init__(self): + assert self.tumor_prop is not None, "Tumor proportion must be defined." From d4a059968dbe96e5bcf9d430515585a7b612aca6 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:19:33 -0400 Subject: [PATCH 095/125] fix --- src/calicost/utils_distribution_fitting.py | 23 +++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 1675f91..9885dbe 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -164,7 +164,6 @@ class Weighted_NegativeBinomial(WeightedModel): max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ - ninstance = 0 def nloglikeobs(self, params): @@ -212,6 +211,10 @@ def get_ext_param_name(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." + @classmethod + def get_ninstance(cls): + return cls.ninstance + class Weighted_BetaBinom(WeightedModel): """ @@ -238,6 +241,13 @@ def get_default_start_params(self): def get_ext_param_name(): return "tau" + def __post_init__(self): + pass + + @classmethod + def get_ninstance(cls): + return cls.ninstance + class Weighted_BetaBinom_mix(WeightedModel_mix): ninstance = 0 @@ -265,6 +275,9 @@ def get_ext_param_name(): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." + @classmethod + def get_ninstance(cls): + return cls.ninstance class Weighted_BetaBinom_fixdispersion(WeightedModel): ninstance = 0 @@ -303,6 +316,10 @@ def get_default_start_params(self): def __post_init__(self): pass + @classmethod + def get_ninstance(cls): + return cls.ninstance + class Weighted_BetaBinom_fixdispersion_mix(WeightedModel): # NB custom __init__ required to handle tau. @@ -343,3 +360,7 @@ def get_default_start_params(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." + + @classmethod + def get_ninstance(cls): + return cls.ninstance From 431936c3798f3dcf570807678ce8d7814ad2c0da Mon Sep 17 00:00:00 2001 From: "Michael J. 
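Weighted_NegativeBinomial, shown in context here, parameterises the negative binomial by its mean (exposure * exp(exog @ params[:-1])) and a dispersion parameter params[-1] that enters the variance as mean + params[-1] * mean**2, then hands scipy an (n, p) pair via convert_params. The body of convert_params is not included in this hunk; the helper below uses the standard moment relations p = mean / var and n = mean * p / (1 - p), a plausible equivalent rather than the package's exact code.

    import numpy as np
    import scipy.stats

    def convert_params(mean, std):
        # Moment conversion to scipy's nbinom parameterisation (assumed form).
        var = std**2
        p = mean / var
        n = mean * p / (1.0 - p)
        return n, p

    # toy values: one bin with an exposure-scaled mean and overdispersion alpha
    exposure = np.array([1000.0])
    log_mu = np.array([0.1])
    alpha = 0.01

    nb_mean = np.exp(log_mu) * exposure
    nb_std = np.sqrt(nb_mean + alpha * nb_mean**2)

    n, p = convert_params(nb_mean, nb_std)
    loglik = scipy.stats.nbinom.logpmf(np.array([1100]), n, p)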
Wilson" Date: Mon, 26 Aug 2024 10:22:47 -0400 Subject: [PATCH 096/125] fixes --- src/calicost/utils_distribution_fitting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 9885dbe..ccd6828 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -213,7 +213,7 @@ def __post_init__(self): @classmethod def get_ninstance(cls): - return cls.ninstance + return cls.ninstance class Weighted_BetaBinom(WeightedModel): @@ -246,10 +246,10 @@ def __post_init__(self): @classmethod def get_ninstance(cls): - return cls.ninstance + return cls.ninstance -class Weighted_BetaBinom_mix(WeightedModel_mix): +class Weighted_BetaBinom_mix(WeightedModel): ninstance = 0 def nloglikeobs(self, params): From bed12f19ca1beb1f2e0cc06f60a8abfeddb31b6e Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:29:41 -0400 Subject: [PATCH 097/125] fix --- src/calicost/utils_distribution_fitting.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index ccd6828..0e98b66 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -110,6 +110,11 @@ def __post_init__(self): # NB will increment the instance count for each derived class. pass + @classmethod + @abstractmethod + def get_ninstance(cls): + pass + def __callback__(self, params): print(f"{params} {self.nloglikeobs(params)};") @@ -123,7 +128,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): start_params = self.start_params start_params_str = "existing" else: - start_params = self.default_start_params() + start_params = self.get_default_start_params() start_params_str = "default" else: start_params_str = "input" @@ -238,7 +243,7 @@ def nloglikeobs(self, params): def get_default_start_params(self): return np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) - def get_ext_param_name(): + def get_ext_param_name(self): return "tau" def __post_init__(self): From ef27f84e979b1966809e621d0aa02c9fc38121cc Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:39:52 -0400 Subject: [PATCH 098/125] fix chain logging --- src/calicost/utils_distribution_fitting.py | 87 ++++++++++++++-------- 1 file changed, 54 insertions(+), 33 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 0e98b66..f65b84a 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -110,9 +110,8 @@ def __post_init__(self): # NB will increment the instance count for each derived class. 
pass - @classmethod @abstractmethod - def get_ninstance(cls): + def get_ninstance(self): pass def __callback__(self, params): @@ -139,17 +138,43 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): start = time.time() - result = super().fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=self.__callback__, - full_output=True, - retall=True, - disp=False, - **kwargs, - ) + # NB kwargs = {'xtol': 0.0001, 'ftol': 0.0001, disp: False} + kwargs.pop("disp", None) + + tmp_path = f"{self.__class__.__name__.lower()}_chain.tmp" + + # TODO mkdir chains + ninst = self.get_ninstance() + final_path = f"chains/{self.__class__.__name__.lower()}_chain_{ninst}.txt" + + with save_stdout(tmp_path): + result = super().fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + skip_hessian=True, + callback=self.__callback__, + full_output=True, + retall=True, + disp=False, + **kwargs, + ) + + with open(tmp_path) as fin: + with open(final_path, "w") as fout: + fout.write(f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n") + fout.write( + f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," + + ",".join( + f"{key}:{value}" for key, value in result.mle_retvals.items() + ) + + "\n" + ) + + for line in fin: + fout.write(line) + + os.remove(tmp_path) # NB specific to nm (Nelder-Mead) optimization. niter = result.mle_retvals["iterations"] @@ -169,6 +194,7 @@ class Weighted_NegativeBinomial(WeightedModel): max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ + ninstance = 0 def nloglikeobs(self, params): @@ -188,9 +214,8 @@ def get_ext_param_name(): def __post_init__(self): pass - @classmethod - def get_ninstance(cls): - return cls.ninstance + def get_ninstance(self): + return self.ninstance class Weighted_NegativeBinomial_mix(WeightedModel): @@ -216,10 +241,9 @@ def get_ext_param_name(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." - @classmethod - def get_ninstance(cls): - return cls.ninstance - + def get_ninstance(self): + return self.ninstance + class Weighted_BetaBinom(WeightedModel): """ @@ -248,10 +272,9 @@ def get_ext_param_name(self): def __post_init__(self): pass - - @classmethod - def get_ninstance(cls): - return cls.ninstance + + def get_ninstance(self): + return self.ninstance class Weighted_BetaBinom_mix(WeightedModel): @@ -280,9 +303,9 @@ def get_ext_param_name(): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." - @classmethod - def get_ninstance(cls): - return cls.ninstance + def get_ninstance(self): + return self.ninstance + class Weighted_BetaBinom_fixdispersion(WeightedModel): ninstance = 0 @@ -321,9 +344,8 @@ def get_default_start_params(self): def __post_init__(self): pass - @classmethod - def get_ninstance(cls): - return cls.ninstance + def get_ninstance(self): + return self.ninstance class Weighted_BetaBinom_fixdispersion_mix(WeightedModel): @@ -366,6 +388,5 @@ def get_default_start_params(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." - @classmethod - def get_ninstance(cls): - return cls.ninstance + def get_ninstance(self): + return self.ninstance From 22bceb7006eb05d27d422f3853b1038277c26515 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 10:42:34 -0400 Subject: [PATCH 099/125] fix --- src/calicost/utils_distribution_fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index f65b84a..f3ea3e7 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -86,7 +86,7 @@ def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): self.__post_init__() logger.info( - f"Initializing {self.__class__.__name__} model for endog.shape = {endog.shape}." + f"Initializing {self.get_ninstance()}th instance of {self.__class__.__name__} model for endog.shape = {endog.shape}." ) @abstractmethod From 5787061df5120e7e08dcb996ca2853ce390ac806 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:48:43 -0400 Subject: [PATCH 100/125] gzip chains --- src/calicost/utils_distribution_fitting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index f3ea3e7..1a5c5a8 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -1,5 +1,6 @@ import contextlib import functools +import gzip import inspect import logging import os @@ -161,7 +162,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): ) with open(tmp_path) as fin: - with open(final_path, "w") as fout: + with gzip.open(final_path, "wt") as fout: fout.write(f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n") fout.write( f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," From 16f10e1189afe7c6795336bf59a86460ea02638e Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:52:00 -0400 Subject: [PATCH 101/125] fix --- src/calicost/utils_distribution_fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 1a5c5a8..25e9186 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -146,7 +146,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): # TODO mkdir chains ninst = self.get_ninstance() - final_path = f"chains/{self.__class__.__name__.lower()}_chain_{ninst}.txt" + final_path = f"chains/{self.__class__.__name__.lower()}_chain_{ninst}.txt.gzip" with save_stdout(tmp_path): result = super().fit( From 0424169b7abed422eb8b7501dfcb0ec65cddf6dd Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 10:53:06 -0400 Subject: [PATCH 102/125] fix --- src/calicost/utils_distribution_fitting.py | 25 +++++++++++----------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 25e9186..58eb357 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -118,7 +118,7 @@ def get_ninstance(self): def __callback__(self, params): print(f"{params} {self.nloglikeobs(params)};") - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, **kwargs): ext_param_name = self.get_ext_param_name() self.exog_names.append(ext_param_name) @@ -161,19 +161,20 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): **kwargs, ) - with open(tmp_path) as fin: - with gzip.open(final_path, "wt") as fout: - fout.write(f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n") - fout.write( - f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," - + ",".join( - f"{key}:{value}" for key, value in result.mle_retvals.items() + if write_chain: + with open(tmp_path) as fin: + with gzip.open(final_path, "wt") as fout: + fout.write(f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n") + fout.write( + f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," + + ",".join( + f"{key}:{value}" for key, value in result.mle_retvals.items() + ) + + "\n" ) - + "\n" - ) - for line in fin: - fout.write(line) + for line in fin: + fout.write(line) os.remove(tmp_path) From 207e873a745c46ff55a012d1bf066d2e3011419b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:55:49 -0400 Subject: [PATCH 103/125] update instance counts. --- src/calicost/utils_distribution_fitting.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 58eb357..8fc9cc8 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -214,7 +214,7 @@ def get_ext_param_name(): return "alpha" def __post_init__(self): - pass + Weighted_NegativeBinomial.ninstance += 1 def get_ninstance(self): return self.ninstance @@ -243,6 +243,8 @@ def get_ext_param_name(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." + Weighted_NegativeBinomial_mix.ninstance + def get_ninstance(self): return self.ninstance @@ -255,7 +257,6 @@ class Weighted_BetaBinom(WeightedModel): max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ - ninstance = 0 def nloglikeobs(self, params): @@ -273,7 +274,7 @@ def get_ext_param_name(self): return "tau" def __post_init__(self): - pass + Weighted_BetaBinom.ninstance += 1 def get_ninstance(self): return self.ninstance @@ -305,6 +306,8 @@ def get_ext_param_name(): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." 
+ Weighted_BetaBinom_mix.ninstance += 1 + def get_ninstance(self): return self.ninstance @@ -344,7 +347,7 @@ def get_default_start_params(self): return 0.1 * np.ones(self.nparams) def __post_init__(self): - pass + Weighted_BetaBinom_fixdispersion.ninstance += 1 def get_ninstance(self): return self.ninstance @@ -390,5 +393,7 @@ def get_default_start_params(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." + Weighted_BetaBinom_fixdispersion_mix.ninstance += 1 + def get_ninstance(self): return self.ninstance From f945854396c54e52f4ff956733df49e7fc1a36d6 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:02:05 -0400 Subject: [PATCH 104/125] fix --- src/calicost/utils_distribution_fitting.py | 29 ++++++++++++++++------ 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 8fc9cc8..4c9f548 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -62,7 +62,8 @@ class WeightedModel(GenericLikelihoodModel, ABC): """ An ABC for defined emission models. - Attributes ---------- + Attributes + ---------- endog : array, (n_samples,) Y values. exog : array, (n_samples, n_features) Design matrix. @@ -71,7 +72,6 @@ class WeightedModel(GenericLikelihoodModel, ABC): exposure : array, (n_samples,) Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ - def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): super().__init__(endog, exog, **kwargs) @@ -84,6 +84,7 @@ def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): # NB Weight_BetaBinomial does not specify seed self.seed = seed + # NB __pos_init__ validates the expected tumor proportion and handles incrementing instance count. self.__post_init__() logger.info( @@ -92,6 +93,9 @@ def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): @abstractmethod def nloglikeobs(self, params): + """ + Negative log-likelihood for the emission model. + """ pass @abstractmethod @@ -100,22 +104,31 @@ def get_default_start_params(self): @abstractmethod def get_ext_param_name(self): + """ + Named parameter in the model. + """ pass @abstractmethod def get_ninstance(self): + """ + Return the instance count for the given model + """ pass @abstractmethod def __post_init__(self): - # NB will increment the instance count for each derived class. - pass - - @abstractmethod - def get_ninstance(self): + """ + Validation and customisation for the derived class. + E.g. validate the tumor_proportion and increment the instance + count of the derived class. + """ pass def __callback__(self, params): + """ + Define callback for writing parameter chain to file. + """ print(f"{params} {self.nloglikeobs(params)};") def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, **kwargs): @@ -393,7 +406,7 @@ def get_default_start_params(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." - Weighted_BetaBinom_fixdispersion_mix.ninstance += 1 + Weighted_BetaBinom_fixdispersion_mix.ninstance += 1 def get_ninstance(self): return self.ninstance From 16132593e973ff8c78feb87a7e4049c77aa17436 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 11:06:34 -0400 Subject: [PATCH 105/125] fix --- src/calicost/utils_distribution_fitting.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 4c9f548..2e21437 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -174,12 +174,20 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, **kwargs, ) + # NB specific to nm (Nelder-Mead) optimization. + niter = result.mle_retvals["iterations"] + runtime = time.time() - start + + logger.info( + f"Finished {self.__class__.__name__} optimization in {runtime:.2f} seconds, with {niter} iterations." + ) + if write_chain: with open(tmp_path) as fin: with gzip.open(final_path, "wt") as fout: fout.write(f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n") fout.write( - f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," + f"# start_type:{start_params_str},runtime:{runtime},shape:{self.endog.shape[0]}," + ",".join( f"{key}:{value}" for key, value in result.mle_retvals.items() ) @@ -190,14 +198,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, fout.write(line) os.remove(tmp_path) - - # NB specific to nm (Nelder-Mead) optimization. - niter = result.mle_retvals["iterations"] - - logger.info( - f"Finished {self.__class__.__name__} optimization in {time.time() - start:.2f} seconds, with {niter} iterations." - ) - + return result From 2218ebf33227b9ccf09b1de5a8e9c9121099419b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:11:35 -0400 Subject: [PATCH 106/125] fix --- src/calicost/utils_distribution_fitting.py | 75 +++++++++++----------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 2e21437..9998f8f 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -46,6 +46,9 @@ def convert_params(mean, std): @contextlib.contextmanager def save_stdout(fpath): + """ + Context manager to write stdout to fpath. + """ original = sys.stdout with open(fpath, "w") as ff: @@ -62,7 +65,7 @@ class WeightedModel(GenericLikelihoodModel, ABC): """ An ABC for defined emission models. - Attributes + Attributes ---------- endog : array, (n_samples,) Y values. exog : array, (n_samples, n_features) @@ -72,6 +75,7 @@ class WeightedModel(GenericLikelihoodModel, ABC): exposure : array, (n_samples,) Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ + def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): super().__init__(endog, exog, **kwargs) @@ -84,7 +88,7 @@ def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): # NB Weight_BetaBinomial does not specify seed self.seed = seed - # NB __pos_init__ validates the expected tumor proportion and handles incrementing instance count. + # NB __post_init__ validates the expected tumor proportion and handles incrementing instance count. 
self.__post_init__() logger.info( @@ -109,13 +113,6 @@ def get_ext_param_name(self): """ pass - @abstractmethod - def get_ninstance(self): - """ - Return the instance count for the given model - """ - pass - @abstractmethod def __post_init__(self): """ @@ -125,13 +122,26 @@ def __post_init__(self): """ pass + def get_ninstance(self): + """ + Return the instance count for the given model + """ + return self.ninstance + def __callback__(self, params): """ Define callback for writing parameter chain to file. """ print(f"{params} {self.nloglikeobs(params)};") - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, **kwargs): + def fit( + self, + start_params=None, + maxiter=10_000, + maxfun=5_000, + write_chain=True, + **kwargs, + ): ext_param_name = self.get_ext_param_name() self.exog_names.append(ext_param_name) @@ -174,22 +184,25 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, **kwargs, ) - # NB specific to nm (Nelder-Mead) optimization. + # NB specific to nm (Nelder-Mead) optimization. niter = result.mle_retvals["iterations"] runtime = time.time() - start - + logger.info( f"Finished {self.__class__.__name__} optimization in {runtime:.2f} seconds, with {niter} iterations." ) - + if write_chain: with open(tmp_path) as fin: with gzip.open(final_path, "wt") as fout: - fout.write(f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n") + fout.write( + f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n" + ) fout.write( f"# start_type:{start_params_str},runtime:{runtime},shape:{self.endog.shape[0]}," + ",".join( - f"{key}:{value}" for key, value in result.mle_retvals.items() + f"{key}:{value}" + for key, value in result.mle_retvals.items() ) + "\n" ) @@ -198,7 +211,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, fout.write(line) os.remove(tmp_path) - + return result @@ -210,7 +223,6 @@ class Weighted_NegativeBinomial(WeightedModel): max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ - ninstance = 0 def nloglikeobs(self, params): @@ -228,11 +240,10 @@ def get_ext_param_name(): return "alpha" def __post_init__(self): + assert self.tumor_prop is None + Weighted_NegativeBinomial.ninstance += 1 - def get_ninstance(self): - return self.ninstance - class Weighted_NegativeBinomial_mix(WeightedModel): ninstance = 0 @@ -258,9 +269,6 @@ def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." Weighted_NegativeBinomial_mix.ninstance - - def get_ninstance(self): - return self.ninstance class Weighted_BetaBinom(WeightedModel): @@ -271,6 +279,7 @@ class Weighted_BetaBinom(WeightedModel): max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ + ninstance = 0 def nloglikeobs(self, params): @@ -288,11 +297,10 @@ def get_ext_param_name(self): return "tau" def __post_init__(self): + assert self.tumor_prop is None + Weighted_BetaBinom.ninstance += 1 - def get_ninstance(self): - return self.ninstance - class Weighted_BetaBinom_mix(WeightedModel): ninstance = 0 @@ -320,10 +328,7 @@ def get_ext_param_name(): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." 
- Weighted_BetaBinom_mix.ninstance += 1 - - def get_ninstance(self): - return self.ninstance + Weighted_BetaBinom_mix.ninstance += 1 class Weighted_BetaBinom_fixdispersion(WeightedModel): @@ -361,10 +366,9 @@ def get_default_start_params(self): return 0.1 * np.ones(self.nparams) def __post_init__(self): - Weighted_BetaBinom_fixdispersion.ninstance += 1 - - def get_ninstance(self): - return self.ninstance + assert self.tumor_prop is None + + Weighted_BetaBinom_fixdispersion.ninstance += 1 class Weighted_BetaBinom_fixdispersion_mix(WeightedModel): @@ -408,6 +412,3 @@ def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." Weighted_BetaBinom_fixdispersion_mix.ninstance += 1 - - def get_ninstance(self): - return self.ninstance From 37eee031d2e2eb4a58161385ee91fd00142de77b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:13:06 -0400 Subject: [PATCH 107/125] fix --- src/calicost/utils_distribution_fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 9998f8f..0345e4a 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -199,7 +199,7 @@ def fit( f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n" ) fout.write( - f"# start_type:{start_params_str},runtime:{runtime},shape:{self.endog.shape[0]}," + f"# start_type:{start_params_str},runtime:{runtime:.6f},shape:{self.endog.shape[0]}," + ",".join( f"{key}:{value}" for key, value in result.mle_retvals.items() From 9591ccc720a1bc307a81a61b4ab8fcdc15a1d231 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:30:45 -0400 Subject: [PATCH 108/125] cleanup before adding ARI for HMM states. --- src/calicost/hmm_NB_BB_nophasing_v2.py | 53 ++++++++++++++++---------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index eb6bfd5..7f5c62a 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -78,6 +78,7 @@ def compute_emission_probability_nb_betabinom( n_comp = X.shape[1] n_spots = X.shape[2] n_states = log_mu.shape[0] + # initialize log_emission log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) log_emission_baf = np.zeros((n_states, n_obs, n_spots)) @@ -335,48 +336,53 @@ def run_baum_welch_nb_bb( log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. alpha: size of n_states. Dispersioon parameter of each HMM state. 
""" - n_obs = X.shape[0] - n_comp = X.shape[1] - n_spots = X.shape[2] - + n_obs, n_comp, n_spots = X.shape + assert n_comp == 2 - + log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu ) - + p_binom = ( np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom ) - + # NB initialize (inverse of) dispersion param in NB and BetaBinom alphas = ( 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas ) - + taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus - use_defaults = (init_log_mu is None) and (init_p_binom is None) and (init_alphas is None) and (init_taus is None) + use_defaults = ( + (init_log_mu is None) + and (init_p_binom is None) + and (init_alphas is None) + and (init_taus is None) + ) logger.info(f"Initial alphas:\n{alphas}") logger.info(f"Initial taus:\n{taus}") - - logger.info(f"Initialized Baum Welch NB logmean shift, BetaBinom prob and dispersion params inverse (use_defaults = {use_defaults}).") - + + logger.info( + f"Initialized Baum Welch NB logmean shift, BetaBinom prob and dispersion params inverse (use_defaults = {use_defaults})." + ) + # NB initialize start probability and emission probability log_startprob = np.log(np.ones(n_states) / n_states) - + if n_states > 1: transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: log_transmat = np.zeros((1, 1)) - + log_gamma = kwargs["log_gamma"] if "log_gamma" in kwargs else None # NB a trick to speed up BetaBinom optimization: taking only unique @@ -398,11 +404,11 @@ def run_baum_welch_nb_bb( X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus ) ) - log_emission = log_emission_rdr + log_emission_baf else: # compute mu as adjusted RDR if ((not log_gamma is None) or (r > 0)) and ("m" in self.params): logmu_shift = [] + for c in range(len(kwargs["sample_length"])): this_pred_cnv = ( np.argmax( @@ -451,7 +457,8 @@ def run_baum_welch_nb_bb( tumor_prop, ) ) - log_emission = log_emission_rdr + log_emission_baf + + log_emission = log_emission_rdr + log_emission_baf log_alpha = hmm_nophasing_v2.forward_lattice( lengths, @@ -482,10 +489,12 @@ def run_baum_welch_nb_bb( new_log_startprob = new_log_startprob.flatten() else: new_log_startprob = log_startprob + if "t" in self.params: new_log_transmat = update_transition_nophasing(log_xi, is_diag=is_diag) else: new_log_transmat = log_transmat + if "m" in self.params: if tumor_prop is None: new_log_mu, new_alphas = ( @@ -515,6 +524,7 @@ def run_baum_welch_nb_bb( else: new_log_mu = log_mu new_alphas = alphas + if "p" in self.params: if tumor_prop is None: new_p_binom, new_taus = ( @@ -532,6 +542,7 @@ def run_baum_welch_nb_bb( # compute mu as adjusted RDR if "m" in self.params: mu = [] + for c in range(len(kwargs["sample_length"])): this_pred_cnv = ( np.argmax( @@ -560,6 +571,7 @@ def run_baum_welch_nb_bb( ) else: weighted_tp = tumor_prop + new_p_binom, new_taus = ( update_emission_params_bb_nophasing_uniqvalues_mix( unique_values_bb, @@ -577,16 +589,17 @@ def run_baum_welch_nb_bb( new_taus = taus logger.info( - f"EM convergence metrics (v2): {np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob)))}, {np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat)))}, {np.mean(np.abs(new_log_mu - log_mu))}, {np.mean(np.abs(new_p_binom - p_binom))}" + f"EM convergence metrics (v2): 
startprob={np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob)))}, transmat={np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat)))}," + +"mu={np.mean(np.abs(new_log_mu - log_mu))}, pbinom={np.mean(np.abs(new_p_binom - p_binom))}" ) - + if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol ): break - + log_startprob = new_log_startprob log_transmat = new_log_transmat log_mu = new_log_mu @@ -598,7 +611,7 @@ def run_baum_welch_nb_bb( logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") - + return ( new_log_mu, new_alphas, From af2fa1ae5b76b7df1fc0a48b548581080b57ea0b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:32:26 -0400 Subject: [PATCH 109/125] precision on likelihood chain --- src/calicost/utils_distribution_fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 0345e4a..09e5266 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -132,7 +132,7 @@ def __callback__(self, params): """ Define callback for writing parameter chain to file. """ - print(f"{params} {self.nloglikeobs(params)};") + print(f"{params} {self.nloglikeobs(params):.6f};") def fit( self, From 8416be5afa0b8014866c62750b972d3b6ac8c06c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:38:13 -0400 Subject: [PATCH 110/125] fix --- src/calicost/hmm_NB_BB_nophasing_v2.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 7f5c62a..71af39d 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -13,6 +13,7 @@ from calicost.utils_distribution_fitting import * from calicost.utils_hmm import * import networkx as nx +from sklearn.metrics import adjusted_rand_score logger = logging.getLogger(__name__) @@ -383,6 +384,7 @@ def run_baum_welch_nb_bb( else: log_transmat = np.zeros((1, 1)) + # NB gamma[i,t] = P(q_t = i | O, lambda), n_states * n_observations; log_gamma = kwargs["log_gamma"] if "log_gamma" in kwargs else None # NB a trick to speed up BetaBinom optimization: taking only unique @@ -476,12 +478,21 @@ def run_baum_welch_nb_bb( log_sitewise_transmat, ) - log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_nophasing( log_alpha, log_beta, log_transmat, log_emission ) + + log_gamma = compute_posterior_obs(log_alpha, log_beta) + + pred_states = np.argmax(log_gamma, axis=0) + + if last_pred_states is None: + last_pred_states = pred_states + + ari = {adjusted_rand_score(last_pred_states, pred_states)} + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + logger.info(f"Calculating M-step (v2) for iteration {r} of {max_iter}.") if "s" in self.params: From 9882e5e0d227efbecde70a0c772934a6086aad0d Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 11:41:16 -0400 Subject: [PATCH 111/125] add HMM ARI --- src/calicost/hmm_NB_BB_nophasing_v2.py | 4 +++- src/calicost/hmm_NB_BB_phaseswitch.py | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 71af39d..ebae623 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -395,7 +395,9 @@ def run_baum_welch_nb_bb( unique_values_bb, mapping_matrices_bb = construct_unique_matrix( X[:, 1, :], total_bb_RD ) - + + last_pred_states = None + for r in range(max_iter): logger.info("-" * 250) logger.info(f"Calculating E-step (v2) for iteration {r} of {max_iter}.") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index b32191b..8d7b12e 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -13,6 +13,7 @@ from calicost.utils_distribution_fitting import * from calicost.hmm_NB_BB_nophasing import * from calicost.hmm_NB_BB_nophasing_v2 import * +from sklearn.metrics import adjusted_rand_score import networkx as nx logger = logging.getLogger(__name__) @@ -439,6 +440,8 @@ def run_baum_welch_nb_bb( X[:, 1, :], total_bb_RD ) + last_pred_states = None + for r in range(max_iter): logger.info("-" * 250) logger.info( @@ -483,12 +486,21 @@ def run_baum_welch_nb_bb( log_sitewise_transmat, ) - log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_sitewise( log_alpha, log_beta, log_transmat, log_emission ) + + log_gamma = compute_posterior_obs(log_alpha, log_beta) + pred_states = np.argmax(log_gamma, axis=0) + + if last_pred_states is None: + last_pred_states = pred_states + + ari = {adjusted_rand_score(last_pred_states, pred_states)} + + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + logger.info( f"Calculating M-step (sitewise) for iteration {r} of {max_iter}." ) From 0febc15c03086df99a44c787868eef52c69f443e Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 11:43:42 -0400 Subject: [PATCH 112/125] fix --- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmm_NB_BB_phaseswitch.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index ebae623..58271ca 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -493,7 +493,7 @@ def run_baum_welch_nb_bb( ari = {adjusted_rand_score(last_pred_states, pred_states)} - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f} (first iteration burn-in).") logger.info(f"Calculating M-step (v2) for iteration {r} of {max_iter}.") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 8d7b12e..eef99a1 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -495,11 +495,11 @@ def run_baum_welch_nb_bb( pred_states = np.argmax(log_gamma, axis=0) if last_pred_states is None: - last_pred_states = pred_states + last_pred_states = pred_states ari = {adjusted_rand_score(last_pred_states, pred_states)} - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f} (first iteration burn-in).") logger.info( f"Calculating M-step (sitewise) for iteration {r} of {max_iter}." From e0c37128cf3e8b2f24a28f60498524fa9cea1b02 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:44:45 -0400 Subject: [PATCH 113/125] fix --- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmm_NB_BB_phaseswitch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 58271ca..d946a16 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -491,7 +491,7 @@ def run_baum_welch_nb_bb( if last_pred_states is None: last_pred_states = pred_states - ari = {adjusted_rand_score(last_pred_states, pred_states)} + ari = adjusted_rand_score(last_pred_states, pred_states) logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f} (first iteration burn-in).") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index eef99a1..9d2c6be 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -497,7 +497,7 @@ def run_baum_welch_nb_bb( if last_pred_states is None: last_pred_states = pred_states - ari = {adjusted_rand_score(last_pred_states, pred_states)} + ari = adjusted_rand_score(last_pred_states, pred_states) logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f} (first iteration burn-in).") From ec676e6c445c0ad38bce2361ea38539cd6b2a205 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 11:49:45 -0400 Subject: [PATCH 114/125] fix --- src/calicost/hmm_NB_BB_nophasing_v2.py | 10 +++++----- src/calicost/hmm_NB_BB_phaseswitch.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index d946a16..2fdaf29 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -488,12 +488,12 @@ def run_baum_welch_nb_bb( pred_states = np.argmax(log_gamma, axis=0) - if last_pred_states is None: - last_pred_states = pred_states - - ari = adjusted_rand_score(last_pred_states, pred_states) + if last_pred_states is not None: + ari = adjusted_rand_score(last_pred_states, pred_states) + + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f} (first iteration burn-in).") + last_pred_states = pred_states logger.info(f"Calculating M-step (v2) for iteration {r} of {max_iter}.") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 9d2c6be..51324da 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -494,12 +494,12 @@ def run_baum_welch_nb_bb( pred_states = np.argmax(log_gamma, axis=0) - if last_pred_states is None: - last_pred_states = pred_states + if last_pred_states is not None: + ari = adjusted_rand_score(last_pred_states, pred_states) - ari = adjusted_rand_score(last_pred_states, pred_states) + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f} (first iteration burn-in).") + last_pred_states = pred_states logger.info( f"Calculating M-step (sitewise) for iteration {r} of {max_iter}." From 1b56487d85afd3837c3d2349f9f257aa7f222476 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:50:40 -0400 Subject: [PATCH 115/125] fix --- src/calicost/hmm_NB_BB_phaseswitch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 51324da..d0ede40 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -495,10 +495,10 @@ def run_baum_welch_nb_bb( pred_states = np.argmax(log_gamma, axis=0) if last_pred_states is not None: - ari = adjusted_rand_score(last_pred_states, pred_states) + ari = adjusted_rand_score(last_pred_states, pred_states) logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") - + last_pred_states = pred_states logger.info( From 2977647bdf8ccca90c26b1def421415b93e39733 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 11:53:23 -0400 Subject: [PATCH 116/125] fix --- src/calicost/hmm_NB_BB_nophasing.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index 0a436b9..e88ab7b 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -9,6 +9,7 @@ from tqdm import trange import statsmodels.api as sm from statsmodels.base.model import GenericLikelihoodModel +from sklearn.metrics import adjusted_rand_score import copy from calicost.utils_distribution_fitting import * from calicost.utils_hmm import * @@ -363,6 +364,8 @@ def run_baum_welch_nb_bb( X[:, 1, :], total_bb_RD ) + last_pred_states = None + for r in range(max_iter): logger.info(f"Calculating E-step for iteration {r} of {max_iter}.") @@ -404,12 +407,21 @@ def run_baum_welch_nb_bb( log_sitewise_transmat, ) - log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_nophasing( log_alpha, log_beta, log_transmat, log_emission ) + log_gamma = compute_posterior_obs(log_alpha, log_beta) + + pred_states = np.argmax(log_gamma, axis=0) + + if last_pred_states is not None: + ari = adjusted_rand_score(last_pred_states, pred_states) + + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + + last_pred_states = pred_states + logger.info(f"Calculating M-step for iteration {r} of {max_iter}.") if "s" in self.params: From ea01b9ef349a7bcf9851ac98ac8a1c0a954da3f7 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 12:01:58 -0400 Subject: [PATCH 117/125] make parent dirs for chains --- src/calicost/utils_distribution_fitting.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 09e5266..1eaea2e 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -7,6 +7,7 @@ import sys import time from abc import ABC, abstractmethod +from pathlib import Path import numpy as np import scipy @@ -167,9 +168,13 @@ def fit( tmp_path = f"{self.__class__.__name__.lower()}_chain.tmp" - # TODO mkdir chains ninst = self.get_ninstance() - final_path = f"chains/{self.__class__.__name__.lower()}_chain_{ninst}.txt.gzip" + + # TODO mkdir chains + class_name = self.__class__.__name__.lower() + final_path = f"chains/{class_name}/{class_name}_chain_{ninst}.txt.gzip" + + Path(final_path).mkdir(parents=True, exist_ok=True) with save_stdout(tmp_path): result = super().fit( From 890d4a7f63330a27fe1e20bbd8a48605449a0c6b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 12:05:09 -0400 Subject: [PATCH 118/125] fix --- src/calicost/utils_distribution_fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 1eaea2e..181cc2c 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -174,7 +174,7 @@ def fit( class_name = self.__class__.__name__.lower() final_path = f"chains/{class_name}/{class_name}_chain_{ninst}.txt.gzip" - Path(final_path).mkdir(parents=True, exist_ok=True) + Path(final_path).parent.mkdir(parents=True, exist_ok=True) with save_stdout(tmp_path): result = super().fit( From 4a1039c759b2e9daa9bd11380ce388e19703c570 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 12:11:10 -0400 Subject: [PATCH 119/125] add hamming distances --- src/calicost/hmm_NB_BB_nophasing.py | 3 ++- src/calicost/hmm_NB_BB_nophasing_v2.py | 5 +++-- src/calicost/hmm_NB_BB_phaseswitch.py | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index e88ab7b..541acca 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -417,8 +417,9 @@ def run_baum_welch_nb_bb( if last_pred_states is not None: ari = adjusted_rand_score(last_pred_states, pred_states) + hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f}.") last_pred_states = pred_states diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 2fdaf29..9a1b9be 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -490,8 +490,9 @@ def run_baum_welch_nb_bb( if last_pred_states is not None: ari = adjusted_rand_score(last_pred_states, pred_states) - - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + hamm = sum(last_pred_states != pred_states) + + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f}.") last_pred_states = pred_states diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index d0ede40..1e4c34e 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -496,8 +496,9 @@ def run_baum_welch_nb_bb( if last_pred_states is not None: ari = adjusted_rand_score(last_pred_states, pred_states) + hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f}.") last_pred_states = pred_states From ec8881b3e86f10844f3afa90d7b0e6b94d264edc Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 13:53:57 -0400 Subject: [PATCH 120/125] fixes --- src/calicost/utils_distribution_fitting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 181cc2c..52b4f5c 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -77,11 +77,11 @@ class WeightedModel(GenericLikelihoodModel, ABC): Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ - def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): + def __init__(self, endog, exog, weights, exposure, tumor_prop=None, seed=0, **kwargs): super().__init__(endog, exog, **kwargs) # NB unpack a single additional positional argument as tumor_proportion. 
- self.tumor_prop = args if len(args) == 1 else None + self.tumor_prop = tumor_prop self.weights = weights self.exposure = exposure @@ -241,7 +241,7 @@ def nloglikeobs(self, params): def get_default_start_params(self): return np.append(0.1 * np.ones(self.exog.shape[1]), 0.01) - def get_ext_param_name(): + def get_ext_param_name(self): return "alpha" def __post_init__(self): @@ -327,7 +327,7 @@ def nloglikeobs(self, params): def get_default_start_params(self): return np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) - def get_ext_param_name(): + def get_ext_param_name(self): return "tau" def __post_init__(self): From d883c3497e2e93375651a3b62d1b468636be2dd4 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 13:57:17 -0400 Subject: [PATCH 121/125] add # of states for hamming. --- src/calicost/hmm_NB_BB_nophasing.py | 2 +- src/calicost/hmm_NB_BB_nophasing_v2.py | 4 ++-- src/calicost/hmm_NB_BB_phaseswitch.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index 541acca..ce93881 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -419,7 +419,7 @@ def run_baum_welch_nb_bb( ari = adjusted_rand_score(last_pred_states, pred_states) hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f}.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(log_gamma)} states.") last_pred_states = pred_states diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 9a1b9be..a16b8dc 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -492,7 +492,7 @@ def run_baum_welch_nb_bb( ari = adjusted_rand_score(last_pred_states, pred_states) hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f}.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(log_gamma)} states.") last_pred_states = pred_states @@ -604,7 +604,7 @@ def run_baum_welch_nb_bb( logger.info( f"EM convergence metrics (v2): startprob={np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob)))}, transmat={np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat)))}," - +"mu={np.mean(np.abs(new_log_mu - log_mu))}, pbinom={np.mean(np.abs(new_p_binom - p_binom))}" + + f"mu={np.mean(np.abs(new_log_mu - log_mu))}, pbinom={np.mean(np.abs(new_p_binom - p_binom))}" ) if ( diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 1e4c34e..d38cbb9 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -498,7 +498,7 @@ def run_baum_welch_nb_bb( ari = adjusted_rand_score(last_pred_states, pred_states) hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f}.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(log_gamma)} states.") last_pred_states = pred_states From 6f66d31e4dfbf2d1aaed3b00710092b83eaf5fe4 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 14:12:49 -0400 Subject: [PATCH 122/125] fix --- src/calicost/hmm_NB_BB_nophasing.py | 2 +- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmm_NB_BB_phaseswitch.py | 2 +- src/calicost/utils_distribution_fitting.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index ce93881..0b182d1 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -419,7 +419,7 @@ def run_baum_welch_nb_bb( ari = adjusted_rand_score(last_pred_states, pred_states) hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(log_gamma)} states.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(last_pred_states)} states.") last_pred_states = pred_states diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index a16b8dc..241e5da 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -492,7 +492,7 @@ def run_baum_welch_nb_bb( ari = adjusted_rand_score(last_pred_states, pred_states) hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(log_gamma)} states.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(last_pred_states)} states.") last_pred_states = pred_states diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index d38cbb9..f1e25a0 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -498,7 +498,7 @@ def run_baum_welch_nb_bb( ari = adjusted_rand_score(last_pred_states, pred_states) hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(log_gamma)} states.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(last_pred_states)} states.") last_pred_states = pred_states diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 52b4f5c..ccc07f0 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -194,7 +194,7 @@ def fit( runtime = time.time() - start logger.info( - f"Finished {self.__class__.__name__} optimization in {runtime:.2f} seconds, with {niter} iterations." + f"{self.__class__.__name__} optimization in {runtime:.2f}s, with {niter} iterations. Best-fit: {result.params}" ) if write_chain: @@ -216,7 +216,7 @@ def fit( fout.write(line) os.remove(tmp_path) - + return result From 28b0b817a5f2ece6f933dfabccf89ed968200e52 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 15:22:25 -0400 Subject: [PATCH 123/125] fix increment bug. 
--- src/calicost/utils_distribution_fitting.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index ccc07f0..ade736c 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -80,7 +80,6 @@ class WeightedModel(GenericLikelihoodModel, ABC): def __init__(self, endog, exog, weights, exposure, tumor_prop=None, seed=0, **kwargs): super().__init__(endog, exog, **kwargs) - # NB unpack a single additional positional argument as tumor_proportion. self.tumor_prop = tumor_prop self.weights = weights @@ -273,7 +272,7 @@ def get_ext_param_name(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." - Weighted_NegativeBinomial_mix.ninstance + Weighted_NegativeBinomial_mix.ninstance += 1 class Weighted_BetaBinom(WeightedModel): From 3463cc33852e3fb825b2c92bcbc85787f9c3858f Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 15:25:47 -0400 Subject: [PATCH 124/125] fix --- src/calicost/utils_distribution_fitting.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index ade736c..8e0cf4f 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -339,11 +339,10 @@ class Weighted_BetaBinom_fixdispersion(WeightedModel): ninstance = 0 # NB custom __init__ required to handle tau. - def __init__(self, endog, exog, tau, weights, exposure, *args, seed=0, **kwargs): + def __init__(self, endog, exog, tau, weights, exposure, *args, tumor_prop=None, seed=0, **kwargs): super().__init__(endog, exog, **kwargs) - # NB unpack a single additional positional argument as tumor_proportion. - self.tumor_prop = args if len(args) == 1 else None + self.tumor_prop = tumor_prop self.tau = tau self.weights = weights @@ -377,11 +376,10 @@ def __post_init__(self): class Weighted_BetaBinom_fixdispersion_mix(WeightedModel): # NB custom __init__ required to handle tau. - def __init__(self, endog, exog, tau, weights, exposure, *args, seed=0, **kwargs): + def __init__(self, endog, exog, tau, weights, exposure, *args, tumor_prop=None, seed=0, **kwargs): super().__init__(endog, exog, **kwargs) - # NB unpack a single additional positional argument as tumor_proportion. - self.tumor_prop = args if len(args) == 1 else None + self.tumor_prop = tumor_prop self.tau = tau self.weights = weights From 5c95c473e321bff85290c46a449dc040b123b1dc Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 2 Sep 2024 10:11:59 -0400 Subject: [PATCH 125/125] remove examples
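
Editor's note on the convergence diagnostic added across patches 114, 116, 119, 121 and 122: after each Baum-Welch E-step, the posterior-argmax state assignment is compared with the previous iteration's assignment via the adjusted Rand index and a Hamming count (number of bins whose state changed), with the first iteration treated as burn-in. The following is a minimal, self-contained sketch of that pattern only; the toy `log_gamma`, `n_states`, `n_obs` and `n_iter` values are placeholders, not CaliCoST code.

    # Sketch (not CaliCoST source): track how the argmax hidden-state assignment
    # changes between successive Baum-Welch iterations.
    import logging

    import numpy as np
    from sklearn.metrics import adjusted_rand_score

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    rng = np.random.default_rng(0)
    n_states, n_obs, n_iter = 5, 1_000, 3

    last_pred_states = None

    for r in range(n_iter):
        # Stand-in for the E-step posterior log_gamma of shape (n_states, n_obs);
        # a real run would obtain this from forward-backward.
        log_gamma = np.log(rng.dirichlet(np.ones(n_states), size=n_obs).T)

        pred_states = np.argmax(log_gamma, axis=0)

        if last_pred_states is not None:
            ari = adjusted_rand_score(last_pred_states, pred_states)
            hamm = np.sum(last_pred_states != pred_states)

            logger.info(
                f"Iteration {r}: ARI = {ari:.6f}, Hamming = {hamm} "
                f"over {len(pred_states)} bins and {log_gamma.shape[0]} states."
            )

        last_pred_states = pred_states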
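
Editor's note on the chain-output directory fix in patches 117–118: calling `mkdir` on the output path itself would create a directory named after the chain file, whereas the intent is to create its enclosing `chains/<class_name>/` directory, hence `Path(final_path).parent.mkdir(parents=True, exist_ok=True)`. A small sketch of the corrected behaviour, with `class_name` and `ninst` as hypothetical placeholders:

    # Sketch (assumed file layout): ensure the per-class chains directory exists
    # before writing the chain file.
    from pathlib import Path

    class_name = "weighted_betabinom"   # hypothetical class name
    ninst = 0                           # hypothetical instance counter

    final_path = Path(f"chains/{class_name}/{class_name}_chain_{ninst}.txt.gzip")

    # Create chains/<class_name>/, not a directory named after the file itself.
    final_path.parent.mkdir(parents=True, exist_ok=True)

    final_path.write_text("placeholder chain contents\n")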
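
Editor's note on the constructor and counter fixes in patches 120 and 123–124: an explicit `tumor_prop=None` keyword replaces unpacking a lone positional entry from `*args` (which stored a tuple rather than the value), the bare statement `Weighted_NegativeBinomial_mix.ninstance` is a no-op and must be the augmented assignment `+= 1`, and the convergence-metrics message in patch 121 needed an `f` prefix on its second string for the `{...}` fields to interpolate. A minimal sketch of these three idioms using a hypothetical class, not the CaliCoST models:

    # Sketch (hypothetical class): explicit keyword argument, class-level
    # instance counter, and f-string interpolation.
    import numpy as np


    class WeightedToyModel:
        ninstance = 0

        def __init__(self, endog, exog, weights, exposure, tumor_prop=None, seed=0):
            # Explicit keyword: self.tumor_prop is the value (or None), never a tuple.
            self.endog = np.asarray(endog)
            self.exog = np.asarray(exog)
            self.weights = np.asarray(weights)
            self.exposure = np.asarray(exposure)
            self.tumor_prop = tumor_prop
            self.rng = np.random.default_rng(seed)

            # A bare `WeightedToyModel.ninstance` would evaluate and discard the
            # attribute; the augmented assignment actually increments the counter.
            WeightedToyModel.ninstance += 1


    model = WeightedToyModel([1.0], [[1.0]], [1.0], [1.0], tumor_prop=0.3)

    old_mu, new_mu = 0.10, 0.08

    # Both concatenated pieces need the f prefix, otherwise "{...}" is printed literally.
    print(
        f"instances={WeightedToyModel.ninstance}, "
        + f"mu shift={abs(new_mu - old_mu):.3f}"
    )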