From 241208c03054b1db1b1af94fed155cb2c690a89a Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 8 Aug 2024 11:36:33 -0400 Subject: [PATCH 001/125] main script with more comments --- src/calicost/calicost_main.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 8990bc8..0b0add0 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -35,9 +35,18 @@ def main(configuration_file): for k in sorted(list(config.keys())): print(f"\t{k} : {config[k]}") + # Assuming the B counts are calculated by the cellsnp-lite and Eagle pipeline + # If assuming each spot contains a mixture of normal/tumor cells, the tumor proportion should be provided in the config file. + # load data + ## If the data is loaded for the first time: infer phasing using phase-switch HMM (hmm_NB_BB_phaseswitch.py and phasing.py) -> output initial_phase.npz, matrices in parsed_inputs folder + ## If the data is already loaded: load the matrices from parsed_inputs folder lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config) + """ + Initial clustering spots using only BAF values. + """ + # setting transcript count to 0, and baseline so that emission probability calculation will ignore them. copy_single_X_rdr = copy.copy(single_X[:,0,:]) copy_single_base_nb_mean = copy.copy(single_base_nb_mean) single_X[:,0,:] = 0 @@ -64,6 +73,8 @@ def main(configuration_file): np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz", **allres) # run HMRF + HMM + # store the results of each iteration of HMRF in a npz file outdir/prefix_nstates{config['n_states']}_sp.npz + # if a specific iteration is computed, hmrf will directly load the results from the file if config["tumorprop_file"] is None: hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states=config["n_states"], \ log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], \ @@ -88,6 +99,8 @@ def main(configuration_file): else: X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"]))], single_tumor_prop, threshold=config["tumorprop_threshold"]) tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) + # merge "similar" clones from the initial number of clones. 
+ # "similar" defined by Neyman Pearson statistics/ Likelihood ratios P(clone A counts | BAF parameters for clone A) / P(clone A counts | BAF parameters for clone B) merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, threshold=config["np_threshold"], minlength=config["np_eventminlen"], params="sp", tumor_prop=tumor_prop, hmmclass=hmm_nophasing_v2) print(f"BAF clone merging after comparing similarity: {merging_groups}") # @@ -99,7 +112,7 @@ def main(configuration_file): n_baf_clones = len(merging_groups) np.savez(f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", **merged_res) - # adjust phasing + # load merged results n_obs = single_X.shape[0] merged_res = dict(np.load(f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", allow_pickle=True)) merged_baf_assignment = copy.copy(merged_res["new_assignment"]) @@ -109,8 +122,12 @@ def main(configuration_file): merged_baf_profiles = np.array([ np.where(pred[c,:] < config["n_states"], merged_res["new_p_binom"][pred[c,:]%config["n_states"], 0], 1-merged_res["new_p_binom"][pred[c,:]%config["n_states"], 0]) \ for c in range(n_baf_clones) ]) + """ + Refined clustering using BAF and RDR values. + """ # adding RDR information if not config["bafonly"]: + # Only used when assuming each spot is pure normal or tumor and if we don't know which spots are normal spots. # select normal spots if (config["normalidx_file"] is None) and (config["tumorprop_file"] is None): EPS_BAF = 0.05 @@ -129,19 +146,24 @@ def main(configuration_file): # single_base_nb_mean has already been added in loading data step. if not config["tumorprop_file"] is None: logger.warning(f"Mixed sources of information for normal spots! Using {config['normalidx_file']}") + + # If tumor purity is provided, we can use it to select normal spots. else: for prop_threshold in np.arange(0.05, 0.6, 0.05): normal_candidate = (single_tumor_prop < prop_threshold) if np.sum(copy_single_X_rdr[:, (normal_candidate==True)]) > single_X.shape[0] * 200: break - # filter bins based on normal + # To avoid allele-specific expression that are not relevant to CNA, filter bins where normal pseudobulk has large |BAF - 0.5| index_normal = np.where(normal_candidate)[0] lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \ single_X, single_base_nb_mean, single_total_bb_RD, config['nu'], config['logphase_shift'], index_normal, config['geneticmap_file']) assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0] df_bininfo = genesnp_to_bininfo(df_gene_snp) copy_single_X_rdr = copy.copy(single_X[:,0,:]) - # filter out high-UMI DE genes, which may bias RDR estimates + + # If a gene has way higher expression than adjacent genes, its transcript count will dominate RDR values + # To avoid the domination, filter out high-UMI DE genes, which may bias RDR estimates + # Assume the remaining genes will still carry the CNA info. 
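+    # The filtered counts are then used to build the per-bin RDR baseline from the normal pseudobulk:
+    # bins whose normal pseudobulk count falls below MIN_NORMAL_COUNT_PERBIN are zeroed in both the counts and the baseline,
+    # so that the emission probability calculation ignores them (see the lines below), as in the BAF-only step above.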
copy_single_X_rdr, _ = filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=sample_list, sample_ids=sample_ids) MIN_NORMAL_COUNT_PERBIN = 20 bidx_inconfident = np.where( np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) < MIN_NORMAL_COUNT_PERBIN )[0] @@ -166,6 +188,7 @@ def main(configuration_file): if np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20: # put a minimum B allele read count on pseudobulk to split clones continue # initialize clone + # write the initialization in a npz file outdir/prefix_nstates{config['n_states']}_smp.npz if config["tumorprop_file"] is None: initial_clone_index = rectangle_initialize_initial_clone(coords[idx_spots], config['n_clones_rdr'], random_state=r_hmrf_initialization) else: From 96e873fa36a54416d96d730789ad9a93db93bd5f Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 8 Aug 2024 13:41:13 -0400 Subject: [PATCH 002/125] add comment to hmrf main --- src/calicost/calicost_main.py | 1 + src/calicost/hmrf.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 0b0add0..8b144df 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -298,6 +298,7 @@ def main(configuration_file): log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) # final re-assignment across all clones using estimated RDR + BAF + # The following step may not be needed because of other improvements. And it may cause mistakes in some cases. if config["tumorprop_file"] is None: if config["nodepotential"] == "max": pred = np.vstack([ np.argmax(res_combine["log_gamma"][:,:,c], axis=0) for c in range(res_combine["log_gamma"].shape[2]) ]).T diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 185ca3a..e8e862f 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -27,11 +27,20 @@ ############################################################ def hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, res, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): + """ + Choosing clones by Iterated Conditional Modes (Forward-backward version): + for which the emission probability is given by the posterior probability of all HMM states at each bin. + Input format assumption: the RDR/BAF vectors are not shared across clones <- after clone refinement with RDR+BAF signals. + + HMRF likelihood: node potential where each node is a spot. And edge potential. + Node potential: likelihood of the data given HMM states of each clone. + Edge potential: Potts model. 
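+
+    Roughly, the ICM update for each spot i is (a sketch in terms of the variables below,
+    where single_llf holds the per-spot, per-clone node potentials):
+        w_node = single_llf[i, :] + log_persample_weights[:, sample_ids[i]]
+        w_edge = np.zeros(n_clones)
+        for j in adjacency_mat[i, :].nonzero()[1]:
+            w_edge[new_assignment[j]] += adjacency_mat[i, j]
+        new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge)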
+ """ N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] n_states = res["new_p_binom"].shape[0] - single_llf = np.zeros((N, n_clones)) + single_llf = np.zeros((N, n_clones)) # node potential new_assignment = copy.copy(prev_assignment) # posterior = np.zeros((N, n_clones)) @@ -72,6 +81,12 @@ def hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_R def aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, res, pred, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): + """ + Choosing clones by Iterated Conditional Modes (Viterbi version): + for which the emission probability of each spot is a single of HMM state sequence. + Input format assumption: the RDR/BAF vectors are not shared across clones <- after clone refinement with RDR+BAF signals. + + """ N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] @@ -116,6 +131,9 @@ def aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, re def hmrf_reassignment_posterior_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, res, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): + """ + Input format assumption: the RDR/BAF vector is shared across all clones <- using only BAF signals, or running for each initial clone + """ N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = np.max(prev_assignment) + 1 From 0cdbcc2e1ccc7d85947a48985d5d9ce8555519e0 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 8 Aug 2024 13:52:14 -0400 Subject: [PATCH 003/125] add versions in setup --- setup.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index 6941a67..c447610 100644 --- a/setup.py +++ b/setup.py @@ -12,20 +12,20 @@ long_description='CalicoST infers allele-specific copy number aberrations and cancer clones in spatially resolved transcriptomics data', url='https://github.com/raphael-group/CalicoST', install_requires=[ - 'numpy', - 'scipy', - 'pandas', - 'scikit-learn', - 'scanpy', - 'anndata', - 'numba', - 'tqdm', - 'statsmodels', - 'networkx', - 'matplotlib', - 'seaborn', - 'pysam', - 'ete3', + 'numpy=1.24.4', + 'scipy=1.11.3', + 'pandas=2.1.1', + 'scikit-learn=1.3.2', + 'scanpy=1.9.6', + 'anndata=0.10.3', + 'numba=0.60.0', + 'tqdm=4.66.1', + 'statsmodels=0.14.0', + 'networkx=3.2.1', + 'matplotlib=3.7.3', + 'seaborn=0.12.2', + 'pysam=0.22.1', + 'ete3=3.1.3', 'ipykernel' ], include_package_data=True From 0b66b45954c77b1b5149e6dbff4e5d822584d908 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Sun, 11 Aug 2024 08:10:22 -0400 Subject: [PATCH 004/125] black formatted codes --- src/calicost/__init__.py | 2 +- src/calicost/allele_starch_generateconfig.py | 220 +- src/calicost/arg_parse.py | 356 +- src/calicost/calicost_main.py | 1181 +++++-- src/calicost/calicost_supervised.py | 1243 +++++-- src/calicost/estimate_tumor_proportion.py | 252 +- src/calicost/find_integer_copynumber.py | 290 +- src/calicost/hmm_NB_BB_nophasing.py | 317 +- src/calicost/hmm_NB_BB_nophasing_v2.py | 396 ++- src/calicost/hmm_NB_BB_phaseswitch.py | 1145 +++++-- src/calicost/hmm_NB_sharedstates.py | 164 +- src/calicost/hmm_gaussian.py | 506 ++- src/calicost/hmrf.py | 1876 +++++++++-- src/calicost/hmrf_normalmixture.py | 1 - src/calicost/joint_allele_generateconfig.py | 229 +- src/calicost/oldcode.py | 938 
++++-- src/calicost/parse_input.py | 410 ++- src/calicost/phasing.py | 185 +- src/calicost/phylogeny_startle.py | 170 +- src/calicost/phylogeography.py | 106 +- src/calicost/simple_sctransform.py | 142 +- src/calicost/utils_IO.py | 1529 +++++++-- src/calicost/utils_distribution_fitting.py | 116 +- src/calicost/utils_hmm.py | 1689 +++++++--- src/calicost/utils_hmrf.py | 609 +++- src/calicost/utils_phase_switch.py | 282 +- src/calicost/utils_plotting.py | 3146 ++++++++++++++---- 27 files changed, 13506 insertions(+), 3994 deletions(-) diff --git a/src/calicost/__init__.py b/src/calicost/__init__.py index 4957a9c..992770f 100644 --- a/src/calicost/__init__.py +++ b/src/calicost/__init__.py @@ -1 +1 @@ -__version__ = 'v1.0.0' +__version__ = "v1.0.0" diff --git a/src/calicost/allele_starch_generateconfig.py b/src/calicost/allele_starch_generateconfig.py index 3444c14..6320216 100644 --- a/src/calicost/allele_starch_generateconfig.py +++ b/src/calicost/allele_starch_generateconfig.py @@ -19,117 +19,119 @@ def read_configuration_file(filename): ##### [Default settings] ##### config = { - "spaceranger_dir" : None, - "snp_dir" : None, - "output_dir" : None, + "spaceranger_dir": None, + "snp_dir": None, + "output_dir": None, # supporting files and preprocessing arguments - "hgtable_file" : None, - "normalidx_file" : None, - "tumorprop_file" : None, - "supervision_clone_file" : None, - "filtergenelist_file" : None, - "filterregion_file" : None, - "binsize" : 1, - "rdrbinsize" : 1, + "hgtable_file": None, + "normalidx_file": None, + "tumorprop_file": None, + "supervision_clone_file": None, + "filtergenelist_file": None, + "filterregion_file": None, + "binsize": 1, + "rdrbinsize": 1, # "secondbinning_min_umi" : 500, - "max_nbins" : 1200, - "avg_umi_perbinspot" : 1.5, - "bafonly" : True, + "max_nbins": 1200, + "avg_umi_perbinspot": 1.5, + "bafonly": True, # phase switch probability - "nu" : 1, - "logphase_shift" : 1, - "npart_phasing" : 2, + "nu": 1, + "logphase_shift": 1, + "npart_phasing": 2, # HMRF configurations - "n_clones" : None, - "n_clones_rdr" : 2, - "min_spots_per_clone" : 100, - "min_avgumi_per_clone" : 10, - "maxspots_pooling" : 7, - "tumorprop_threshold" : 0.5, - "max_iter_outer" : 20, - "nodepotential" : "max", # max or weighted_sum - "initialization_method" : "rectangle", # rectangle or datadrive - "num_hmrf_initialization_start" : 0, - "num_hmrf_initialization_end" : 10, - "spatial_weight" : 2.0, - "construct_adjacency_method" : "hexagon", - "construct_adjacency_w" : 1.0, + "n_clones": None, + "n_clones_rdr": 2, + "min_spots_per_clone": 100, + "min_avgumi_per_clone": 10, + "maxspots_pooling": 7, + "tumorprop_threshold": 0.5, + "max_iter_outer": 20, + "nodepotential": "max", # max or weighted_sum + "initialization_method": "rectangle", # rectangle or datadrive + "num_hmrf_initialization_start": 0, + "num_hmrf_initialization_end": 10, + "spatial_weight": 2.0, + "construct_adjacency_method": "hexagon", + "construct_adjacency_w": 1.0, # HMM configurations - "n_states" : None, - "params" : None, - "t" : None, - "t_phaseing" : 1-1e-4, - "fix_NB_dispersion" : False, - "shared_NB_dispersion" : True, - "fix_BB_dispersion" : False, - "shared_BB_dispersion" : True, - "max_iter" : 30, - "tol" : 1e-3, - "gmm_random_state" : 0, - "np_threshold" : 2.0, - "np_eventminlen" : 10 + "n_states": None, + "params": None, + "t": None, + "t_phaseing": 1 - 1e-4, + "fix_NB_dispersion": False, + "shared_NB_dispersion": True, + "fix_BB_dispersion": False, + "shared_BB_dispersion": True, + "max_iter": 30, + "tol": 
1e-3, + "gmm_random_state": 0, + "np_threshold": 2.0, + "np_eventminlen": 10, } argument_type = { - "spaceranger_dir" : "str", - "snp_dir" : "str", - "output_dir" : "str", + "spaceranger_dir": "str", + "snp_dir": "str", + "output_dir": "str", # supporting files and preprocessing arguments - "hgtable_file" : "str", - "normalidx_file" : "str", - "tumorprop_file" : "str", - "supervision_clone_file" : "str", - "filtergenelist_file" : "str", - "filterregion_file" : "str", - "binsize" : "int", - "rdrbinsize" : "int", + "hgtable_file": "str", + "normalidx_file": "str", + "tumorprop_file": "str", + "supervision_clone_file": "str", + "filtergenelist_file": "str", + "filterregion_file": "str", + "binsize": "int", + "rdrbinsize": "int", # "secondbinning_min_umi" : "int", - "max_nbins" : "int", - "avg_umi_perbinspot" : "float", - "bafonly" : "bool", + "max_nbins": "int", + "avg_umi_perbinspot": "float", + "bafonly": "bool", # phase switch probability - "nu" : "float", - "logphase_shift" : "float", - "npart_phasing" : "int", + "nu": "float", + "logphase_shift": "float", + "npart_phasing": "int", # HMRF configurations - "n_clones" : "int", - "n_clones_rdr" : "int", - "min_spots_per_clone" : "int", - "min_avgumi_per_clone" : "int", - "maxspots_pooling" : "int", - "tumorprop_threshold" : "float", - "max_iter_outer" : "int", - "nodepotential" : "str", - "initialization_method" : "str", - "num_hmrf_initialization_start" : "int", - "num_hmrf_initialization_end" : "int", - "spatial_weight" : "float", - "construct_adjacency_method" : "str", - "construct_adjacency_w" : "float", + "n_clones": "int", + "n_clones_rdr": "int", + "min_spots_per_clone": "int", + "min_avgumi_per_clone": "int", + "maxspots_pooling": "int", + "tumorprop_threshold": "float", + "max_iter_outer": "int", + "nodepotential": "str", + "initialization_method": "str", + "num_hmrf_initialization_start": "int", + "num_hmrf_initialization_end": "int", + "spatial_weight": "float", + "construct_adjacency_method": "str", + "construct_adjacency_w": "float", # HMM configurations - "n_states" : "int", - "params" : "str", - "t" : "eval", - "t_phaseing" : "eval", - "fix_NB_dispersion" : "bool", - "shared_NB_dispersion" : "bool", - "fix_BB_dispersion" : "bool", - "shared_BB_dispersion" : "bool", - "max_iter" : "int", - "tol" : "float", - "gmm_random_state" : "int", - "np_threshold" : "float", - "np_eventminlen" : "int" + "n_states": "int", + "params": "str", + "t": "eval", + "t_phaseing": "eval", + "fix_NB_dispersion": "bool", + "shared_NB_dispersion": "bool", + "fix_BB_dispersion": "bool", + "shared_BB_dispersion": "bool", + "max_iter": "int", + "tol": "float", + "gmm_random_state": "int", + "np_threshold": "float", + "np_eventminlen": "int", } ##### [ read configuration file to update settings ] ##### - with open(filename, 'r') as fp: + with open(filename, "r") as fp: for line in fp: if line.strip() == "" or line[0] == "#": continue # strs = [x.replace(" ", "") for x in line.strip().split(":") if x != ""] strs = [x.strip() for x in line.strip().split(":") if x != ""] - assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" + assert ( + strs[0] in config.keys() + ), f"{strs[0]} is not a valid configuration parameter! 
Configuration parameters are: {list(config.keys())}" if strs[1].upper() == "NONE": config[strs[0]] = None elif argument_type[strs[0]] == "str": @@ -141,7 +143,7 @@ def read_configuration_file(filename): elif argument_type[strs[0]] == "eval": config[strs[0]] = eval(strs[1]) elif argument_type[strs[0]] == "bool": - config[strs[0]] = (strs[1].upper() == "TRUE") + config[strs[0]] = strs[1].upper() == "TRUE" elif argument_type[strs[0]] == "list_str": config[strs[0]] = strs[1].split(" ") # assertions @@ -153,10 +155,9 @@ def read_configuration_file(filename): def write_config_file(outputfilename, config): - list_argument_io = ["spaceranger_dir", - "snp_dir", - "output_dir"] - list_argument_sup = ["hgtable_file", + list_argument_io = ["spaceranger_dir", "snp_dir", "output_dir"] + list_argument_sup = [ + "hgtable_file", "normalidx_file", "tumorprop_file", "supervision_clone_file", @@ -167,11 +168,11 @@ def write_config_file(outputfilename, config): # "secondbinning_min_umi", "max_nbins", "avg_umi_perbinspot", - "bafonly"] - list_argument_phase = ["nu", - "logphase_shift", - "npart_phasing"] - list_argument_hmrf = ["n_clones", + "bafonly", + ] + list_argument_phase = ["nu", "logphase_shift", "npart_phasing"] + list_argument_hmrf = [ + "n_clones", "n_clones_rdr", "min_spots_per_clone", "min_avgumi_per_clone", @@ -180,12 +181,14 @@ def write_config_file(outputfilename, config): "max_iter_outer", "nodepotential", "initialization_method", - "num_hmrf_initialization_start", + "num_hmrf_initialization_start", "num_hmrf_initialization_end", "spatial_weight", "construct_adjacency_method", - "construct_adjacency_w"] - list_argument_hmm = ["n_states", + "construct_adjacency_w", + ] + list_argument_hmm = [ + "n_states", "params", "t", "t_phaseing", @@ -197,8 +200,9 @@ def write_config_file(outputfilename, config): "tol", "gmm_random_state", "np_threshold", - "np_eventminlen"] - with open(outputfilename, 'w') as fp: + "np_eventminlen", + ] + with open(outputfilename, "w") as fp: # for k in list_argument_io: fp.write(f"{k} : {config[k]}\n") @@ -232,10 +236,10 @@ def main(argv): config = read_configuration_file(template_configuration_file) for r in range(hmrf_seed_s, hmrf_seed_t): config["num_hmrf_initialization_start"] = r - config["num_hmrf_initialization_end"] = r+1 + config["num_hmrf_initialization_end"] = r + 1 write_config_file(f"{outputdir}/configfile{r}", config) - + if __name__ == "__main__": if len(sys.argv) > 1: - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/src/calicost/arg_parse.py b/src/calicost/arg_parse.py index 32f5570..8bf796a 100644 --- a/src/calicost/arg_parse.py +++ b/src/calicost/arg_parse.py @@ -3,149 +3,213 @@ import scipy import pandas as pd import logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) logger = logging.getLogger() def load_default_config(): - config_joint = { - "input_filelist" : None, - "alignment_files" : [] - } - config_single = { - "spaceranger_dir" : None - } + config_joint = {"input_filelist": None, "alignment_files": []} + config_single = {"spaceranger_dir": None} config_shared = { - "snp_dir" : None, - "output_dir" : None, + "snp_dir": None, + "output_dir": None, # supporting files and preprocessing arguments - "geneticmap_file" : None, - "hgtable_file" : None, - "normalidx_file" : None, - "tumorprop_file" : None, - 
"supervision_clone_file" : None, - "filtergenelist_file" : None, - "filterregion_file" : None, - "secondary_min_umi" : 300, - "min_snpumi_perspot" : 50, - 'min_percent_expressed_spots' : 0.005, - "bafonly" : False, + "geneticmap_file": None, + "hgtable_file": None, + "normalidx_file": None, + "tumorprop_file": None, + "supervision_clone_file": None, + "filtergenelist_file": None, + "filterregion_file": None, + "secondary_min_umi": 300, + "min_snpumi_perspot": 50, + "min_percent_expressed_spots": 0.005, + "bafonly": False, # phase switch probability - "nu" : 1.0, - "logphase_shift" : -2.0, - "npart_phasing" : 3, + "nu": 1.0, + "logphase_shift": -2.0, + "npart_phasing": 3, # HMRF configurations - "n_clones" : None, - "n_clones_rdr" : 2, - "min_spots_per_clone" : 100, - "min_avgumi_per_clone" : 10, - "maxspots_pooling" : 7, - "tumorprop_threshold" : 0.5, - "max_iter_outer" : 20, - "nodepotential" : "weighted_sum", # max or weighted_sum - "initialization_method" : "rectangle", # rectangle or datadrive - "num_hmrf_initialization_start" : 0, - "num_hmrf_initialization_end" : 10, - "spatial_weight" : 1.0, - "construct_adjacency_method" : "hexagon", - "construct_adjacency_w" : 1.0, + "n_clones": None, + "n_clones_rdr": 2, + "min_spots_per_clone": 100, + "min_avgumi_per_clone": 10, + "maxspots_pooling": 7, + "tumorprop_threshold": 0.5, + "max_iter_outer": 20, + "nodepotential": "weighted_sum", # max or weighted_sum + "initialization_method": "rectangle", # rectangle or datadrive + "num_hmrf_initialization_start": 0, + "num_hmrf_initialization_end": 10, + "spatial_weight": 1.0, + "construct_adjacency_method": "hexagon", + "construct_adjacency_w": 1.0, # HMM configurations - "n_states" : None, - "params" : "smp", - "t" : 1-1e-5, - "t_phaseing" : 1-1e-4, - "fix_NB_dispersion" : False, - "shared_NB_dispersion" : True, - "fix_BB_dispersion" : False, - "shared_BB_dispersion" : True, - "max_iter" : 30, - "tol" : 1e-4, - "gmm_random_state" : 0, - "np_threshold" : 1.0, - "np_eventminlen" : 10, + "n_states": None, + "params": "smp", + "t": 1 - 1e-5, + "t_phaseing": 1 - 1e-4, + "fix_NB_dispersion": False, + "shared_NB_dispersion": True, + "fix_BB_dispersion": False, + "shared_BB_dispersion": True, + "max_iter": 30, + "tol": 1e-4, + "gmm_random_state": 0, + "np_threshold": 1.0, + "np_eventminlen": 10, # integer copy number - "nonbalance_bafdist" : 1.0, - "nondiploid_rdrdist" : 10.0 + "nonbalance_bafdist": 1.0, + "nondiploid_rdrdist": 10.0, } - argtype_joint = { - "input_filelist" : "str", - "alignment_files" : "list_str" - } - argtype_single = { - "spaceranger_dir" : "str" - } + argtype_joint = {"input_filelist": "str", "alignment_files": "list_str"} + argtype_single = {"spaceranger_dir": "str"} argtype_shared = { - "snp_dir" : "str", - "output_dir" : "str", + "snp_dir": "str", + "output_dir": "str", # supporting files and preprocessing arguments - "geneticmap_file" : "str", - "hgtable_file" : "str", - "normalidx_file" : "str", - "tumorprop_file" : "str", - "supervision_clone_file" : "str", - "filtergenelist_file" : "str", - "filterregion_file" : "str", - "secondary_min_umi" : "int", - "min_snpumi_perspot" : "int", - 'min_percent_expressed_spots' : "float", - "bafonly" : "bool", + "geneticmap_file": "str", + "hgtable_file": "str", + "normalidx_file": "str", + "tumorprop_file": "str", + "supervision_clone_file": "str", + "filtergenelist_file": "str", + "filterregion_file": "str", + "secondary_min_umi": "int", + "min_snpumi_perspot": "int", + "min_percent_expressed_spots": "float", + "bafonly": "bool", # phase 
switch probability - "nu" : "float", - "logphase_shift" : "float", - "npart_phasing" : "int", + "nu": "float", + "logphase_shift": "float", + "npart_phasing": "int", # HMRF configurations - "n_clones" : "int", - "n_clones_rdr" : "int", - "min_spots_per_clone" : "int", - "min_avgumi_per_clone" : "int", - "maxspots_pooling" : "int", - "tumorprop_threshold" : "float", - "max_iter_outer" : "int", - "nodepotential" : "str", - "initialization_method" : "str", - "num_hmrf_initialization_start" : "int", - "num_hmrf_initialization_end" : "int", - "spatial_weight" : "float", - "construct_adjacency_method" : "str", - "construct_adjacency_w" : "float", + "n_clones": "int", + "n_clones_rdr": "int", + "min_spots_per_clone": "int", + "min_avgumi_per_clone": "int", + "maxspots_pooling": "int", + "tumorprop_threshold": "float", + "max_iter_outer": "int", + "nodepotential": "str", + "initialization_method": "str", + "num_hmrf_initialization_start": "int", + "num_hmrf_initialization_end": "int", + "spatial_weight": "float", + "construct_adjacency_method": "str", + "construct_adjacency_w": "float", # HMM configurations - "n_states" : "int", - "params" : "str", - "t" : "eval", - "t_phaseing" : "eval", - "fix_NB_dispersion" : "bool", - "shared_NB_dispersion" : "bool", - "fix_BB_dispersion" : "bool", - "shared_BB_dispersion" : "bool", - "max_iter" : "int", - "tol" : "float", - "gmm_random_state" : "int", - "np_threshold" : "float", - "np_eventminlen" : "int", + "n_states": "int", + "params": "str", + "t": "eval", + "t_phaseing": "eval", + "fix_NB_dispersion": "bool", + "shared_NB_dispersion": "bool", + "fix_BB_dispersion": "bool", + "shared_BB_dispersion": "bool", + "max_iter": "int", + "tol": "float", + "gmm_random_state": "int", + "np_threshold": "float", + "np_eventminlen": "int", # integer copy number - "nonbalance_bafdist" : "float", - "nondiploid_rdrdist" : "float" + "nonbalance_bafdist": "float", + "nondiploid_rdrdist": "float", } - category_names = ["", "# supporting files and preprocessing arguments", "# phase switch probability", "# HMRF configurations", "# HMM configurations", "# integer copy number"] - category_elements = [["input_filelist", "spaceranger_dir", "snp_dir", "output_dir"], \ - ["geneticmap_file", "hgtable_file", "normalidx_file", "tumorprop_file", "alignment_files", "supervision_clone_file", "filtergenelist_file", "filterregion_file", "secondary_min_umi", "min_snpumi_perspot", "min_percent_expressed_spots", "bafonly"], \ - ["nu", "logphase_shift", "npart_phasing"], \ - ["n_clones", "n_clones_rdr", "min_spots_per_clone", "min_avgumi_per_clone", "maxspots_pooling", "tumorprop_threshold", "max_iter_outer", "nodepotential", "initialization_method", "num_hmrf_initialization_start", "num_hmrf_initialization_end", "spatial_weight", "construct_adjacency_method", "construct_adjacency_w"], \ - ["n_states", "params", "t", "t_phaseing", "fix_NB_dispersion", "shared_NB_dispersion", "fix_BB_dispersion", "shared_BB_dispersion", "max_iter", "tol", "gmm_random_state", "np_threshold", "np_eventminlen"], \ - ["nonbalance_bafdist", "nondiploid_rdrdist"]] - return config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, category_names, category_elements + category_names = [ + "", + "# supporting files and preprocessing arguments", + "# phase switch probability", + "# HMRF configurations", + "# HMM configurations", + "# integer copy number", + ] + category_elements = [ + ["input_filelist", "spaceranger_dir", "snp_dir", "output_dir"], + [ + "geneticmap_file", + "hgtable_file", 
+ "normalidx_file", + "tumorprop_file", + "alignment_files", + "supervision_clone_file", + "filtergenelist_file", + "filterregion_file", + "secondary_min_umi", + "min_snpumi_perspot", + "min_percent_expressed_spots", + "bafonly", + ], + ["nu", "logphase_shift", "npart_phasing"], + [ + "n_clones", + "n_clones_rdr", + "min_spots_per_clone", + "min_avgumi_per_clone", + "maxspots_pooling", + "tumorprop_threshold", + "max_iter_outer", + "nodepotential", + "initialization_method", + "num_hmrf_initialization_start", + "num_hmrf_initialization_end", + "spatial_weight", + "construct_adjacency_method", + "construct_adjacency_w", + ], + [ + "n_states", + "params", + "t", + "t_phaseing", + "fix_NB_dispersion", + "shared_NB_dispersion", + "fix_BB_dispersion", + "shared_BB_dispersion", + "max_iter", + "tol", + "gmm_random_state", + "np_threshold", + "np_eventminlen", + ], + ["nonbalance_bafdist", "nondiploid_rdrdist"], + ] + return ( + config_shared, + config_joint, + config_single, + argtype_shared, + argtype_joint, + argtype_single, + category_names, + category_elements, + ) def read_configuration_file(filename): ##### [Default settings] ##### - config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() + ( + config_shared, + config_joint, + config_single, + argtype_shared, + argtype_joint, + argtype_single, + _, + _, + ) = load_default_config() config = {**config_shared, **config_single} argument_type = {**argtype_shared, **argtype_single} ##### [ read configuration file to update settings ] ##### - with open(filename, 'r') as fp: + with open(filename, "r") as fp: for line in fp: if line.strip() == "" or line[0] == "#": continue @@ -153,7 +217,9 @@ def read_configuration_file(filename): # assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" if (not strs[0] in config.keys()) and (not strs[0] in config_joint.keys()): # warning that the argument is not a valid configuration parameter and continue - logger.warning(f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}") + logger.warning( + f"{strs[0]} is not a valid configuration parameter! 
Configuration parameters are: {list(config.keys())}" + ) continue if len(strs) == 1: config[strs[0]] = [] @@ -168,7 +234,7 @@ def read_configuration_file(filename): elif argument_type[strs[0]] == "eval": config[strs[0]] = eval(strs[1]) elif argument_type[strs[0]] == "bool": - config[strs[0]] = (strs[1].upper() == "TRUE") + config[strs[0]] = strs[1].upper() == "TRUE" elif argument_type[strs[0]] == "list_str": config[strs[0]] = strs[1].split(" ") # assertions @@ -181,12 +247,21 @@ def read_configuration_file(filename): def read_joint_configuration_file(filename): ##### [Default settings] ##### - config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() + ( + config_shared, + config_joint, + config_single, + argtype_shared, + argtype_joint, + argtype_single, + _, + _, + ) = load_default_config() config = {**config_shared, **config_joint} argument_type = {**argtype_shared, **argtype_joint} ##### [ read configuration file to update settings ] ##### - with open(filename, 'r') as fp: + with open(filename, "r") as fp: for line in fp: if line.strip() == "" or line[0] == "#": continue @@ -194,7 +269,9 @@ def read_joint_configuration_file(filename): # assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" if (not strs[0] in config.keys()) and (not strs[0] in config_single.keys()): # warning that the argument is not a valid configuration parameter and continue - logger.warning(f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}") + logger.warning( + f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" + ) continue if len(strs) == 1: config[strs[0]] = [] @@ -209,7 +286,7 @@ def read_joint_configuration_file(filename): elif argument_type[strs[0]] == "eval": config[strs[0]] = eval(strs[1]) elif argument_type[strs[0]] == "bool": - config[strs[0]] = (strs[1].upper() == "TRUE") + config[strs[0]] = strs[1].upper() == "TRUE" elif argument_type[strs[0]] == "list_str": config[strs[0]] = strs[1].split(" ") # assertions @@ -221,9 +298,18 @@ def read_joint_configuration_file(filename): def write_config_file(outputfilename, config): - _,_,_, argtype_shared, argtype_joint, argtype_single, category_names, category_elements = load_default_config() + ( + _, + _, + _, + argtype_shared, + argtype_joint, + argtype_single, + category_names, + category_elements, + ) = load_default_config() argument_type = {**argtype_shared, **argtype_joint, **argtype_single} - with open(outputfilename, 'w') as fp: + with open(outputfilename, "w") as fp: for i in range(len(category_names)): fp.write(f"{category_names[i]}\n") for k in category_elements[i]: @@ -236,13 +322,31 @@ def write_config_file(outputfilename, config): def get_default_config_single(): - config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() + ( + config_shared, + config_joint, + config_single, + argtype_shared, + argtype_joint, + argtype_single, + _, + _, + ) = load_default_config() config = {**config_shared, **config_single} return config def get_default_config_joint(): - config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() + ( + config_shared, + config_joint, + config_single, + argtype_shared, + argtype_joint, + argtype_single, + _, + _, + ) = load_default_config() config = {**config_shared, 
**config_joint} return config @@ -259,7 +363,7 @@ def main(argv): for r in range(hmrf_seed_s, hmrf_seed_t): config["num_hmrf_initialization_start"] = r - config["num_hmrf_initialization_end"] = r+1 + config["num_hmrf_initialization_end"] = r + 1 write_config_file(f"{outputdir}/configfile{r}", config) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 8b144df..d64c102 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -8,7 +8,12 @@ import scanpy as sc import anndata import logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) logger = logging.getLogger() import copy from pathlib import Path @@ -40,88 +45,225 @@ def main(configuration_file): # load data ## If the data is loaded for the first time: infer phasing using phase-switch HMM (hmm_NB_BB_phaseswitch.py and phasing.py) -> output initial_phase.npz, matrices in parsed_inputs folder ## If the data is already loaded: load the matrices from parsed_inputs folder - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ - barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config) - + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + df_bininfo, + df_gene_snp, + barcodes, + coords, + single_tumor_prop, + sample_list, + sample_ids, + adjacency_mat, + smooth_mat, + exp_counts, + ) = run_parse_n_load(config) + """ Initial clustering spots using only BAF values. """ # setting transcript count to 0, and baseline so that emission probability calculation will ignore them. 
- copy_single_X_rdr = copy.copy(single_X[:,0,:]) + copy_single_X_rdr = copy.copy(single_X[:, 0, :]) copy_single_base_nb_mean = copy.copy(single_base_nb_mean) - single_X[:,0,:] = 0 - single_base_nb_mean[:,:] = 0 - + single_X[:, 0, :] = 0 + single_base_nb_mean[:, :] = 0 + # run HMRF - for r_hmrf_initialization in range(config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"]): + for r_hmrf_initialization in range( + config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"] + ): outdir = f"{config['output_dir']}/clone{config['n_clones']}_rectangle{r_hmrf_initialization}_w{config['spatial_weight']:.1f}" if config["tumorprop_file"] is None: - initial_clone_index = rectangle_initialize_initial_clone(coords, config["n_clones"], random_state=r_hmrf_initialization) + initial_clone_index = rectangle_initialize_initial_clone( + coords, config["n_clones"], random_state=r_hmrf_initialization + ) else: - initial_clone_index = rectangle_initialize_initial_clone_mix(coords, config["n_clones"], single_tumor_prop, threshold=config["tumorprop_threshold"], random_state=r_hmrf_initialization) + initial_clone_index = rectangle_initialize_initial_clone_mix( + coords, + config["n_clones"], + single_tumor_prop, + threshold=config["tumorprop_threshold"], + random_state=r_hmrf_initialization, + ) # create directory - p = subprocess.Popen(f"mkdir -p {outdir}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - out,err = p.communicate() + p = subprocess.Popen( + f"mkdir -p {outdir}", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + ) + out, err = p.communicate() # save clone initialization into npz file prefix = "allspots" if not Path(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz").exists(): initial_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"num_iterations":0, "round-1_assignment":initial_assignment} + allres = {"num_iterations": 0, "round-1_assignment": initial_assignment} np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz", **allres) # run HMRF + HMM # store the results of each iteration of HMRF in a npz file outdir/prefix_nstates{config['n_states']}_sp.npz # if a specific iteration is computed, hmrf will directly load the results from the file if config["tumorprop_file"] is None: - hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"]) + hmrf_concatenate_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat, + adjacency_mat=adjacency_mat, + sample_ids=sample_ids, + max_iter_outer=config["max_iter_outer"], + 
nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="sp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + ) else: - hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"], tumorprop_threshold=config["tumorprop_threshold"]) - + hmrfmix_concatenate_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat, + adjacency_mat=adjacency_mat, + sample_ids=sample_ids, + max_iter_outer=config["max_iter_outer"], + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="sp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + tumorprop_threshold=config["tumorprop_threshold"], + ) + # merge by thresholding BAF profile similarity - res = load_hmrf_last_iteration(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz") + res = load_hmrf_last_iteration( + f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz" + ) n_obs = single_X.shape[0] if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"]))]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"]))], single_tumor_prop, threshold=config["tumorprop_threshold"]) - tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ], + single_tumor_prop, 
+ threshold=config["tumorprop_threshold"], + ) + tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) # merge "similar" clones from the initial number of clones. # "similar" defined by Neyman Pearson statistics/ Likelihood ratios P(clone A counts | BAF parameters for clone A) / P(clone A counts | BAF parameters for clone B) - merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, threshold=config["np_threshold"], minlength=config["np_eventminlen"], params="sp", tumor_prop=tumor_prop, hmmclass=hmm_nophasing_v2) + merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson( + X, + base_nb_mean, + total_bb_RD, + res, + threshold=config["np_threshold"], + minlength=config["np_eventminlen"], + params="sp", + tumor_prop=tumor_prop, + hmmclass=hmm_nophasing_v2, + ) print(f"BAF clone merging after comparing similarity: {merging_groups}") # if config["tumorprop_file"] is None: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD, min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD, + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] * n_obs, + ) else: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD, min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs, single_tumor_prop=single_tumor_prop, threshold=config["tumorprop_threshold"]) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD, + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] * n_obs, + single_tumor_prop=single_tumor_prop, + threshold=config["tumorprop_threshold"], + ) print(f"BAF clone merging after requiring minimum # spots: {merging_groups}") n_baf_clones = len(merging_groups) - np.savez(f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", **merged_res) + np.savez( + f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", **merged_res + ) # load merged results n_obs = single_X.shape[0] - merged_res = dict(np.load(f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", allow_pickle=True)) + merged_res = dict( + np.load( + f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", + allow_pickle=True, + ) + ) merged_baf_assignment = copy.copy(merged_res["new_assignment"]) n_baf_clones = len(np.unique(merged_baf_assignment)) pred = np.argmax(merged_res["log_gamma"], axis=0) - pred = np.array([ pred[(c*n_obs):(c*n_obs+n_obs)] for c in range(n_baf_clones) ]) - merged_baf_profiles = np.array([ np.where(pred[c,:] < config["n_states"], merged_res["new_p_binom"][pred[c,:]%config["n_states"], 0], 1-merged_res["new_p_binom"][pred[c,:]%config["n_states"], 0]) \ - for c in range(n_baf_clones) ]) - + pred = np.array( + [pred[(c * n_obs) : (c * n_obs + n_obs)] for c in range(n_baf_clones)] + ) + merged_baf_profiles = np.array( + [ + np.where( + pred[c, :] < config["n_states"], + merged_res["new_p_binom"][pred[c, :] % config["n_states"], 0], + 1 - merged_res["new_p_binom"][pred[c, :] % config["n_states"], 0], + ) + for c in range(n_baf_clones) + ] + ) + """ Refined clustering using BAF and RDR values. 
""" @@ -129,202 +271,600 @@ def main(configuration_file): if not config["bafonly"]: # Only used when assuming each spot is pure normal or tumor and if we don't know which spots are normal spots. # select normal spots - if (config["normalidx_file"] is None) and (config["tumorprop_file"] is None): + if (config["normalidx_file"] is None) and ( + config["tumorprop_file"] is None + ): EPS_BAF = 0.05 PERCENT_NORMAL = 40 vec_stds = np.std(np.log1p(copy_single_X_rdr @ smooth_mat), axis=0) - id_nearnormal_clone = np.argmin(np.sum( np.maximum(np.abs(merged_baf_profiles - 0.5)-EPS_BAF, 0), axis=1)) + id_nearnormal_clone = np.argmin( + np.sum( + np.maximum(np.abs(merged_baf_profiles - 0.5) - EPS_BAF, 0), + axis=1, + ) + ) while True: - stdthreshold = np.percentile(vec_stds[merged_res["new_assignment"] == id_nearnormal_clone], PERCENT_NORMAL) - normal_candidate = (vec_stds < stdthreshold) & (merged_res["new_assignment"] == id_nearnormal_clone) - if np.sum(copy_single_X_rdr[:, (normal_candidate==True)]) > single_X.shape[0] * 200 or PERCENT_NORMAL == 100: + stdthreshold = np.percentile( + vec_stds[merged_res["new_assignment"] == id_nearnormal_clone], + PERCENT_NORMAL, + ) + normal_candidate = (vec_stds < stdthreshold) & ( + merged_res["new_assignment"] == id_nearnormal_clone + ) + if ( + np.sum(copy_single_X_rdr[:, (normal_candidate == True)]) + > single_X.shape[0] * 200 + or PERCENT_NORMAL == 100 + ): break PERCENT_NORMAL += 10 - pd.Series(barcodes[normal_candidate==True].index).to_csv(f"{outdir}/normal_candidate_barcodes.txt", header=False, index=False) + pd.Series(barcodes[normal_candidate == True].index).to_csv( + f"{outdir}/normal_candidate_barcodes.txt", header=False, index=False + ) - elif (not config["normalidx_file"] is None): + elif not config["normalidx_file"] is None: # single_base_nb_mean has already been added in loading data step. if not config["tumorprop_file"] is None: - logger.warning(f"Mixed sources of information for normal spots! Using {config['normalidx_file']}") - + logger.warning( + f"Mixed sources of information for normal spots! Using {config['normalidx_file']}" + ) + # If tumor purity is provided, we can use it to select normal spots. 
else: for prop_threshold in np.arange(0.05, 0.6, 0.05): - normal_candidate = (single_tumor_prop < prop_threshold) - if np.sum(copy_single_X_rdr[:, (normal_candidate==True)]) > single_X.shape[0] * 200: + normal_candidate = single_tumor_prop < prop_threshold + if ( + np.sum(copy_single_X_rdr[:, (normal_candidate == True)]) + > single_X.shape[0] * 200 + ): break # To avoid allele-specific expression that are not relevant to CNA, filter bins where normal pseudobulk has large |BAF - 0.5| index_normal = np.where(normal_candidate)[0] - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \ - single_X, single_base_nb_mean, single_total_bb_RD, config['nu'], config['logphase_shift'], index_normal, config['geneticmap_file']) + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + df_gene_snp, + ) = bin_selection_basedon_normal( + df_gene_snp, + single_X, + single_base_nb_mean, + single_total_bb_RD, + config["nu"], + config["logphase_shift"], + index_normal, + config["geneticmap_file"], + ) assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0] df_bininfo = genesnp_to_bininfo(df_gene_snp) - copy_single_X_rdr = copy.copy(single_X[:,0,:]) + copy_single_X_rdr = copy.copy(single_X[:, 0, :]) # If a gene has way higher expression than adjacent genes, its transcript count will dominate RDR values # To avoid the domination, filter out high-UMI DE genes, which may bias RDR estimates # Assume the remaining genes will still carry the CNA info. - copy_single_X_rdr, _ = filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=sample_list, sample_ids=sample_ids) + copy_single_X_rdr, _ = filter_de_genes_tri( + exp_counts, + df_bininfo, + normal_candidate, + sample_list=sample_list, + sample_ids=sample_ids, + ) MIN_NORMAL_COUNT_PERBIN = 20 - bidx_inconfident = np.where( np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) < MIN_NORMAL_COUNT_PERBIN )[0] - rdr_normal = np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) + bidx_inconfident = np.where( + np.sum(copy_single_X_rdr[:, (normal_candidate == True)], axis=1) + < MIN_NORMAL_COUNT_PERBIN + )[0] + rdr_normal = np.sum( + copy_single_X_rdr[:, (normal_candidate == True)], axis=1 + ) rdr_normal[bidx_inconfident] = 0 rdr_normal = rdr_normal / np.sum(rdr_normal) - copy_single_X_rdr[bidx_inconfident, :] = 0 # avoid ill-defined distributions if normal has 0 count in that bin. - copy_single_base_nb_mean = rdr_normal.reshape(-1,1) @ np.sum(copy_single_X_rdr, axis=0).reshape(1,-1) - + copy_single_X_rdr[bidx_inconfident, :] = ( + 0 # avoid ill-defined distributions if normal has 0 count in that bin. 
+ ) + copy_single_base_nb_mean = rdr_normal.reshape(-1, 1) @ np.sum( + copy_single_X_rdr, axis=0 + ).reshape(1, -1) + # adding back RDR signal - single_X[:,0,:] = copy_single_X_rdr + single_X[:, 0, :] = copy_single_X_rdr single_base_nb_mean = copy_single_base_nb_mean n_obs = single_X.shape[0] # save binned data - np.savez(f"{outdir}/binned_data.npz", lengths=lengths, single_X=single_X, single_base_nb_mean=single_base_nb_mean, single_total_bb_RD=single_total_bb_RD, log_sitewise_transmat=log_sitewise_transmat, single_tumor_prop=(None if config["tumorprop_file"] is None else single_tumor_prop)) + np.savez( + f"{outdir}/binned_data.npz", + lengths=lengths, + single_X=single_X, + single_base_nb_mean=single_base_nb_mean, + single_total_bb_RD=single_total_bb_RD, + log_sitewise_transmat=log_sitewise_transmat, + single_tumor_prop=( + None if config["tumorprop_file"] is None else single_tumor_prop + ), + ) # run HMRF on each clone individually to further split BAF clone by RDR+BAF signal for bafc in range(n_baf_clones): prefix = f"clone{bafc}" idx_spots = np.where(merged_baf_assignment == bafc)[0] - if np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20: # put a minimum B allele read count on pseudobulk to split clones + if ( + np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20 + ): # put a minimum B allele read count on pseudobulk to split clones continue # initialize clone # write the initialization in a npz file outdir/prefix_nstates{config['n_states']}_smp.npz if config["tumorprop_file"] is None: - initial_clone_index = rectangle_initialize_initial_clone(coords[idx_spots], config['n_clones_rdr'], random_state=r_hmrf_initialization) + initial_clone_index = rectangle_initialize_initial_clone( + coords[idx_spots], + config["n_clones_rdr"], + random_state=r_hmrf_initialization, + ) else: - initial_clone_index = rectangle_initialize_initial_clone_mix(coords[idx_spots], config['n_clones_rdr'], single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"], random_state=r_hmrf_initialization) - if not Path(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz").exists(): + initial_clone_index = rectangle_initialize_initial_clone_mix( + coords[idx_spots], + config["n_clones_rdr"], + single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + random_state=r_hmrf_initialization, + ) + if not Path( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz" + ).exists(): initial_assignment = np.zeros(len(idx_spots), dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"barcodes":barcodes[idx_spots], "num_iterations":0, "round-1_assignment":initial_assignment} - np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", **allres) - + allres = { + "barcodes": barcodes[idx_spots], + "num_iterations": 0, + "round-1_assignment": initial_assignment, + } + np.savez( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", + **allres, + ) + # HMRF + HMM using RDR information copy_slice_sample_ids = copy.copy(sample_ids[idx_spots]) if config["tumorprop_file"] is None: - hmrf_concatenate_pipeline(outdir, prefix, single_X[:,:,idx_spots], lengths, single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat[np.ix_(idx_spots,idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots,idx_spots)], sample_ids=copy_slice_sample_ids, 
max_iter_outer=10, nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"]) + hmrf_concatenate_pipeline( + outdir, + prefix, + single_X[:, :, idx_spots], + lengths, + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], + adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], + sample_ids=copy_slice_sample_ids, + max_iter_outer=10, + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + ) else: - hmrfmix_concatenate_pipeline(outdir, prefix, single_X[:,:,idx_spots], lengths, single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], single_tumor_prop[idx_spots], initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat[np.ix_(idx_spots,idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots,idx_spots)], sample_ids=copy_slice_sample_ids, max_iter_outer=10, nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"], tumorprop_threshold=config["tumorprop_threshold"]) + hmrfmix_concatenate_pipeline( + outdir, + prefix, + single_X[:, :, idx_spots], + lengths, + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + single_tumor_prop[idx_spots], + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], + adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], + sample_ids=copy_slice_sample_ids, + max_iter_outer=10, + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + tumorprop_threshold=config["tumorprop_threshold"], + ) ##### combine results across clones ##### - res_combine = {"prev_assignment":np.zeros(single_X.shape[2], dtype=int)} + res_combine = {"prev_assignment": np.zeros(single_X.shape[2], dtype=int)} offset_clone = 0 for bafc in 
range(n_baf_clones): prefix = f"clone{bafc}" - allres = dict( np.load(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", allow_pickle=True) ) + allres = dict( + np.load( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) r = allres["num_iterations"] - 1 - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} - idx_spots = np.where(barcodes.isin( allres["barcodes"] ))[0] + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } + idx_spots = np.where(barcodes.isin(allres["barcodes"]))[0] if len(np.unique(res["new_assignment"])) == 1: n_merged_clones = 1 c = res["new_assignment"][0] merged_res = copy.copy(res) merged_res["new_assignment"] = np.zeros(len(idx_spots), dtype=int) try: - log_gamma = res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)].reshape((2*config["n_states"], n_obs, 1)) + log_gamma = res["log_gamma"][ + :, (c * n_obs) : (c * n_obs + n_obs) + ].reshape((2 * config["n_states"], n_obs, 1)) except: - log_gamma = res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)].reshape((config["n_states"], n_obs, 1)) - pred_cnv = res["pred_cnv"][ (c*n_obs):(c*n_obs+n_obs) ].reshape((-1,1)) + log_gamma = res["log_gamma"][ + :, (c * n_obs) : (c * n_obs + n_obs) + ].reshape((config["n_states"], n_obs, 1)) + pred_cnv = res["pred_cnv"][ + (c * n_obs) : (c * n_obs + n_obs) + ].reshape((-1, 1)) else: if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"])) ]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"])) ], single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"]) - tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) - merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, threshold=config["np_threshold"], 
minlength=config["np_eventminlen"], params="smp", tumor_prop=tumor_prop, hmmclass=hmm_nophasing_v2) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ], + single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + ) + ) + tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + merging_groups, merged_res = ( + similarity_components_rdrbaf_neymanpearson( + X, + base_nb_mean, + total_bb_RD, + res, + threshold=config["np_threshold"], + minlength=config["np_eventminlen"], + params="smp", + tumor_prop=tumor_prop, + hmmclass=hmm_nophasing_v2, + ) + ) print(f"part {bafc} merging_groups: {merging_groups}") # if config["tumorprop_file"] is None: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD[:,idx_spots], min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD[:, idx_spots], + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] + * n_obs, + ) else: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD[:,idx_spots], min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs, single_tumor_prop=single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"]) - print(f"part {bafc} merging after requiring minimum # spots: {merging_groups}") + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD[:, idx_spots], + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] + * n_obs, + single_tumor_prop=single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + ) + print( + f"part {bafc} merging after requiring minimum # spots: {merging_groups}" + ) # compute posterior using the newly merged pseudobulk n_merged_clones = len(merging_groups) tmp = copy.copy(merged_res["new_assignment"]) if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(merged_res["new_assignment"]==c)[0] for c in range(n_merged_clones)]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(merged_res["new_assignment"] == c)[0] + for c in range(n_merged_clones) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(merged_res["new_assignment"]==c)[0] for c in range(n_merged_clones)], single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"]) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(merged_res["new_assignment"] == c)[0] + for c in range(n_merged_clones) + ], + 
single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + ) + ) # - merged_res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), config["n_states"], \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) if not tumor_prop is None else None, \ - hmmclass=hmm_nophasing_v2, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, init_log_mu=res["new_log_mu"], init_p_binom=res["new_p_binom"], init_alphas=res["new_alphas"], init_taus=res["new_taus"], max_iter=config["max_iter"], tol=config["tol"], lambd=np.sum(base_nb_mean,axis=1)/np.sum(base_nb_mean), sample_length=np.ones(X.shape[2],dtype=int)*X.shape[0]) + merged_res = pipeline_baum_welch( + None, + np.vstack( + [X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")] + ).T.reshape(-1, 2, 1), + np.tile(lengths, X.shape[2]), + config["n_states"], + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + np.tile(log_sitewise_transmat, X.shape[2]), + ( + np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + if not tumor_prop is None + else None + ), + hmmclass=hmm_nophasing_v2, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + init_log_mu=res["new_log_mu"], + init_p_binom=res["new_p_binom"], + init_alphas=res["new_alphas"], + init_taus=res["new_taus"], + max_iter=config["max_iter"], + tol=config["tol"], + lambd=np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean), + sample_length=np.ones(X.shape[2], dtype=int) * X.shape[0], + ) merged_res["new_assignment"] = copy.copy(tmp) - merged_res = combine_similar_states_across_clones(X, base_nb_mean, total_bb_RD, merged_res, params="smp", tumor_prop=np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) if not tumor_prop is None else None, hmmclass=hmm_nophasing_v2, merge_threshold=0.1) - log_gamma = np.stack([ merged_res["log_gamma"][:,(c*n_obs):(c*n_obs+n_obs)] for c in range(n_merged_clones) ], axis=-1) - pred_cnv = np.vstack([ merged_res["pred_cnv"][(c*n_obs):(c*n_obs+n_obs)] for c in range(n_merged_clones) ]).T + merged_res = combine_similar_states_across_clones( + X, + base_nb_mean, + total_bb_RD, + merged_res, + params="smp", + tumor_prop=( + np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + if not tumor_prop is None + else None + ), + hmmclass=hmm_nophasing_v2, + merge_threshold=0.1, + ) + log_gamma = np.stack( + [ + merged_res["log_gamma"][ + :, (c * n_obs) : (c * n_obs + n_obs) + ] + for c in range(n_merged_clones) + ], + axis=-1, + ) + pred_cnv = np.vstack( + [ + merged_res["pred_cnv"][(c * n_obs) : (c * n_obs + n_obs)] + for c in range(n_merged_clones) + ] + ).T # # add to res_combine if len(res_combine) == 1: - res_combine.update({"new_log_mu":np.hstack([ merged_res["new_log_mu"] ] * n_merged_clones), "new_alphas":np.hstack([ merged_res["new_alphas"] ] * n_merged_clones), \ - "new_p_binom":np.hstack([ merged_res["new_p_binom"] ] * n_merged_clones), "new_taus":np.hstack([ 
merged_res["new_taus"] ] * n_merged_clones), \ - "log_gamma":log_gamma, "pred_cnv":pred_cnv}) + res_combine.update( + { + "new_log_mu": np.hstack( + [merged_res["new_log_mu"]] * n_merged_clones + ), + "new_alphas": np.hstack( + [merged_res["new_alphas"]] * n_merged_clones + ), + "new_p_binom": np.hstack( + [merged_res["new_p_binom"]] * n_merged_clones + ), + "new_taus": np.hstack( + [merged_res["new_taus"]] * n_merged_clones + ), + "log_gamma": log_gamma, + "pred_cnv": pred_cnv, + } + ) else: - res_combine.update({"new_log_mu":np.hstack([res_combine["new_log_mu"]] + [ merged_res["new_log_mu"] ] * n_merged_clones), "new_alphas":np.hstack([res_combine["new_alphas"]] + [ merged_res["new_alphas"] ] * n_merged_clones), \ - "new_p_binom":np.hstack([res_combine["new_p_binom"]] + [ merged_res["new_p_binom"] ] * n_merged_clones), "new_taus":np.hstack([res_combine["new_taus"]] + [ merged_res["new_taus"] ] * n_merged_clones), \ - "log_gamma":np.dstack([res_combine["log_gamma"], log_gamma ]), "pred_cnv":np.hstack([res_combine["pred_cnv"], pred_cnv])}) - res_combine["prev_assignment"][idx_spots] = merged_res["new_assignment"] + offset_clone + res_combine.update( + { + "new_log_mu": np.hstack( + [res_combine["new_log_mu"]] + + [merged_res["new_log_mu"]] * n_merged_clones + ), + "new_alphas": np.hstack( + [res_combine["new_alphas"]] + + [merged_res["new_alphas"]] * n_merged_clones + ), + "new_p_binom": np.hstack( + [res_combine["new_p_binom"]] + + [merged_res["new_p_binom"]] * n_merged_clones + ), + "new_taus": np.hstack( + [res_combine["new_taus"]] + + [merged_res["new_taus"]] * n_merged_clones + ), + "log_gamma": np.dstack( + [res_combine["log_gamma"], log_gamma] + ), + "pred_cnv": np.hstack([res_combine["pred_cnv"], pred_cnv]), + } + ) + res_combine["prev_assignment"][idx_spots] = ( + merged_res["new_assignment"] + offset_clone + ) offset_clone += n_merged_clones # temp: make dispersions the same across all clones - res_combine["new_alphas"][:,:] = np.max(res_combine["new_alphas"]) - res_combine["new_taus"][:,:] = np.min(res_combine["new_taus"]) + res_combine["new_alphas"][:, :] = np.max(res_combine["new_alphas"]) + res_combine["new_taus"][:, :] = np.min(res_combine["new_taus"]) # end temp n_final_clones = len(np.unique(res_combine["prev_assignment"])) # per-sample weights across clones log_persample_weights = np.zeros((n_final_clones, len(sample_list))) for sidx in range(len(sample_list)): index = np.where(sample_ids == sidx)[0] - this_persample_weight = np.bincount(res_combine["prev_assignment"][index], minlength=n_final_clones) / len(index) - log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) - log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) + this_persample_weight = np.bincount( + res_combine["prev_assignment"][index], minlength=n_final_clones + ) / len(index) + log_persample_weights[:, sidx] = np.where( + this_persample_weight > 0, np.log(this_persample_weight), -50 + ) + log_persample_weights[:, sidx] = log_persample_weights[ + :, sidx + ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) # final re-assignment across all clones using estimated RDR + BAF # The following step may not be needed because of other improvements. And it may cause mistakes in some cases. 
if config["tumorprop_file"] is None: if config["nodepotential"] == "max": - pred = np.vstack([ np.argmax(res_combine["log_gamma"][:,:,c], axis=0) for c in range(res_combine["log_gamma"].shape[2]) ]).T - new_assignment, single_llf, total_llf, posterior = aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, res_combine, pred, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + pred = np.vstack( + [ + np.argmax(res_combine["log_gamma"][:, :, c], axis=0) + for c in range(res_combine["log_gamma"].shape[2]) + ] + ).T + new_assignment, single_llf, total_llf, posterior = ( + aggr_hmrf_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res_combine, + pred, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) elif config["nodepotential"] == "weighted_sum": - new_assignment, single_llf, total_llf, posterior = hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, res_combine, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + new_assignment, single_llf, total_llf, posterior = ( + hmrf_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res_combine, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) else: if config["nodepotential"] == "max": - pred = np.vstack([ np.argmax(res_combine["log_gamma"][:,:,c], axis=0) for c in range(res_combine["log_gamma"].shape[2]) ]).T - new_assignment, single_llf, total_llf, posterior = aggr_hmrfmix_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res_combine, pred, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + pred = np.vstack( + [ + np.argmax(res_combine["log_gamma"][:, :, c], axis=0) + for c in range(res_combine["log_gamma"].shape[2]) + ] + ).T + new_assignment, single_llf, total_llf, posterior = ( + aggr_hmrfmix_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res_combine, + pred, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) elif config["nodepotential"] == "weighted_sum": - new_assignment, single_llf, total_llf, posterior = hmrfmix_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res_combine, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + new_assignment, single_llf, total_llf, posterior = ( + hmrfmix_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res_combine, + smooth_mat, + adjacency_mat, + 
res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) res_combine["total_llf"] = total_llf res_combine["new_assignment"] = new_assignment # re-order clones such that normal clones are always clone 0 - res_combine, posterior = reorder_results(res_combine, posterior, single_tumor_prop) + res_combine, posterior = reorder_results( + res_combine, posterior, single_tumor_prop + ) # save results - np.savez(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", **res_combine) + np.savez( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + **res_combine, + ) np.save(f"{outdir}/posterior_clone_probability.npy", posterior) - + ##### infer integer copy ##### - res_combine = dict(np.load(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True)) + res_combine = dict( + np.load( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) final_clone_ids = np.sort(np.unique(res_combine["new_assignment"])) nonempty_clone_ids = copy.copy(final_clone_ids) # add clone 0 as normal clone if it doesn't appear in final_clone_ids @@ -332,7 +872,7 @@ def main(configuration_file): final_clone_ids = np.append(0, final_clone_ids) # chr position medfix = ["", "_diploid", "_triploid", "_tetraploid"] - for o,max_medploidy in enumerate([None, 2, 3, 4]): + for o, max_medploidy in enumerate([None, 2, 3, 4]): # A/B copy number per bin allele_specific_copy = [] # A/B copy number per state @@ -340,61 +880,204 @@ def main(configuration_file): df_genelevel_cnv = None if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res_combine["new_assignment"]==cid)[0] for cid in final_clone_ids]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res_combine["new_assignment"] == cid)[0] + for cid in final_clone_ids + ], + ) else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res_combine["new_assignment"]==cid)[0] for cid in final_clone_ids], single_tumor_prop, threshold=config["tumorprop_threshold"]) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res_combine["new_assignment"] == cid)[0] + for cid in final_clone_ids + ], + single_tumor_prop, + threshold=config["tumorprop_threshold"], + ) + ) for s, cid in enumerate(final_clone_ids): - if np.sum(base_nb_mean[:,s]) == 0: + if np.sum(base_nb_mean[:, s]) == 0: continue # adjust log_mu such that sum_bin lambda * np.exp(log_mu) = 1 - lambd = base_nb_mean[:,s] / np.sum(base_nb_mean[:,s]) - this_pred_cnv = res_combine["pred_cnv"][:,s] - adjusted_log_mu = np.log( np.exp(res_combine["new_log_mu"][:,s]) / np.sum(np.exp(res_combine["new_log_mu"][this_pred_cnv,s]) * lambd) ) + lambd = base_nb_mean[:, s] / np.sum(base_nb_mean[:, s]) + this_pred_cnv = res_combine["pred_cnv"][:, s] + adjusted_log_mu = np.log( + np.exp(res_combine["new_log_mu"][:, s]) + / np.sum( + np.exp(res_combine["new_log_mu"][this_pred_cnv, s]) * lambd + ) + ) if not max_medploidy is None: - best_integer_copies, _ = hill_climbing_integer_copynumber_oneclone(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, 
max_medploidy=max_medploidy) + best_integer_copies, _ = ( + hill_climbing_integer_copynumber_oneclone( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + max_medploidy=max_medploidy, + ) + ) else: try: - best_integer_copies, _ = hill_climbing_integer_copynumber_fixdiploid(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, nonbalance_bafdist=config["nonbalance_bafdist"], nondiploid_rdrdist=config["nondiploid_rdrdist"]) + best_integer_copies, _ = ( + hill_climbing_integer_copynumber_fixdiploid( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + nonbalance_bafdist=config["nonbalance_bafdist"], + nondiploid_rdrdist=config["nondiploid_rdrdist"], + ) + ) except: try: - best_integer_copies, _ = hill_climbing_integer_copynumber_fixdiploid(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, nonbalance_bafdist=config["nonbalance_bafdist"], nondiploid_rdrdist=config["nondiploid_rdrdist"], min_prop_threshold=0.02) + best_integer_copies, _ = ( + hill_climbing_integer_copynumber_fixdiploid( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + nonbalance_bafdist=config["nonbalance_bafdist"], + nondiploid_rdrdist=config["nondiploid_rdrdist"], + min_prop_threshold=0.02, + ) + ) except: finding_distate_failed = True continue - print(f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}") + print( + f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}" + ) # - allele_specific_copy.append( pd.DataFrame( best_integer_copies[res_combine["pred_cnv"][:,s], 0].reshape(1,-1), index=[f"clone{cid} A"], columns=np.arange(n_obs) ) ) - allele_specific_copy.append( pd.DataFrame( best_integer_copies[res_combine["pred_cnv"][:,s], 1].reshape(1,-1), index=[f"clone{cid} B"], columns=np.arange(n_obs) ) ) + allele_specific_copy.append( + pd.DataFrame( + best_integer_copies[ + res_combine["pred_cnv"][:, s], 0 + ].reshape(1, -1), + index=[f"clone{cid} A"], + columns=np.arange(n_obs), + ) + ) + allele_specific_copy.append( + pd.DataFrame( + best_integer_copies[ + res_combine["pred_cnv"][:, s], 1 + ].reshape(1, -1), + index=[f"clone{cid} B"], + columns=np.arange(n_obs), + ) + ) # - state_cnv.append( pd.DataFrame( res_combine["new_log_mu"][:,s].reshape(-1,1), columns=[f"clone{cid} logmu"], index=np.arange(config['n_states']) ) ) - state_cnv.append( pd.DataFrame( res_combine["new_p_binom"][:,s].reshape(-1,1), columns=[f"clone{cid} p"], index=np.arange(config['n_states']) ) ) - state_cnv.append( pd.DataFrame( best_integer_copies[:,0].reshape(-1,1), columns=[f"clone{cid} A"], index=np.arange(config['n_states']) ) ) - state_cnv.append( pd.DataFrame( best_integer_copies[:,1].reshape(-1,1), columns=[f"clone{cid} B"], index=np.arange(config['n_states']) ) ) + state_cnv.append( + pd.DataFrame( + res_combine["new_log_mu"][:, s].reshape(-1, 1), + columns=[f"clone{cid} logmu"], + index=np.arange(config["n_states"]), + ) + ) + state_cnv.append( + pd.DataFrame( + res_combine["new_p_binom"][:, s].reshape(-1, 1), + columns=[f"clone{cid} p"], + index=np.arange(config["n_states"]), + ) + ) + state_cnv.append( + pd.DataFrame( + best_integer_copies[:, 0].reshape(-1, 1), + columns=[f"clone{cid} A"], + index=np.arange(config["n_states"]), + ) + ) + state_cnv.append( + pd.DataFrame( + best_integer_copies[:, 1].reshape(-1, 1), + columns=[f"clone{cid} B"], + index=np.arange(config["n_states"]), + ) + ) # # 
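A small sketch of the two bookkeeping steps above, with toy arrays standing in for the HMM outputs: rescaling `log_mu` so that the baseline-weighted RDR sums to one over bins, and expanding per-state integer (A, B) copies to per-bin copies by indexing with the predicted state sequence. The array values are illustrative only:

    import numpy as np

    n_states, n_bins = 3, 5
    log_mu = np.array([-0.2, 0.0, 0.4])        # per-state log RDR (toy)
    pred_cnv = np.array([0, 1, 1, 2, 1])       # HMM state per bin (toy)
    lambd = np.full(n_bins, 1.0 / n_bins)      # baseline expression proportion per bin

    # rescale so that sum_bin lambd * exp(adjusted_log_mu[state of bin]) == 1
    adjusted_log_mu = np.log(np.exp(log_mu) / np.sum(np.exp(log_mu[pred_cnv]) * lambd))
    assert np.isclose(np.sum(lambd * np.exp(adjusted_log_mu[pred_cnv])), 1.0)

    # expand per-state integer A/B copies to per-bin copies via the state sequence
    best_integer_copies = np.array([[1, 1], [2, 1], [3, 1]])   # toy A/B copies per state
    per_bin_A = best_integer_copies[pred_cnv, 0]
    per_bin_B = best_integer_copies[pred_cnv, 1]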
tmpdf = get_genelevel_cnv_oneclone(best_integer_copies[res_combine["pred_cnv"][:,s], 0], best_integer_copies[res_combine["pred_cnv"][:,s], 1], x_gene_list) # tmpdf.columns = [f"clone{s} A", f"clone{s} B"] - bin_Acopy_mappers = {i:x for i,x in enumerate(best_integer_copies[res_combine["pred_cnv"][:,s], 0])} - bin_Bcopy_mappers = {i:x for i,x in enumerate(best_integer_copies[res_combine["pred_cnv"][:,s], 1])} - tmpdf = pd.DataFrame({"gene":df_gene_snp[df_gene_snp.is_interval].gene, f"clone{s} A":df_gene_snp[df_gene_snp.is_interval]['bin_id'].map(bin_Acopy_mappers), \ - f"clone{s} B":df_gene_snp[df_gene_snp.is_interval]['bin_id'].map(bin_Bcopy_mappers)}).set_index('gene') + bin_Acopy_mappers = { + i: x + for i, x in enumerate( + best_integer_copies[res_combine["pred_cnv"][:, s], 0] + ) + } + bin_Bcopy_mappers = { + i: x + for i, x in enumerate( + best_integer_copies[res_combine["pred_cnv"][:, s], 1] + ) + } + tmpdf = pd.DataFrame( + { + "gene": df_gene_snp[df_gene_snp.is_interval].gene, + f"clone{s} A": df_gene_snp[df_gene_snp.is_interval][ + "bin_id" + ].map(bin_Acopy_mappers), + f"clone{s} B": df_gene_snp[df_gene_snp.is_interval][ + "bin_id" + ].map(bin_Bcopy_mappers), + } + ).set_index("gene") if df_genelevel_cnv is None: - df_genelevel_cnv = copy.copy( tmpdf[~tmpdf[f"clone{s} A"].isnull()].astype(int) ) + df_genelevel_cnv = copy.copy( + tmpdf[~tmpdf[f"clone{s} A"].isnull()].astype(int) + ) else: - df_genelevel_cnv = df_genelevel_cnv.join( tmpdf[~tmpdf[f"clone{s} A"].isnull()].astype(int) ) + df_genelevel_cnv = df_genelevel_cnv.join( + tmpdf[~tmpdf[f"clone{s} A"].isnull()].astype(int) + ) if len(state_cnv) == 0: continue # output gene-level copy number - df_genelevel_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_genelevel.tsv", header=True, index=True, sep="\t") + df_genelevel_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_genelevel.tsv", + header=True, + index=True, + sep="\t", + ) # output segment-level copy number allele_specific_copy = pd.concat(allele_specific_copy) - df_seglevel_cnv = pd.DataFrame({"CHR":df_bininfo.CHR.values, "START":df_bininfo.START.values, "END":df_bininfo.END.values }) - df_seglevel_cnv = df_seglevel_cnv.join( allele_specific_copy.T ) - df_seglevel_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_seglevel.tsv", header=True, index=False, sep="\t") + df_seglevel_cnv = pd.DataFrame( + { + "CHR": df_bininfo.CHR.values, + "START": df_bininfo.START.values, + "END": df_bininfo.END.values, + } + ) + df_seglevel_cnv = df_seglevel_cnv.join(allele_specific_copy.T) + df_seglevel_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_seglevel.tsv", + header=True, + index=False, + sep="\t", + ) # output per-state copy number - state_cnv = functools.reduce(lambda left,right: pd.merge(left,right, left_index=True, right_index=True, how='inner'), state_cnv) - state_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_perstate.tsv", header=True, index=False, sep="\t") + state_cnv = functools.reduce( + lambda left, right: pd.merge( + left, right, left_index=True, right_index=True, how="inner" + ), + state_cnv, + ) + state_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_perstate.tsv", + header=True, + index=False, + sep="\t", + ) # # # # posterior using integer-copy numbers # log_persample_weights = np.zeros((len(nonempty_clone_ids), len(sample_list))) @@ -407,12 +1090,16 @@ def main(configuration_file): # df_posterior = clonelabel_posterior_withinteger(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, state_cnv, res_combine, pred, \ # smooth_mat, adjacency_mat, res_combine["new_assignment"], sample_ids, base_nb_mean, 
log_persample_weights, config["spatial_weight"], hmmclass=hmm_nophasing_v2) # df_posterior.to_pickle(f"{outdir}/posterior{medfix[o]}.pkl") - + ##### output clone label ##### - df_clone_label = pd.DataFrame({"clone_label":res_combine["new_assignment"]}, index=barcodes) + df_clone_label = pd.DataFrame( + {"clone_label": res_combine["new_assignment"]}, index=barcodes + ) if not config["tumorprop_file"] is None: df_clone_label["tumor_proportion"] = single_tumor_prop - df_clone_label.to_csv(f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t") + df_clone_label.to_csv( + f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t" + ) ##### plotting ##### # make a directory for plots @@ -421,48 +1108,158 @@ def main(configuration_file): # plot RDR and BAF cn_file = f"{outdir}/cnv_diploid_seglevel.tsv" - fig = plot_rdr_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, remove_xticks=True, rdr_ylim=5, chrtext_shift=-0.3, base_height=3.2, pointsize=30, palette="tab10") - fig.savefig(f"{outdir}/plots/rdr_baf_defaultcolor.pdf", transparent=True, bbox_inches="tight") + fig = plot_rdr_baf( + configuration_file, + r_hmrf_initialization, + cn_file, + clone_ids=None, + remove_xticks=True, + rdr_ylim=5, + chrtext_shift=-0.3, + base_height=3.2, + pointsize=30, + palette="tab10", + ) + fig.savefig( + f"{outdir}/plots/rdr_baf_defaultcolor.pdf", + transparent=True, + bbox_inches="tight", + ) # plot allele-specific copy number - for o,max_medploidy in enumerate([None, 2, 3, 4]): + for o, max_medploidy in enumerate([None, 2, 3, 4]): cn_file = f"{outdir}/cnv{medfix[o]}_seglevel.tsv" if not Path(cn_file).exists(): continue df_cnv = pd.read_csv(cn_file, header=0, sep="\t") df_cnv = expand_df_cnv(df_cnv) - df_cnv = df_cnv[~df_cnv.iloc[:,-1].isnull()] - fig, axes = plt.subplots(1, 1, figsize=(15, 0.9*len(final_clone_ids) + 0.6), dpi=200, facecolor="white") - axes = plot_acn_from_df_anotherscheme(df_cnv, axes, chrbar_pos='top', chrbar_thickness=0.3, add_legend=False, remove_xticks=True) + df_cnv = df_cnv[~df_cnv.iloc[:, -1].isnull()] + fig, axes = plt.subplots( + 1, + 1, + figsize=(15, 0.9 * len(final_clone_ids) + 0.6), + dpi=200, + facecolor="white", + ) + axes = plot_acn_from_df_anotherscheme( + df_cnv, + axes, + chrbar_pos="top", + chrbar_thickness=0.3, + add_legend=False, + remove_xticks=True, + ) fig.tight_layout() - fig.savefig(f"{outdir}/plots/acn_genome{medfix[o]}.pdf", transparent=True, bbox_inches="tight") + fig.savefig( + f"{outdir}/plots/acn_genome{medfix[o]}.pdf", + transparent=True, + bbox_inches="tight", + ) # additionally plot the allele-specific copy number per region if not config["supervision_clone_file"] is None: - fig, axes = plt.subplots(1, 1, figsize=(15, 0.6*len(unique_clone_ids) + 0.4), dpi=200, facecolor="white") + fig, axes = plt.subplots( + 1, + 1, + figsize=(15, 0.6 * len(unique_clone_ids) + 0.4), + dpi=200, + facecolor="white", + ) merged_df_cnv = pd.read_csv(cn_file, header=0, sep="\t") df_cnv = merged_df_cnv[["CHR", "START", "END"]] - df_cnv = df_cnv.join( pd.DataFrame({f"clone{x} A":merged_df_cnv[f"clone{res_combine['new_assignment'][i]} A"] for i,x in enumerate(unique_clone_ids)}) ) - df_cnv = df_cnv.join( pd.DataFrame({f"clone{x} B":merged_df_cnv[f"clone{res_combine['new_assignment'][i]} B"] for i,x in enumerate(unique_clone_ids)}) ) + df_cnv = df_cnv.join( + pd.DataFrame( + { + f"clone{x} A": merged_df_cnv[ + f"clone{res_combine['new_assignment'][i]} A" + ] + for i, x in enumerate(unique_clone_ids) + } + ) + ) + df_cnv = df_cnv.join( + 
pd.DataFrame( + { + f"clone{x} B": merged_df_cnv[ + f"clone{res_combine['new_assignment'][i]} B" + ] + for i, x in enumerate(unique_clone_ids) + } + ) + ) df_cnv = expand_df_cnv(df_cnv) - clone_ids = np.concatenate([ unique_clone_ids[res_combine["new_assignment"]==c].astype(str) for c in final_clone_ids ]) - axes = plot_acn_from_df(df_cnv, axes, clone_ids=clone_ids, clone_names=[f"region {x}" for x in clone_ids], add_chrbar=True, add_arrow=False, chrbar_thickness=0.4/(0.6*len(unique_clone_ids) + 0.4), add_legend=True, remove_xticks=True) + clone_ids = np.concatenate( + [ + unique_clone_ids[res_combine["new_assignment"] == c].astype( + str + ) + for c in final_clone_ids + ] + ) + axes = plot_acn_from_df( + df_cnv, + axes, + clone_ids=clone_ids, + clone_names=[f"region {x}" for x in clone_ids], + add_chrbar=True, + add_arrow=False, + chrbar_thickness=0.4 / (0.6 * len(unique_clone_ids) + 0.4), + add_legend=True, + remove_xticks=True, + ) fig.tight_layout() - fig.savefig(f"{outdir}/plots/acn_genome{medfix[o]}_per_region.pdf", transparent=True, bbox_inches="tight") + fig.savefig( + f"{outdir}/plots/acn_genome{medfix[o]}_per_region.pdf", + transparent=True, + bbox_inches="tight", + ) # plot clones in space if not config["supervision_clone_file"] is None: before_assignments = pd.Series([None] * before_coords.shape[0]) - for i,c in enumerate(unique_clone_ids): - before_assignments.iloc[before_df_clones.clone_id.isin([c])] = f"clone {res_combine['new_assignment'][i]}" - fig = plot_clones_in_space(before_coords, before_assignments, sample_list, before_sample_ids, palette="Set2", labels=unique_clone_ids, label_coords=coords, label_sample_ids=sample_ids) - fig.savefig(f"{outdir}/plots/clone_spatial.pdf", transparent=True, bbox_inches="tight") + for i, c in enumerate(unique_clone_ids): + before_assignments.iloc[before_df_clones.clone_id.isin([c])] = ( + f"clone {res_combine['new_assignment'][i]}" + ) + fig = plot_clones_in_space( + before_coords, + before_assignments, + sample_list, + before_sample_ids, + palette="Set2", + labels=unique_clone_ids, + label_coords=coords, + label_sample_ids=sample_ids, + ) + fig.savefig( + f"{outdir}/plots/clone_spatial.pdf", + transparent=True, + bbox_inches="tight", + ) else: - assignment = pd.Series([f"clone {x}" for x in res_combine["new_assignment"]]) - fig = plot_individual_spots_in_space(coords, assignment, single_tumor_prop, sample_list=sample_list, sample_ids=sample_ids) - fig.savefig(f"{outdir}/plots/clone_spatial.pdf", transparent=True, bbox_inches="tight") - + assignment = pd.Series( + [f"clone {x}" for x in res_combine["new_assignment"]] + ) + fig = plot_individual_spots_in_space( + coords, + assignment, + single_tumor_prop, + sample_list=sample_list, + sample_ids=sample_ids, + ) + fig.savefig( + f"{outdir}/plots/clone_spatial.pdf", + transparent=True, + bbox_inches="tight", + ) + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str) + parser.add_argument( + "-c", + "--configfile", + help="configuration file of CalicoST", + required=True, + type=str, + ) args = parser.parse_args() - main(args.configfile) \ No newline at end of file + main(args.configfile) diff --git a/src/calicost/calicost_supervised.py b/src/calicost/calicost_supervised.py index d872cae..a881fff 100644 --- a/src/calicost/calicost_supervised.py +++ b/src/calicost/calicost_supervised.py @@ -8,7 +8,12 @@ import scanpy as sc import anndata import logging 
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) logger = logging.getLogger() import copy from pathlib import Path @@ -29,9 +34,11 @@ from matplotlib.lines import Line2D import matplotlib.patches as mpatches import seaborn -plt.rcParams.update({'font.size': 14}) + +plt.rcParams.update({"font.size": 14}) import mkl + mkl.set_num_threads(1) @@ -44,125 +51,304 @@ def main(configuration_file): for k in sorted(list(config.keys())): print(f"\t{k} : {config[k]}") - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, x_gene_list, \ - barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config) - + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + df_bininfo, + x_gene_list, + barcodes, + coords, + single_tumor_prop, + sample_list, + sample_ids, + adjacency_mat, + smooth_mat, + exp_counts, + ) = run_parse_n_load(config) + # normal baseline expression if tumorprop_file is provided if not config["tumorprop_file"] is None: EXPECTED_NORMAL_PROP = 0.05 - q = np.sort(single_tumor_prop)[ int(EXPECTED_NORMAL_PROP * len(barcodes)) ] - normal_candidate = ( single_tumor_prop <= q ) - + q = np.sort(single_tumor_prop)[int(EXPECTED_NORMAL_PROP * len(barcodes))] + normal_candidate = single_tumor_prop <= q + # copy_single_X_rdr,_ = filter_de_genes(exp_counts, x_gene_list, normal_candidate, sample_list=sample_list, sample_ids=sample_ids) - copy_single_X_rdr, _ = filter_de_genes_tri(exp_counts, x_gene_list, normal_candidate, sample_list=sample_list, sample_ids=sample_ids) + copy_single_X_rdr, _ = filter_de_genes_tri( + exp_counts, + x_gene_list, + normal_candidate, + sample_list=sample_list, + sample_ids=sample_ids, + ) MIN_NORMAL_COUNT_PERBIN = 20 - bidx_inconfident = np.where( np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) < MIN_NORMAL_COUNT_PERBIN )[0] - rdr_normal = np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) + bidx_inconfident = np.where( + np.sum(copy_single_X_rdr[:, (normal_candidate == True)], axis=1) + < MIN_NORMAL_COUNT_PERBIN + )[0] + rdr_normal = np.sum(copy_single_X_rdr[:, (normal_candidate == True)], axis=1) rdr_normal[bidx_inconfident] = 0 rdr_normal = rdr_normal / np.sum(rdr_normal) - copy_single_X_rdr[bidx_inconfident, :] = 0 # avoid ill-defined distributions if normal has 0 count in that bin. - copy_single_base_nb_mean = rdr_normal.reshape(-1,1) @ np.sum(copy_single_X_rdr, axis=0).reshape(1,-1) + copy_single_X_rdr[bidx_inconfident, :] = ( + 0 # avoid ill-defined distributions if normal has 0 count in that bin. 
+ ) + copy_single_base_nb_mean = rdr_normal.reshape(-1, 1) @ np.sum( + copy_single_X_rdr, axis=0 + ).reshape(1, -1) # adding back RDR signal - single_X[:,0,:] = copy_single_X_rdr + single_X[:, 0, :] = copy_single_X_rdr single_base_nb_mean = copy_single_base_nb_mean # make each cluster in supervision_clone_file a pseudospot if not config["supervision_clone_file"] is None: - tmp_df_clones = pd.read_csv(config["supervision_clone_file"], header=0, index_col=0, sep="\t") - df_clones = pd.DataFrame({"barcodes":barcodes.values}, index=barcodes.values).join(tmp_df_clones) + tmp_df_clones = pd.read_csv( + config["supervision_clone_file"], header=0, index_col=0, sep="\t" + ) + df_clones = pd.DataFrame( + {"barcodes": barcodes.values}, index=barcodes.values + ).join(tmp_df_clones) df_clones.columns = ["barcodes", "clone_id"] - - unique_clone_ids = np.unique( df_clones["clone_id"][~df_clones["clone_id"].isnull()].values ) - clone_index = [np.where(df_clones["clone_id"] == c)[0] for c in unique_clone_ids] + + unique_clone_ids = np.unique( + df_clones["clone_id"][~df_clones["clone_id"].isnull()].values + ) + clone_index = [ + np.where(df_clones["clone_id"] == c)[0] for c in unique_clone_ids + ] if config["tumorprop_file"] is None: - single_X, single_base_nb_mean, single_total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index) + single_X, single_base_nb_mean, single_total_bb_RD = ( + merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index + ) + ) single_tumor_prop = None else: - single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, clone_index, single_tumor_prop, threshold=config["tumorprop_threshold"]) + single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + threshold=config["tumorprop_threshold"], + ) + ) before_coords = copy.copy(coords) before_df_clones = copy.copy(df_clones) before_sample_ids = copy.copy(sample_ids) - coords = np.array([ np.mean(coords[idx,:],axis=0) for idx in clone_index ]) + coords = np.array([np.mean(coords[idx, :], axis=0) for idx in clone_index]) smooth_mat = scipy.sparse.csr_matrix(np.eye(coords.shape[0])) adjacency_mat = scipy.sparse.csr_matrix(np.eye(coords.shape[0])) barcodes = pd.Series(unique_clone_ids) sample_ids = np.array([sample_ids[idx][0] for idx in clone_index]) # clear values in RDR to first infer clones using BAF signal only - copy_single_X_rdr = copy.copy(single_X[:,0,:]) + copy_single_X_rdr = copy.copy(single_X[:, 0, :]) copy_single_base_nb_mean = copy.copy(single_base_nb_mean) - single_X[:,0,:] = 0 - single_base_nb_mean[:,:] = 0 - + single_X[:, 0, :] = 0 + single_base_nb_mean[:, :] = 0 + # run HMRF - for r_hmrf_initialization in range(config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"]): + for r_hmrf_initialization in range( + config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"] + ): outdir = f"{config['output_dir']}/clone{config['n_clones']}_rectangle{r_hmrf_initialization}_w{config['spatial_weight']:.1f}" if config["initialization_method"] == "rectangle": if config["tumorprop_file"] is None: - initial_clone_index = rectangle_initialize_initial_clone(coords, min(coords.shape[0],config["n_clones"]), random_state=r_hmrf_initialization) + initial_clone_index = 
rectangle_initialize_initial_clone( + coords, + min(coords.shape[0], config["n_clones"]), + random_state=r_hmrf_initialization, + ) else: - initial_clone_index = rectangle_initialize_initial_clone_mix(coords, min(coords.shape[0],config["n_clones"]), single_tumor_prop, threshold=config["tumorprop_threshold"], random_state=r_hmrf_initialization) + initial_clone_index = rectangle_initialize_initial_clone_mix( + coords, + min(coords.shape[0], config["n_clones"]), + single_tumor_prop, + threshold=config["tumorprop_threshold"], + random_state=r_hmrf_initialization, + ) else: - kmeans = KMeans(n_clusters = config["n_clones"], max_iter=1, init="random", random_state=config["num_hmrf_initialization_start"]).fit(coords) - initial_clone_index = [np.where(kmeans.labels_ == i)[0] for i in range(config["n_clones"])] + kmeans = KMeans( + n_clusters=config["n_clones"], + max_iter=1, + init="random", + random_state=config["num_hmrf_initialization_start"], + ).fit(coords) + initial_clone_index = [ + np.where(kmeans.labels_ == i)[0] for i in range(config["n_clones"]) + ] # create directory - p = subprocess.Popen(f"mkdir -p {outdir}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - out,err = p.communicate() + p = subprocess.Popen( + f"mkdir -p {outdir}", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + ) + out, err = p.communicate() # save clone initialization into npz file prefix = "allspots" if not Path(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz").exists(): initial_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"num_iterations":0, "round-1_assignment":initial_assignment} + allres = {"num_iterations": 0, "round-1_assignment": initial_assignment} np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz", **allres) # run HMRF + HMM if config["tumorprop_file"] is None: - hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"]) + hmrf_concatenate_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat, + adjacency_mat=adjacency_mat, + sample_ids=sample_ids, + max_iter_outer=config["max_iter_outer"], + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="sp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + ) 
else: - hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"], tumorprop_threshold=config["tumorprop_threshold"]) - + hmrfmix_concatenate_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat, + adjacency_mat=adjacency_mat, + sample_ids=sample_ids, + max_iter_outer=config["max_iter_outer"], + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="sp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + tumorprop_threshold=config["tumorprop_threshold"], + ) + # merge by thresholding BAF profile similarity - res = load_hmrf_last_iteration(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz") + res = load_hmrf_last_iteration( + f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz" + ) n_obs = single_X.shape[0] if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"]))]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res["new_assignment"]==c)[0] for c in np.sort(np.unique(res["new_assignment"]))], single_tumor_prop, threshold=config["tumorprop_threshold"]) - tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) - merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, threshold=config["np_threshold"], minlength=config["np_eventminlen"], params="sp", tumor_prop=tumor_prop, hmmclass=hmm_nophasing_v2) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ], + single_tumor_prop, + threshold=config["tumorprop_threshold"], + ) + tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson( + X, + base_nb_mean, + total_bb_RD, + 
res, + threshold=config["np_threshold"], + minlength=config["np_eventminlen"], + params="sp", + tumor_prop=tumor_prop, + hmmclass=hmm_nophasing_v2, + ) print(f"BAF clone merging after comparing similarity: {merging_groups}") # if config["tumorprop_file"] is None: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD, min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD, + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] * n_obs, + ) else: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD, min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs, single_tumor_prop=single_tumor_prop) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD, + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] * n_obs, + single_tumor_prop=single_tumor_prop, + ) print(f"BAF clone merging after requiring minimum # spots: {merging_groups}") n_baf_clones = len(merging_groups) - np.savez(f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", **merged_res) + np.savez( + f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", **merged_res + ) # adjust phasing n_obs = single_X.shape[0] - merged_res = dict(np.load(f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", allow_pickle=True)) + merged_res = dict( + np.load( + f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", + allow_pickle=True, + ) + ) merged_baf_assignment = copy.copy(merged_res["new_assignment"]) n_baf_clones = len(np.unique(merged_baf_assignment)) pred = np.argmax(merged_res["log_gamma"], axis=0) - pred = np.array([ pred[(c*n_obs):(c*n_obs+n_obs)] for c in range(n_baf_clones) ]) - merged_baf_profiles = np.array([ np.where(pred[c,:] < config["n_states"], merged_res["new_p_binom"][pred[c,:]%config["n_states"], 0], 1-merged_res["new_p_binom"][pred[c,:]%config["n_states"], 0]) \ - for c in range(n_baf_clones) ]) + pred = np.array( + [pred[(c * n_obs) : (c * n_obs + n_obs)] for c in range(n_baf_clones)] + ) + merged_baf_profiles = np.array( + [ + np.where( + pred[c, :] < config["n_states"], + merged_res["new_p_binom"][pred[c, :] % config["n_states"], 0], + 1 - merged_res["new_p_binom"][pred[c, :] % config["n_states"], 0], + ) + for c in range(n_baf_clones) + ] + ) # EPS_BAF = 0.05 # merged_baf_profiles[np.abs(merged_baf_profiles - 0.5) < EPS_BAF] = 0.5 # population_baf = np.mean(merged_baf_profiles[merged_res["new_assignment"], :], axis=0) if config["tumorprop_file"] is None else np.mean(merged_baf_profiles[merged_res["new_assignment"][single_tumor_prop > config["tumorprop_threshold"]], :], axis=0) @@ -171,199 +357,607 @@ def main(configuration_file): # adding RDR information if not config["bafonly"]: # select normal spots - if (config["normalidx_file"] is None) and (config["tumorprop_file"] is None): + if (config["normalidx_file"] is None) and ( + config["tumorprop_file"] is None + ): EPS_BAF = 0.05 PERCENT_NORMAL = 40 - vec_stds = np.std(np.log1p(copy_single_X_rdr), axis=0) # TBD: whether to smooth by multiplying smooth_mat - id_nearnormal_clone = np.argmin(np.sum( np.maximum(np.abs(merged_baf_profiles 
- 0.5)-EPS_BAF, 0), axis=1)) + vec_stds = np.std( + np.log1p(copy_single_X_rdr), axis=0 + ) # TBD: whether to smooth by multiplying smooth_mat + id_nearnormal_clone = np.argmin( + np.sum( + np.maximum(np.abs(merged_baf_profiles - 0.5) - EPS_BAF, 0), + axis=1, + ) + ) while True: - stdthreshold = np.percentile(vec_stds[merged_res["new_assignment"] == id_nearnormal_clone], PERCENT_NORMAL) - normal_candidate = (vec_stds < stdthreshold) & (merged_res["new_assignment"] == id_nearnormal_clone) - if np.sum(copy_single_X_rdr[:, (normal_candidate==True)]) > single_X.shape[0] * 200 or PERCENT_NORMAL == 100: + stdthreshold = np.percentile( + vec_stds[merged_res["new_assignment"] == id_nearnormal_clone], + PERCENT_NORMAL, + ) + normal_candidate = (vec_stds < stdthreshold) & ( + merged_res["new_assignment"] == id_nearnormal_clone + ) + if ( + np.sum(copy_single_X_rdr[:, (normal_candidate == True)]) + > single_X.shape[0] * 200 + or PERCENT_NORMAL == 100 + ): break PERCENT_NORMAL += 10 # copy_single_X_rdr, _ = filter_de_genes(exp_counts, x_gene_list, normal_candidate) - copy_single_X_rdr, _ = filter_de_genes_tri(exp_counts, x_gene_list, normal_candidate, sample_list=sample_list, sample_ids=sample_ids) + copy_single_X_rdr, _ = filter_de_genes_tri( + exp_counts, + x_gene_list, + normal_candidate, + sample_list=sample_list, + sample_ids=sample_ids, + ) MIN_NORMAL_COUNT_PERBIN = 20 - bidx_inconfident = np.where( np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) < MIN_NORMAL_COUNT_PERBIN )[0] - rdr_normal = np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) + bidx_inconfident = np.where( + np.sum(copy_single_X_rdr[:, (normal_candidate == True)], axis=1) + < MIN_NORMAL_COUNT_PERBIN + )[0] + rdr_normal = np.sum( + copy_single_X_rdr[:, (normal_candidate == True)], axis=1 + ) rdr_normal[bidx_inconfident] = 0 rdr_normal = rdr_normal / np.sum(rdr_normal) - copy_single_X_rdr[bidx_inconfident, :] = 0 # avoid ill-defined distributions if normal has 0 count in that bin. - copy_single_base_nb_mean = rdr_normal.reshape(-1,1) @ np.sum(copy_single_X_rdr, axis=0).reshape(1,-1) - pd.Series(barcodes[normal_candidate==True].index).to_csv(f"{outdir}/normal_candidate_barcodes.txt", header=False, index=False) + copy_single_X_rdr[bidx_inconfident, :] = ( + 0 # avoid ill-defined distributions if normal has 0 count in that bin. 
+ ) + copy_single_base_nb_mean = rdr_normal.reshape(-1, 1) @ np.sum( + copy_single_X_rdr, axis=0 + ).reshape(1, -1) + pd.Series(barcodes[normal_candidate == True].index).to_csv( + f"{outdir}/normal_candidate_barcodes.txt", header=False, index=False + ) # index_normal = np.where(normal_candidate)[0] - sorted_chr_pos = list(zip(df_bininfo.CHR.values, df_bininfo.START.values)) - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, sorted_chr_pos, _, x_gene_list, index_remaining = bin_selection_basedon_normal(single_X, \ - single_base_nb_mean, single_total_bb_RD, sorted_chr_pos, sorted_chr_pos, x_gene_list, config["nu"], config["logphase_shift"], index_normal) + sorted_chr_pos = list( + zip(df_bininfo.CHR.values, df_bininfo.START.values) + ) + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + sorted_chr_pos, + _, + x_gene_list, + index_remaining, + ) = bin_selection_basedon_normal( + single_X, + single_base_nb_mean, + single_total_bb_RD, + sorted_chr_pos, + sorted_chr_pos, + x_gene_list, + config["nu"], + config["logphase_shift"], + index_normal, + ) assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0] df_bininfo = df_bininfo.iloc[index_remaining, :] copy_single_X_rdr = copy_single_X_rdr[index_remaining, :] copy_single_base_nb_mean = copy_single_base_nb_mean[index_remaining, :] - elif (not config["normalidx_file"] is None): + elif not config["normalidx_file"] is None: # single_base_nb_mean has already been added in loading data step. if not config["tumorprop_file"] is None: - logger.warning(f"Mixed sources of information for normal spots! Using {config['normalidx_file']}") - + logger.warning( + f"Mixed sources of information for normal spots! Using {config['normalidx_file']}" + ) + # adding back RDR signal - single_X[:,0,:] = copy_single_X_rdr + single_X[:, 0, :] = copy_single_X_rdr single_base_nb_mean = copy_single_base_nb_mean n_obs = single_X.shape[0] # save binned data - np.savez(f"{outdir}/binned_data.npz", lengths=lengths, single_X=single_X, single_base_nb_mean=single_base_nb_mean, single_total_bb_RD=single_total_bb_RD, log_sitewise_transmat=log_sitewise_transmat, single_tumor_prop=(None if config["tumorprop_file"] is None else single_tumor_prop)) + np.savez( + f"{outdir}/binned_data.npz", + lengths=lengths, + single_X=single_X, + single_base_nb_mean=single_base_nb_mean, + single_total_bb_RD=single_total_bb_RD, + log_sitewise_transmat=log_sitewise_transmat, + single_tumor_prop=( + None if config["tumorprop_file"] is None else single_tumor_prop + ), + ) # run HMRF on each clone individually to further split BAF clone by RDR+BAF signal for bafc in range(n_baf_clones): prefix = f"clone{bafc}" idx_spots = np.where(merged_baf_assignment == bafc)[0] - if np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20: # put a minimum B allele read count on pseudobulk to split clones + if ( + np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20 + ): # put a minimum B allele read count on pseudobulk to split clones continue # initialize clone if config["tumorprop_file"] is None: - initial_clone_index = rectangle_initialize_initial_clone(coords[idx_spots], min(len(idx_spots),config['n_clones_rdr']), random_state=r_hmrf_initialization) + initial_clone_index = rectangle_initialize_initial_clone( + coords[idx_spots], + min(len(idx_spots), config["n_clones_rdr"]), + random_state=r_hmrf_initialization, + ) else: - initial_clone_index = rectangle_initialize_initial_clone_mix(coords[idx_spots], 
min(len(idx_spots),config['n_clones_rdr']), single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"], random_state=r_hmrf_initialization) - if not Path(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz").exists(): + initial_clone_index = rectangle_initialize_initial_clone_mix( + coords[idx_spots], + min(len(idx_spots), config["n_clones_rdr"]), + single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + random_state=r_hmrf_initialization, + ) + if not Path( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz" + ).exists(): initial_assignment = np.zeros(len(idx_spots), dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"barcodes":barcodes[idx_spots], "num_iterations":0, "round-1_assignment":initial_assignment} - np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", **allres) - + allres = { + "barcodes": barcodes[idx_spots], + "num_iterations": 0, + "round-1_assignment": initial_assignment, + } + np.savez( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", + **allres, + ) + # HMRF + HMM using RDR information copy_slice_sample_ids = copy.copy(sample_ids[idx_spots]) if config["tumorprop_file"] is None: - hmrf_concatenate_pipeline(outdir, prefix, single_X[:,:,idx_spots], lengths, single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat[np.ix_(idx_spots,idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots,idx_spots)], sample_ids=copy_slice_sample_ids, max_iter_outer=10, nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"]) + hmrf_concatenate_pipeline( + outdir, + prefix, + single_X[:, :, idx_spots], + lengths, + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], + adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], + sample_ids=copy_slice_sample_ids, + max_iter_outer=10, + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + ) else: - hmrfmix_concatenate_pipeline(outdir, prefix, single_X[:,:,idx_spots], lengths, single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], single_tumor_prop[idx_spots], initial_clone_index, n_states=config["n_states"], \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat[np.ix_(idx_spots,idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots,idx_spots)], sample_ids=copy_slice_sample_ids, max_iter_outer=10, nodepotential=config["nodepotential"], \ - 
hmmclass=hmm_nophasing_v2, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=config["max_iter"], tol=config["tol"], spatial_weight=config["spatial_weight"], tumorprop_threshold=config["tumorprop_threshold"]) + hmrfmix_concatenate_pipeline( + outdir, + prefix, + single_X[:, :, idx_spots], + lengths, + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + single_tumor_prop[idx_spots], + initial_clone_index, + n_states=config["n_states"], + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], + adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], + sample_ids=copy_slice_sample_ids, + max_iter_outer=10, + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=config["max_iter"], + tol=config["tol"], + spatial_weight=config["spatial_weight"], + tumorprop_threshold=config["tumorprop_threshold"], + ) ##### combine results across clones ##### - res_combine = {"prev_assignment":-1 * np.ones(single_X.shape[2], dtype=int)} + res_combine = { + "prev_assignment": -1 * np.ones(single_X.shape[2], dtype=int) + } offset_clone = 0 for bafc in range(n_baf_clones): prefix = f"clone{bafc}" - if not Path(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz").exists(): + if not Path( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz" + ).exists(): # we skipped the BAF clone in the previous step because of low SNP-covering UMI counts.
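# Note: those clones were skipped in the refinement loop above when their pseudobulk had
# fewer than roughly 20 SNP-covering UMIs per bin on average (the
# np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20 check), so no *_smp.npz
# file exists for them; their spots remain -1 in res_combine["prev_assignment"] and are
# reassigned to the clone with the fewest spots further below.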
continue - allres = dict( np.load(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", allow_pickle=True) ) + allres = dict( + np.load( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) r = allres["num_iterations"] - 1 - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} - idx_spots = np.where(barcodes.isin( allres["barcodes"] ))[0] + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } + idx_spots = np.where(barcodes.isin(allres["barcodes"]))[0] if len(np.unique(res["new_assignment"])) == 1: n_merged_clones = 1 c = res["new_assignment"][0] merged_res = copy.copy(res) merged_res["new_assignment"] = np.zeros(len(idx_spots), dtype=int) try: - log_gamma = res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)].reshape((2*config["n_states"], n_obs, 1)) + log_gamma = res["log_gamma"][ + :, (c * n_obs) : (c * n_obs + n_obs) + ].reshape((2 * config["n_states"], n_obs, 1)) except: - log_gamma = res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)].reshape((config["n_states"], n_obs, 1)) - pred_cnv = res["pred_cnv"][ (c*n_obs):(c*n_obs+n_obs) ].reshape((-1,1)) + log_gamma = res["log_gamma"][ + :, (c * n_obs) : (c * n_obs + n_obs) + ].reshape((config["n_states"], n_obs, 1)) + pred_cnv = res["pred_cnv"][ + (c * n_obs) : (c * n_obs + n_obs) + ].reshape((-1, 1)) else: if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(res["new_assignment"]==c)[0] for c in range(config['n_clones_rdr'])]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(res["new_assignment"] == c)[0] + for c in range(config["n_clones_rdr"]) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(res["new_assignment"]==c)[0] for c in range(config['n_clones_rdr'])], single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"]) - tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) - merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, threshold=config["np_threshold"], minlength=config["np_eventminlen"], params="smp", tumor_prop=tumor_prop, hmmclass=hmm_nophasing_v2) + 
X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(res["new_assignment"] == c)[0] + for c in range(config["n_clones_rdr"]) + ], + single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + ) + ) + tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + merging_groups, merged_res = ( + similarity_components_rdrbaf_neymanpearson( + X, + base_nb_mean, + total_bb_RD, + res, + threshold=config["np_threshold"], + minlength=config["np_eventminlen"], + params="smp", + tumor_prop=tumor_prop, + hmmclass=hmm_nophasing_v2, + ) + ) print(f"part {bafc} merging_groups: {merging_groups}") # if config["tumorprop_file"] is None: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD[:,idx_spots], min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD[:, idx_spots], + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] + * n_obs, + ) else: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], merged_res, single_total_bb_RD[:,idx_spots], min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*n_obs, single_tumor_prop=single_tumor_prop[idx_spots]) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + merged_res, + single_total_bb_RD[:, idx_spots], + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] + * n_obs, + single_tumor_prop=single_tumor_prop[idx_spots], + ) # compute posterior using the newly merged pseudobulk n_merged_clones = len(merging_groups) tmp = copy.copy(merged_res["new_assignment"]) if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(merged_res["new_assignment"]==c)[0] for c in range(n_merged_clones)]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(merged_res["new_assignment"] == c)[0] + for c in range(n_merged_clones) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(merged_res["new_assignment"]==c)[0] for c in range(n_merged_clones)], single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"]) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(merged_res["new_assignment"] == c)[0] + for c in range(n_merged_clones) + ], + single_tumor_prop[idx_spots], + threshold=config["tumorprop_threshold"], + ) + ) # - merged_res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), config["n_states"], \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), 
np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) if not tumor_prop is None else None, \ - hmmclass=hmm_nophasing_v2, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, init_log_mu=res["new_log_mu"], init_p_binom=res["new_p_binom"], init_alphas=res["new_alphas"], init_taus=res["new_taus"], max_iter=config["max_iter"], tol=config["tol"], lambd=np.sum(base_nb_mean,axis=1)/np.sum(base_nb_mean), sample_length=np.ones(X.shape[2],dtype=int)*X.shape[0]) + merged_res = pipeline_baum_welch( + None, + np.vstack( + [X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")] + ).T.reshape(-1, 2, 1), + np.tile(lengths, X.shape[2]), + config["n_states"], + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + np.tile(log_sitewise_transmat, X.shape[2]), + ( + np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + if not tumor_prop is None + else None + ), + hmmclass=hmm_nophasing_v2, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + init_log_mu=res["new_log_mu"], + init_p_binom=res["new_p_binom"], + init_alphas=res["new_alphas"], + init_taus=res["new_taus"], + max_iter=config["max_iter"], + tol=config["tol"], + lambd=np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean), + sample_length=np.ones(X.shape[2], dtype=int) * X.shape[0], + ) merged_res["new_assignment"] = copy.copy(tmp) - merged_res = combine_similar_states_across_clones(X, base_nb_mean, total_bb_RD, merged_res, params="smp", tumor_prop=np.repeat(tumor_prop, X.shape[0]).reshape(-1,1) if not tumor_prop is None else None, hmmclass=hmm_nophasing_v2, merge_threshold=0.1) - log_gamma = np.stack([ merged_res["log_gamma"][:,(c*n_obs):(c*n_obs+n_obs)] for c in range(n_merged_clones) ], axis=-1) - pred_cnv = np.vstack([ merged_res["pred_cnv"][(c*n_obs):(c*n_obs+n_obs)] for c in range(n_merged_clones) ]).T - + merged_res = combine_similar_states_across_clones( + X, + base_nb_mean, + total_bb_RD, + merged_res, + params="smp", + tumor_prop=( + np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + if not tumor_prop is None + else None + ), + hmmclass=hmm_nophasing_v2, + merge_threshold=0.1, + ) + log_gamma = np.stack( + [ + merged_res["log_gamma"][ + :, (c * n_obs) : (c * n_obs + n_obs) + ] + for c in range(n_merged_clones) + ], + axis=-1, + ) + pred_cnv = np.vstack( + [ + merged_res["pred_cnv"][(c * n_obs) : (c * n_obs + n_obs)] + for c in range(n_merged_clones) + ] + ).T + # add to res_combine if len(res_combine) == 1: - res_combine.update({"new_log_mu":np.hstack([ merged_res["new_log_mu"] ] * n_merged_clones), "new_alphas":np.hstack([ merged_res["new_alphas"] ] * n_merged_clones), \ - "new_p_binom":np.hstack([ merged_res["new_p_binom"] ] * n_merged_clones), "new_taus":np.hstack([ merged_res["new_taus"] ] * n_merged_clones), \ - "log_gamma":log_gamma, "pred_cnv":pred_cnv}) + res_combine.update( + { + "new_log_mu": np.hstack( + [merged_res["new_log_mu"]] * n_merged_clones + ), + "new_alphas": np.hstack( + [merged_res["new_alphas"]] * n_merged_clones + ), + "new_p_binom": np.hstack( + [merged_res["new_p_binom"]] * n_merged_clones + ), + "new_taus": np.hstack( + 
[merged_res["new_taus"]] * n_merged_clones + ), + "log_gamma": log_gamma, + "pred_cnv": pred_cnv, + } + ) else: - res_combine.update({"new_log_mu":np.hstack([res_combine["new_log_mu"]] + [ merged_res["new_log_mu"] ] * n_merged_clones), "new_alphas":np.hstack([res_combine["new_alphas"]] + [ merged_res["new_alphas"] ] * n_merged_clones), \ - "new_p_binom":np.hstack([res_combine["new_p_binom"]] + [ merged_res["new_p_binom"] ] * n_merged_clones), "new_taus":np.hstack([res_combine["new_taus"]] + [ merged_res["new_taus"] ] * n_merged_clones), \ - "log_gamma":np.dstack([res_combine["log_gamma"], log_gamma ]), "pred_cnv":np.hstack([res_combine["pred_cnv"], pred_cnv])}) - res_combine["prev_assignment"][idx_spots] = merged_res["new_assignment"] + offset_clone + res_combine.update( + { + "new_log_mu": np.hstack( + [res_combine["new_log_mu"]] + + [merged_res["new_log_mu"]] * n_merged_clones + ), + "new_alphas": np.hstack( + [res_combine["new_alphas"]] + + [merged_res["new_alphas"]] * n_merged_clones + ), + "new_p_binom": np.hstack( + [res_combine["new_p_binom"]] + + [merged_res["new_p_binom"]] * n_merged_clones + ), + "new_taus": np.hstack( + [res_combine["new_taus"]] + + [merged_res["new_taus"]] * n_merged_clones + ), + "log_gamma": np.dstack( + [res_combine["log_gamma"], log_gamma] + ), + "pred_cnv": np.hstack([res_combine["pred_cnv"], pred_cnv]), + } + ) + res_combine["prev_assignment"][idx_spots] = ( + merged_res["new_assignment"] + offset_clone + ) offset_clone += n_merged_clones # assign un-assigned spots to the clone with smallest number of spots unassigned_spots = np.where(res_combine["prev_assignment"] == -1)[0] - res_combine["prev_assignment"][unassigned_spots] = np.argmin(np.bincount(res_combine["prev_assignment"][res_combine["prev_assignment"]>=0])) + res_combine["prev_assignment"][unassigned_spots] = np.argmin( + np.bincount( + res_combine["prev_assignment"][res_combine["prev_assignment"] >= 0] + ) + ) # temp: make dispersions the same across all clones - res_combine["new_alphas"][:,:] = np.max(res_combine["new_alphas"]) - res_combine["new_taus"][:,:] = np.min(res_combine["new_taus"]) + res_combine["new_alphas"][:, :] = np.max(res_combine["new_alphas"]) + res_combine["new_taus"][:, :] = np.min(res_combine["new_taus"]) # end temp n_final_clones = len(np.unique(res_combine["prev_assignment"])) # compute HMRF log likelihood log_persample_weights = np.zeros((n_final_clones, len(sample_list))) for sidx in range(len(sample_list)): index = np.where(sample_ids == sidx)[0] - this_persample_weight = np.bincount(res_combine["prev_assignment"][index], minlength=n_final_clones) / len(index) - log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) - log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) + this_persample_weight = np.bincount( + res_combine["prev_assignment"][index], minlength=n_final_clones + ) / len(index) + log_persample_weights[:, sidx] = np.where( + this_persample_weight > 0, np.log(this_persample_weight), -50 + ) + log_persample_weights[:, sidx] = log_persample_weights[ + :, sidx + ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) # final re-assignment across all clones using estimated RDR + BAF if config["tumorprop_file"] is None: if config["nodepotential"] == "max": - pred = np.vstack([ np.argmax(res_combine["log_gamma"][:,:,c], axis=0) for c in range(res_combine["log_gamma"].shape[2]) ]).T - new_assignment, single_llf, total_llf, posterior = 
aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, res_combine, pred, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + pred = np.vstack( + [ + np.argmax(res_combine["log_gamma"][:, :, c], axis=0) + for c in range(res_combine["log_gamma"].shape[2]) + ] + ).T + new_assignment, single_llf, total_llf, posterior = ( + aggr_hmrf_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res_combine, + pred, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) elif config["nodepotential"] == "weighted_sum": - new_assignment, single_llf, total_llf, posterior = hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, res_combine, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + new_assignment, single_llf, total_llf, posterior = ( + hmrf_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res_combine, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) else: if config["nodepotential"] == "max": - pred = np.vstack([ np.argmax(res_combine["log_gamma"][:,:,c], axis=0) for c in range(res_combine["log_gamma"].shape[2]) ]).T - new_assignment, single_llf, total_llf, posterior = aggr_hmrfmix_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res_combine, pred, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + pred = np.vstack( + [ + np.argmax(res_combine["log_gamma"][:, :, c], axis=0) + for c in range(res_combine["log_gamma"].shape[2]) + ] + ).T + new_assignment, single_llf, total_llf, posterior = ( + aggr_hmrfmix_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res_combine, + pred, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) elif config["nodepotential"] == "weighted_sum": - new_assignment, single_llf, total_llf, posterior = hmrfmix_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res_combine, \ - smooth_mat, adjacency_mat, res_combine["prev_assignment"], copy.copy(sample_ids), log_persample_weights, spatial_weight=config["spatial_weight"], hmmclass=hmm_nophasing_v2, return_posterior=True) + new_assignment, single_llf, total_llf, posterior = ( + hmrfmix_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res_combine, + smooth_mat, + adjacency_mat, + res_combine["prev_assignment"], + copy.copy(sample_ids), + log_persample_weights, + spatial_weight=config["spatial_weight"], + hmmclass=hmm_nophasing_v2, + return_posterior=True, + ) + ) res_combine["total_llf"] = total_llf res_combine["new_assignment"] 
= new_assignment # res_combine = dict(np.load(f"{outdir}/original_rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True)) # posterior = np.load(f"{outdir}/original_posterior_clone_probability.npy") # re-order clones such that normal clones are always clone 0 - res_combine, posterior = reorder_results(res_combine, posterior, single_tumor_prop) + res_combine, posterior = reorder_results( + res_combine, posterior, single_tumor_prop + ) # save results - np.savez(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", **res_combine) + np.savez( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + **res_combine, + ) np.save(f"{outdir}/posterior_clone_probability.npy", posterior) - + ##### infer integer copy ##### - res_combine = dict(np.load(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True)) + res_combine = dict( + np.load( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) final_clone_ids = np.sort(np.unique(res_combine["new_assignment"])) nonempty_clone_ids = copy.copy(final_clone_ids) # add clone 0 as normal clone if it doesn't appear in final_clone_ids @@ -371,7 +965,7 @@ def main(configuration_file): final_clone_ids = np.append(0, final_clone_ids) # chr position medfix = ["", "_diploid", "_triploid", "_tetraploid"] - for o,max_medploidy in enumerate([None, 2, 3, 4]): + for o, max_medploidy in enumerate([None, 2, 3, 4]): # A/B copy number per bin allele_specific_copy = [] # A/B copy number per state @@ -379,41 +973,139 @@ def main(configuration_file): df_genelevel_cnv = None if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res_combine["new_assignment"]==cid)[0] for cid in final_clone_ids]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res_combine["new_assignment"] == cid)[0] + for cid in final_clone_ids + ], + ) else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res_combine["new_assignment"]==cid)[0] for cid in final_clone_ids], single_tumor_prop, threshold=config["tumorprop_threshold"]) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res_combine["new_assignment"] == cid)[0] + for cid in final_clone_ids + ], + single_tumor_prop, + threshold=config["tumorprop_threshold"], + ) + ) - finding_distate_failed=False + finding_distate_failed = False for s, cid in enumerate(final_clone_ids): - if np.sum(base_nb_mean[:,s]) == 0: + if np.sum(base_nb_mean[:, s]) == 0: continue # adjust log_mu such that sum_bin lambda * np.exp(log_mu) = 1 - lambd = base_nb_mean[:,s] / np.sum(base_nb_mean[:,s]) - this_pred_cnv = res_combine["pred_cnv"][:,s] - adjusted_log_mu = np.log( np.exp(res_combine["new_log_mu"][:,s]) / np.sum(np.exp(res_combine["new_log_mu"][this_pred_cnv,s]) * lambd) ) + lambd = base_nb_mean[:, s] / np.sum(base_nb_mean[:, s]) + this_pred_cnv = res_combine["pred_cnv"][:, s] + adjusted_log_mu = np.log( + np.exp(res_combine["new_log_mu"][:, s]) + / np.sum( + np.exp(res_combine["new_log_mu"][this_pred_cnv, s]) * lambd + ) + ) if not max_medploidy is None: - best_integer_copies, _ = hill_climbing_integer_copynumber_oneclone(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, 
max_medploidy=max_medploidy) + best_integer_copies, _ = ( + hill_climbing_integer_copynumber_oneclone( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + max_medploidy=max_medploidy, + ) + ) else: try: - best_integer_copies, _ = hill_climbing_integer_copynumber_fixdiploid(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, nonbalance_bafdist=config["nonbalance_bafdist"], nondiploid_rdrdist=config["nondiploid_rdrdist"]) + best_integer_copies, _ = ( + hill_climbing_integer_copynumber_fixdiploid( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + nonbalance_bafdist=config["nonbalance_bafdist"], + nondiploid_rdrdist=config["nondiploid_rdrdist"], + ) + ) except: try: - best_integer_copies, _ = hill_climbing_integer_copynumber_fixdiploid(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, nonbalance_bafdist=config["nonbalance_bafdist"], nondiploid_rdrdist=config["nondiploid_rdrdist"], min_prop_threshold=0.05) + best_integer_copies, _ = ( + hill_climbing_integer_copynumber_fixdiploid( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + nonbalance_bafdist=config["nonbalance_bafdist"], + nondiploid_rdrdist=config["nondiploid_rdrdist"], + min_prop_threshold=0.05, + ) + ) except: finding_distate_failed = True continue - print(f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}") - - allele_specific_copy.append( pd.DataFrame( best_integer_copies[res_combine["pred_cnv"][:,s], 0].reshape(1,-1), index=[f"clone{cid} A"], columns=np.arange(n_obs) ) ) - allele_specific_copy.append( pd.DataFrame( best_integer_copies[res_combine["pred_cnv"][:,s], 1].reshape(1,-1), index=[f"clone{cid} B"], columns=np.arange(n_obs) ) ) + print( + f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}" + ) + + allele_specific_copy.append( + pd.DataFrame( + best_integer_copies[ + res_combine["pred_cnv"][:, s], 0 + ].reshape(1, -1), + index=[f"clone{cid} A"], + columns=np.arange(n_obs), + ) + ) + allele_specific_copy.append( + pd.DataFrame( + best_integer_copies[ + res_combine["pred_cnv"][:, s], 1 + ].reshape(1, -1), + index=[f"clone{cid} B"], + columns=np.arange(n_obs), + ) + ) # - state_cnv.append( pd.DataFrame( res_combine["new_log_mu"][:,s].reshape(-1,1), columns=[f"clone{cid} logmu"], index=np.arange(config['n_states']) ) ) - state_cnv.append( pd.DataFrame( res_combine["new_p_binom"][:,s].reshape(-1,1), columns=[f"clone{cid} p"], index=np.arange(config['n_states']) ) ) - state_cnv.append( pd.DataFrame( best_integer_copies[:,0].reshape(-1,1), columns=[f"clone{cid} A"], index=np.arange(config['n_states']) ) ) - state_cnv.append( pd.DataFrame( best_integer_copies[:,1].reshape(-1,1), columns=[f"clone{cid} B"], index=np.arange(config['n_states']) ) ) + state_cnv.append( + pd.DataFrame( + res_combine["new_log_mu"][:, s].reshape(-1, 1), + columns=[f"clone{cid} logmu"], + index=np.arange(config["n_states"]), + ) + ) + state_cnv.append( + pd.DataFrame( + res_combine["new_p_binom"][:, s].reshape(-1, 1), + columns=[f"clone{cid} p"], + index=np.arange(config["n_states"]), + ) + ) + state_cnv.append( + pd.DataFrame( + best_integer_copies[:, 0].reshape(-1, 1), + columns=[f"clone{cid} A"], + index=np.arange(config["n_states"]), + ) + ) + state_cnv.append( + pd.DataFrame( + best_integer_copies[:, 1].reshape(-1, 1), + columns=[f"clone{cid} B"], + index=np.arange(config["n_states"]), + ) + ) # - 
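# A minimal, self-contained sketch (with made-up values) of the log_mu rescaling used above:
# after the adjustment, the baseline-weighted mean RDR over bins equals 1, matching the
# "sum_bin lambda * np.exp(log_mu) = 1" comment. All arrays here are hypothetical.
import numpy as np

log_mu = np.array([-0.5, 0.0, 0.4])                            # hypothetical per-state log RDR
pred_cnv = np.array([0, 1, 1, 2, 2, 2])                        # hypothetical decoded state per bin
base_nb_mean = np.array([10.0, 20.0, 30.0, 15.0, 5.0, 20.0])   # hypothetical per-bin baseline
lambd = base_nb_mean / np.sum(base_nb_mean)
adjusted_log_mu = np.log(np.exp(log_mu) / np.sum(np.exp(log_mu[pred_cnv]) * lambd))
assert np.isclose(np.sum(np.exp(adjusted_log_mu[pred_cnv]) * lambd), 1.0)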
tmpdf = get_genelevel_cnv_oneclone(best_integer_copies[res_combine["pred_cnv"][:,s], 0], best_integer_copies[res_combine["pred_cnv"][:,s], 1], x_gene_list) + tmpdf = get_genelevel_cnv_oneclone( + best_integer_copies[res_combine["pred_cnv"][:, s], 0], + best_integer_copies[res_combine["pred_cnv"][:, s], 1], + x_gene_list, + ) tmpdf.columns = [f"clone{s} A", f"clone{s} B"] if df_genelevel_cnv is None: df_genelevel_cnv = copy.copy(tmpdf) @@ -424,24 +1116,62 @@ def main(configuration_file): continue # output gene-level copy number - df_genelevel_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_genelevel.tsv", header=True, index=True, sep="\t") + df_genelevel_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_genelevel.tsv", + header=True, + index=True, + sep="\t", + ) # output segment-level copy number allele_specific_copy = pd.concat(allele_specific_copy) - df_seglevel_cnv = pd.DataFrame({"CHR":df_bininfo.CHR.values, "START":df_bininfo.START.values, "END":df_bininfo.END.values }) - df_seglevel_cnv = df_seglevel_cnv.join( allele_specific_copy.T ) - df_seglevel_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_seglevel.tsv", header=True, index=False, sep="\t") + df_seglevel_cnv = pd.DataFrame( + { + "CHR": df_bininfo.CHR.values, + "START": df_bininfo.START.values, + "END": df_bininfo.END.values, + } + ) + df_seglevel_cnv = df_seglevel_cnv.join(allele_specific_copy.T) + df_seglevel_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_seglevel.tsv", + header=True, + index=False, + sep="\t", + ) # output per-state copy number - state_cnv = functools.reduce(lambda left,right: pd.merge(left,right, left_index=True, right_index=True, how='inner'), state_cnv) - state_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_perstate.tsv", header=True, index=False, sep="\t") - # summarize to cna events - df_event = summary_events(f"{outdir}/cnv{medfix[o]}_seglevel.tsv", f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz") - df_event.to_csv(f"{outdir}/cnv{medfix[o]}_event.tsv", header=True, index=False, sep="\t") - + state_cnv = functools.reduce( + lambda left, right: pd.merge( + left, right, left_index=True, right_index=True, how="inner" + ), + state_cnv, + ) + state_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_perstate.tsv", + header=True, + index=False, + sep="\t", + ) + # summarize to cna events + df_event = summary_events( + f"{outdir}/cnv{medfix[o]}_seglevel.tsv", + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + ) + df_event.to_csv( + f"{outdir}/cnv{medfix[o]}_event.tsv", + header=True, + index=False, + sep="\t", + ) + ##### output clone label ##### - df_clone_label = pd.DataFrame({"clone_label":res_combine["new_assignment"]}, index=barcodes) + df_clone_label = pd.DataFrame( + {"clone_label": res_combine["new_assignment"]}, index=barcodes + ) if not config["tumorprop_file"] is None: df_clone_label["tumor_proportion"] = single_tumor_prop - df_clone_label.to_csv(f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t") + df_clone_label.to_csv( + f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t" + ) ##### plotting ##### # make a directory for plots @@ -450,44 +1180,143 @@ def main(configuration_file): # plot RDR and BAF cn_file = f"{outdir}/cnv_diploid_seglevel.tsv" - fig = plot_rdr_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, remove_xticks=True, rdr_ylim=5, chrtext_shift=-0.3, base_height=3.2, pointsize=30, palette="tab10") - fig.savefig(f"{outdir}/plots/rdr_baf_defaultcolor.pdf", transparent=True, bbox_inches="tight") + fig = plot_rdr_baf( + configuration_file, + r_hmrf_initialization, + cn_file, + 
clone_ids=None, + remove_xticks=True, + rdr_ylim=5, + chrtext_shift=-0.3, + base_height=3.2, + pointsize=30, + palette="tab10", + ) + fig.savefig( + f"{outdir}/plots/rdr_baf_defaultcolor.pdf", + transparent=True, + bbox_inches="tight", + ) # plot allele-specific copy number - for o,max_medploidy in enumerate([None, 2, 3, 4]): + for o, max_medploidy in enumerate([None, 2, 3, 4]): cn_file = f"{outdir}/cnv{medfix[o]}_seglevel.tsv" if not Path(cn_file).exists(): continue df_cnv = pd.read_csv(cn_file, header=0, sep="\t") df_cnv = expand_df_cnv(df_cnv) - fig, axes = plt.subplots(1, 1, figsize=(15, 0.9*len(final_clone_ids) + 0.6), dpi=200, facecolor="white") - axes = plot_acn_from_df(df_cnv, axes, add_chrbar=True, add_arrow=True, chrbar_thickness=0.4/(0.6*len(final_clone_ids) + 0.4), add_legend=True, remove_xticks=True) + fig, axes = plt.subplots( + 1, + 1, + figsize=(15, 0.9 * len(final_clone_ids) + 0.6), + dpi=200, + facecolor="white", + ) + axes = plot_acn_from_df( + df_cnv, + axes, + add_chrbar=True, + add_arrow=True, + chrbar_thickness=0.4 / (0.6 * len(final_clone_ids) + 0.4), + add_legend=True, + remove_xticks=True, + ) fig.tight_layout() - fig.savefig(f"{outdir}/plots/acn_genome{medfix[o]}.pdf", transparent=True, bbox_inches="tight") + fig.savefig( + f"{outdir}/plots/acn_genome{medfix[o]}.pdf", + transparent=True, + bbox_inches="tight", + ) # additionally plot the allele-specific copy number per region if not config["supervision_clone_file"] is None: - fig, axes = plt.subplots(1, 1, figsize=(15, 0.6*len(unique_clone_ids) + 0.4), dpi=200, facecolor="white") + fig, axes = plt.subplots( + 1, + 1, + figsize=(15, 0.6 * len(unique_clone_ids) + 0.4), + dpi=200, + facecolor="white", + ) merged_df_cnv = pd.read_csv(cn_file, header=0, sep="\t") df_cnv = merged_df_cnv[["CHR", "START", "END"]] - df_cnv = df_cnv.join( pd.DataFrame({f"clone{x} A":merged_df_cnv[f"clone{res_combine['new_assignment'][i]} A"] for i,x in enumerate(unique_clone_ids)}) ) - df_cnv = df_cnv.join( pd.DataFrame({f"clone{x} B":merged_df_cnv[f"clone{res_combine['new_assignment'][i]} B"] for i,x in enumerate(unique_clone_ids)}) ) + df_cnv = df_cnv.join( + pd.DataFrame( + { + f"clone{x} A": merged_df_cnv[ + f"clone{res_combine['new_assignment'][i]} A" + ] + for i, x in enumerate(unique_clone_ids) + } + ) + ) + df_cnv = df_cnv.join( + pd.DataFrame( + { + f"clone{x} B": merged_df_cnv[ + f"clone{res_combine['new_assignment'][i]} B" + ] + for i, x in enumerate(unique_clone_ids) + } + ) + ) df_cnv = expand_df_cnv(df_cnv) - clone_ids = np.concatenate([ unique_clone_ids[res_combine["new_assignment"]==c].astype(str) for c in final_clone_ids ]) - axes = plot_acn_from_df(df_cnv, axes, clone_ids=clone_ids, clone_names=[f"region {x}" for x in clone_ids], add_chrbar=True, add_arrow=False, chrbar_thickness=0.4/(0.6*len(unique_clone_ids) + 0.4), add_legend=True, remove_xticks=True) + clone_ids = np.concatenate( + [ + unique_clone_ids[res_combine["new_assignment"] == c].astype( + str + ) + for c in final_clone_ids + ] + ) + axes = plot_acn_from_df( + df_cnv, + axes, + clone_ids=clone_ids, + clone_names=[f"region {x}" for x in clone_ids], + add_chrbar=True, + add_arrow=False, + chrbar_thickness=0.4 / (0.6 * len(unique_clone_ids) + 0.4), + add_legend=True, + remove_xticks=True, + ) fig.tight_layout() - fig.savefig(f"{outdir}/plots/acn_genome{medfix[o]}_per_region.pdf", transparent=True, bbox_inches="tight") + fig.savefig( + f"{outdir}/plots/acn_genome{medfix[o]}_per_region.pdf", + transparent=True, + bbox_inches="tight", + ) # plot clones in space if 
not config["supervision_clone_file"] is None: before_assignments = pd.Series([None] * before_coords.shape[0]) - for i,c in enumerate(unique_clone_ids): - before_assignments.iloc[before_df_clones.clone_id.isin([c])] = f"clone {res_combine['new_assignment'][i]}" - fig = plot_clones_in_space(before_coords, before_assignments, sample_list, before_sample_ids, palette="Set2", labels=unique_clone_ids, label_coords=coords, label_sample_ids=sample_ids) - fig.savefig(f"{outdir}/plots/clone_spatial.pdf", transparent=True, bbox_inches="tight") + for i, c in enumerate(unique_clone_ids): + before_assignments.iloc[before_df_clones.clone_id.isin([c])] = ( + f"clone {res_combine['new_assignment'][i]}" + ) + fig = plot_clones_in_space( + before_coords, + before_assignments, + sample_list, + before_sample_ids, + palette="Set2", + labels=unique_clone_ids, + label_coords=coords, + label_sample_ids=sample_ids, + ) + fig.savefig( + f"{outdir}/plots/clone_spatial.pdf", + transparent=True, + bbox_inches="tight", + ) else: - assignment = pd.Series([f"clone {x}" for x in res_combine["new_assignment"]]) + assignment = pd.Series( + [f"clone {x}" for x in res_combine["new_assignment"]] + ) fig = plot_clones_in_space(coords, assignment, axes, palette="Set2") - fig.savefig(f"{outdir}/plots/clone_spatial.pdf", transparent=True, bbox_inches="tight") + fig.savefig( + f"{outdir}/plots/clone_spatial.pdf", + transparent=True, + bbox_inches="tight", + ) if __name__ == "__main__": if len(sys.argv) > 1: - main(sys.argv[1]) \ No newline at end of file + main(sys.argv[1]) diff --git a/src/calicost/estimate_tumor_proportion.py b/src/calicost/estimate_tumor_proportion.py index 06d4caa..e61a795 100644 --- a/src/calicost/estimate_tumor_proportion.py +++ b/src/calicost/estimate_tumor_proportion.py @@ -4,7 +4,12 @@ import pandas as pd from pathlib import Path import logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) logger = logging.getLogger() import copy import functools @@ -22,99 +27,234 @@ def main(configuration_file): except: config = read_joint_configuration_file(configuration_file) - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ - barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config) - - single_base_nb_mean[:,:] = 0 + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + df_bininfo, + df_gene_snp, + barcodes, + coords, + single_tumor_prop, + sample_list, + sample_ids, + adjacency_mat, + smooth_mat, + exp_counts, + ) = run_parse_n_load(config) + + single_base_nb_mean[:, :] = 0 n_states_for_tumorprop = 5 n_clones_for_tumorprop = 3 - n_rdrclones_for_tumorprop = 3 #2 + n_rdrclones_for_tumorprop = 3 # 2 max_outer_iter_for_tumorprop = 10 max_iter_for_tumorprop = 20 MIN_PROP_UNCERTAINTY = 0.05 - initial_clone_index = rectangle_initialize_initial_clone(coords, n_clones_for_tumorprop, random_state=0) + initial_clone_index = rectangle_initialize_initial_clone( + coords, n_clones_for_tumorprop, random_state=0 + ) # save clone initialization into npz file prefix = "initialhmm" - if not Path(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz").exists(): + if not Path( + 
f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz" + ).exists(): initial_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"num_iterations":0, "round-1_assignment":initial_assignment} - np.savez(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz", **allres) + allres = {"num_iterations": 0, "round-1_assignment": initial_assignment} + np.savez( + f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz", + **allres, + ) + + hmrf_concatenate_pipeline( + config["output_dir"], + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + n_states=n_states_for_tumorprop, + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat, + adjacency_mat=adjacency_mat, + sample_ids=sample_ids, + max_iter_outer=max_outer_iter_for_tumorprop, + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + params="sp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=max_iter_for_tumorprop, + tol=config["tol"], + spatial_weight=config["spatial_weight"], + ) - hmrf_concatenate_pipeline(config['output_dir'], prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states=n_states_for_tumorprop, \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=max_outer_iter_for_tumorprop, nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=max_iter_for_tumorprop, tol=config["tol"], spatial_weight=config["spatial_weight"]) - - res = load_hmrf_last_iteration(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz") - merging_groups, merged_res = merge_by_minspots(res["new_assignment"], res, single_total_bb_RD, min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*single_X.shape[0]) + res = load_hmrf_last_iteration( + f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz" + ) + merging_groups, merged_res = merge_by_minspots( + res["new_assignment"], + res, + single_total_bb_RD, + min_spots_thresholds=config["min_spots_per_clone"], + min_umicount_thresholds=config["min_avgumi_per_clone"] * single_X.shape[0], + ) # further refine clones - combined_assignment = copy.copy(merged_res['new_assignment']) + combined_assignment = copy.copy(merged_res["new_assignment"]) offset_clone = 0 combined_p_binom = [] offset_state = 0 combined_pred_cnv = [] for bafc in range(len(merging_groups)): prefix = f"initialhmm_clone{bafc}" - idx_spots = np.where(merged_res['new_assignment'] == bafc)[0] + idx_spots = np.where(merged_res["new_assignment"] == bafc)[0] total_allele_count = np.sum(single_total_bb_RD[:, idx_spots]) - if total_allele_count < single_X.shape[0] * 50: # put a minimum B allele read count on pseudobulk to split 
clones + if ( + total_allele_count < single_X.shape[0] * 50 + ): # put a minimum B allele read count on pseudobulk to split clones combined_assignment[idx_spots] = offset_clone offset_clone += 1 - combined_p_binom.append(merged_res['new_p_binom']) - combined_pred_cnv.append(merged_res['pred_cnv'] + offset_state) - offset_state += merged_res['new_p_binom'].shape[0] + combined_p_binom.append(merged_res["new_p_binom"]) + combined_pred_cnv.append(merged_res["pred_cnv"] + offset_state) + offset_state += merged_res["new_p_binom"].shape[0] continue # initialize clone - initial_clone_index = rectangle_initialize_initial_clone(coords[idx_spots], n_rdrclones_for_tumorprop, random_state=0) + initial_clone_index = rectangle_initialize_initial_clone( + coords[idx_spots], n_rdrclones_for_tumorprop, random_state=0 + ) # save clone initialization into npz file - if not Path(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz").exists(): + if not Path( + f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz" + ).exists(): initial_assignment = np.zeros(len(idx_spots), dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"barcodes":barcodes[idx_spots], "num_iterations":0, "round-1_assignment":initial_assignment} - np.savez(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz", **allres) - + allres = { + "barcodes": barcodes[idx_spots], + "num_iterations": 0, + "round-1_assignment": initial_assignment, + } + np.savez( + f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz", + **allres, + ) + copy_slice_sample_ids = copy.copy(sample_ids[idx_spots]) - hmrf_concatenate_pipeline(config['output_dir'], prefix, single_X[:,:,idx_spots], lengths, single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], initial_clone_index, n_states=n_states_for_tumorprop, \ - log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat[np.ix_(idx_spots,idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots,idx_spots)], sample_ids=copy_slice_sample_ids, max_iter_outer=10, nodepotential=config["nodepotential"], \ - hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ - fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, max_iter=max_iter_for_tumorprop, tol=config["tol"], spatial_weight=config["spatial_weight"]) - - cloneres = load_hmrf_last_iteration(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz") - combined_assignment[idx_spots] = cloneres['new_assignment'] + offset_clone - offset_clone += np.max(cloneres['new_assignment']) + 1 - combined_p_binom.append(cloneres['new_p_binom']) - combined_pred_cnv.append(cloneres['pred_cnv'] + offset_state) - offset_state += cloneres['new_p_binom'].shape[0] + hmrf_concatenate_pipeline( + config["output_dir"], + prefix, + single_X[:, :, idx_spots], + lengths, + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + initial_clone_index, + n_states=n_states_for_tumorprop, + log_sitewise_transmat=log_sitewise_transmat, + smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], + adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], + sample_ids=copy_slice_sample_ids, + max_iter_outer=10, + nodepotential=config["nodepotential"], + hmmclass=hmm_nophasing_v2, + 
params="sp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + max_iter=max_iter_for_tumorprop, + tol=config["tol"], + spatial_weight=config["spatial_weight"], + ) + + cloneres = load_hmrf_last_iteration( + f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz" + ) + combined_assignment[idx_spots] = cloneres["new_assignment"] + offset_clone + offset_clone += np.max(cloneres["new_assignment"]) + 1 + combined_p_binom.append(cloneres["new_p_binom"]) + combined_pred_cnv.append(cloneres["pred_cnv"] + offset_state) + offset_state += cloneres["new_p_binom"].shape[0] combined_p_binom = np.vstack(combined_p_binom) combined_pred_cnv = np.concatenate(combined_pred_cnv) - normal_candidate = identify_normal_spots(single_X, single_total_bb_RD, merged_res['new_assignment'], merged_res['pred_cnv'], merged_res['new_p_binom'], min_count=single_X.shape[0] * 200) - loh_states, is_B_lost, rdr_values, clones_hightumor = identify_loh_per_clone(single_X, combined_assignment, combined_pred_cnv, combined_p_binom, normal_candidate, single_total_bb_RD) - assignments = pd.DataFrame({'coarse':merged_res['new_assignment'], 'combined':combined_assignment}) + normal_candidate = identify_normal_spots( + single_X, + single_total_bb_RD, + merged_res["new_assignment"], + merged_res["pred_cnv"], + merged_res["new_p_binom"], + min_count=single_X.shape[0] * 200, + ) + loh_states, is_B_lost, rdr_values, clones_hightumor = identify_loh_per_clone( + single_X, + combined_assignment, + combined_pred_cnv, + combined_p_binom, + normal_candidate, + single_total_bb_RD, + ) + assignments = pd.DataFrame( + {"coarse": merged_res["new_assignment"], "combined": combined_assignment} + ) # pool across adjacency spot to increase the UMIs covering LOH region - _, tp_smooth_mat = multislice_adjacency(sample_ids, sample_list, coords, single_total_bb_RD, exp_counts, - across_slice_adjacency_mat=None, construct_adjacency_method=config['construct_adjacency_method'], - maxspots_pooling=7, construct_adjacency_w=config['construct_adjacency_w']) - single_tumor_prop, _ = estimator_tumor_proportion(single_X, single_total_bb_RD, assignments, combined_pred_cnv, loh_states, is_B_lost, rdr_values, clones_hightumor, smooth_mat=tp_smooth_mat) + _, tp_smooth_mat = multislice_adjacency( + sample_ids, + sample_list, + coords, + single_total_bb_RD, + exp_counts, + across_slice_adjacency_mat=None, + construct_adjacency_method=config["construct_adjacency_method"], + maxspots_pooling=7, + construct_adjacency_w=config["construct_adjacency_w"], + ) + single_tumor_prop, _ = estimator_tumor_proportion( + single_X, + single_total_bb_RD, + assignments, + combined_pred_cnv, + loh_states, + is_B_lost, + rdr_values, + clones_hightumor, + smooth_mat=tp_smooth_mat, + ) # post-processing to remove negative tumor proportions - single_tumor_prop = np.where(single_tumor_prop < MIN_PROP_UNCERTAINTY, MIN_PROP_UNCERTAINTY, single_tumor_prop) + single_tumor_prop = np.where( + single_tumor_prop < MIN_PROP_UNCERTAINTY, + MIN_PROP_UNCERTAINTY, + single_tumor_prop, + ) single_tumor_prop[normal_candidate] = 0 # save single_tumor_prop to file - pd.DataFrame({"Tumor":single_tumor_prop}, index=barcodes).to_csv(f"{config['output_dir']}/loh_estimator_tumor_prop.tsv", header=True, sep="\t") + pd.DataFrame({"Tumor": single_tumor_prop}, 
index=barcodes).to_csv( + f"{config['output_dir']}/loh_estimator_tumor_prop.tsv", header=True, sep="\t" + ) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str) + parser.add_argument( + "-c", + "--configfile", + help="configuration file of CalicoST", + required=True, + type=str, + ) args = parser.parse_args() - main(args.configfile) \ No newline at end of file + main(args.configfile) diff --git a/src/calicost/find_integer_copynumber.py b/src/calicost/find_integer_copynumber.py index b1e41f6..020065b 100644 --- a/src/calicost/find_integer_copynumber.py +++ b/src/calicost/find_integer_copynumber.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd import scipy + # import gurobipy as gp # from gurobipy import GRB import copy @@ -70,23 +71,44 @@ # return best_integer_copies, best_obj -def find_diploid_balanced_state(new_log_mu, new_p_binom, pred_cnv, min_prop_threshold, EPS_BAF): +def find_diploid_balanced_state( + new_log_mu, new_p_binom, pred_cnv, min_prop_threshold, EPS_BAF +): n_states = len(new_log_mu) # find candidate diploid balanced state under the criteria that (1) #bins in that state > 0.1 * total #bins and (2) BAF is close to 0.5 by EPS_BAF distance - candidate = np.where( (np.bincount(pred_cnv, minlength=n_states) >= min_prop_threshold*len(pred_cnv)) & (np.abs(new_p_binom - 0.5) <= EPS_BAF) )[0] + candidate = np.where( + ( + np.bincount(pred_cnv, minlength=n_states) + >= min_prop_threshold * len(pred_cnv) + ) + & (np.abs(new_p_binom - 0.5) <= EPS_BAF) + )[0] if len(candidate) == 0: raise ValueError("No candidate diploid balanced state found!") else: # the diploid balanced states is the one in candidate with smallest new_log_mu - return candidate[ np.argmin(new_log_mu[candidate]) ] - - -def hill_climbing_integer_copynumber_fixdiploid(new_log_mu, base_nb_mean, new_p_binom, pred_cnv, max_allele_copy=5, max_total_copy=6, max_medploidy=4, \ - min_prop_threshold=0.1, EPS_BAF=0.05, nonbalance_bafdist=None, nondiploid_rdrdist=None, enforce_states={}): + return candidate[np.argmin(new_log_mu[candidate])] + + +def hill_climbing_integer_copynumber_fixdiploid( + new_log_mu, + base_nb_mean, + new_p_binom, + pred_cnv, + max_allele_copy=5, + max_total_copy=6, + max_medploidy=4, + min_prop_threshold=0.1, + EPS_BAF=0.05, + nonbalance_bafdist=None, + nondiploid_rdrdist=None, + enforce_states={}, +): n_states = len(new_log_mu) lambd = base_nb_mean / np.sum(base_nb_mean) - weight_per_state = np.array([ np.sum(lambd[pred_cnv == s]) for s in range(n_states)]) + weight_per_state = np.array([np.sum(lambd[pred_cnv == s]) for s in range(n_states)]) mu = np.exp(new_log_mu) + # def is_nondiploidnormal(k): if not nonbalance_bafdist is None: @@ -96,23 +118,37 @@ def is_nondiploidnormal(k): if np.abs(mu[k] - 1) > nondiploid_rdrdist: return True return False + # EPS_POINTS = 0.1 + def f(params, ploidy, scalefactor): # params of size (n_states, 2) - if np.any( np.sum(params, axis=1) == 0 ): + if np.any(np.sum(params, axis=1) == 0): return len(pred_cnv) * 1e6 frac_rdr = np.sum(params, axis=1) / scalefactor - frac_baf = params[:,0] / np.sum(params, axis=1) - points_per_state = np.bincount(pred_cnv, minlength=params.shape[0] ) + EPS_POINTS + frac_baf = params[:, 0] / np.sum(params, axis=1) + points_per_state = np.bincount(pred_cnv, minlength=params.shape[0]) + EPS_POINTS ### temp penalty ### mu_threshold = 0.3 - crucial_ordered_pairs_1 = (mu[:,None] - mu[None,:] > mu_threshold) * (np.sum(params, 
axis=1)[:,None] - np.sum(params, axis=1)[None,:] < 0) - crucial_ordered_pairs_2 = (mu[:,None] - mu[None,:] < -mu_threshold) * (np.sum(params, axis=1)[:,None] - np.sum(params, axis=1)[None,:] > 0) + crucial_ordered_pairs_1 = (mu[:, None] - mu[None, :] > mu_threshold) * ( + np.sum(params, axis=1)[:, None] - np.sum(params, axis=1)[None, :] < 0 + ) + crucial_ordered_pairs_2 = (mu[:, None] - mu[None, :] < -mu_threshold) * ( + np.sum(params, axis=1)[:, None] - np.sum(params, axis=1)[None, :] > 0 + ) # penalty on ploidy - derived_ploidy = np.sum(params, axis=1).dot(points_per_state) / np.sum(points_per_state, axis=0) - return np.square(0.3 * (mu - frac_rdr)).dot(points_per_state) + np.square(new_p_binom - frac_baf).dot(points_per_state) + \ - np.sum(crucial_ordered_pairs_1) * len(pred_cnv) + np.sum(crucial_ordered_pairs_2) * len(pred_cnv) + np.sum(derived_ploidy > ploidy + 0.5) * len(pred_cnv) + derived_ploidy = np.sum(params, axis=1).dot(points_per_state) / np.sum( + points_per_state, axis=0 + ) + return ( + np.square(0.3 * (mu - frac_rdr)).dot(points_per_state) + + np.square(new_p_binom - frac_baf).dot(points_per_state) + + np.sum(crucial_ordered_pairs_1) * len(pred_cnv) + + np.sum(crucial_ordered_pairs_2) * len(pred_cnv) + + np.sum(derived_ploidy > ploidy + 0.5) * len(pred_cnv) + ) + # def hill_climb(initial_params, ploidy, idx_diploid_normal, max_iter=10): scalefactor = 2.0 / mu[idx_diploid_normal] @@ -125,35 +161,51 @@ def hill_climb(initial_params, ploidy, idx_diploid_normal, max_iter=10): if k == idx_diploid_normal or k in enforce_states: continue this_best_obj = best_obj - this_best_k = copy.copy(params[k,:]) + this_best_k = copy.copy(params[k, :]) for candi in candidates: if is_nondiploidnormal(k) and candi[0] == 1 and candi[1] == 1: continue - params[k,:] = candi + params[k, :] = candi obj = f(params, ploidy, scalefactor) if obj < this_best_obj: this_best_obj = obj this_best_k = candi - increased = (increased | (this_best_obj < best_obj)) - params[k,:] = this_best_k + increased = increased | (this_best_obj < best_obj) + params[k, :] = this_best_k best_obj = this_best_obj if not increased: break return params, best_obj + # diploid normal state - idx_diploid_normal = find_diploid_balanced_state(new_log_mu, new_p_binom, pred_cnv, min_prop_threshold=min_prop_threshold, EPS_BAF=EPS_BAF) + idx_diploid_normal = find_diploid_balanced_state( + new_log_mu, + new_p_binom, + pred_cnv, + min_prop_threshold=min_prop_threshold, + EPS_BAF=EPS_BAF, + ) # candidate integer copy states - candidates = np.array([ [i,j] for i in range(max_allele_copy + 1) for j in range(max_allele_copy+1) if (not (i == 0 and j == 0)) and (i + j <= max_total_copy)]) + candidates = np.array( + [ + [i, j] + for i in range(max_allele_copy + 1) + for j in range(max_allele_copy + 1) + if (not (i == 0 and j == 0)) and (i + j <= max_total_copy) + ] + ) # find the best copy number states starting from various ploidy best_obj = np.inf best_integer_copies = np.zeros((n_states, 2), dtype=int) - for ploidy in range(1, max_medploidy+1): + for ploidy in range(1, max_medploidy + 1): # initial_params = np.array([ [1,1] if not is_nondiploidnormal(k) else [1,0] for k in range(n_states)], dtype=int) np.random.seed(0) for r in range(20): - initial_params = candidates[ np.random.randint(low=0, high=candidates.shape[0], size=n_states), : ] - initial_params[idx_diploid_normal] = np.array([1,1]) - for k,v in enforce_states.items(): + initial_params = candidates[ + np.random.randint(low=0, high=candidates.shape[0], size=n_states), : + ] + 
initial_params[idx_diploid_normal] = np.array([1, 1]) + for k, v in enforce_states.items(): initial_params[k] = v params, obj = hill_climb(initial_params, ploidy, idx_diploid_normal) if obj < best_obj: @@ -162,38 +214,66 @@ def hill_climb(initial_params, ploidy, idx_diploid_normal, max_iter=10): return best_integer_copies, best_obj -def hill_climbing_integer_copynumber_oneclone(new_log_mu, base_nb_mean, new_p_binom, pred_cnv, max_allele_copy=5, max_total_copy=6, max_medploidy=4, enforce_states={}, EPS_BAF=0.05): +def hill_climbing_integer_copynumber_oneclone( + new_log_mu, + base_nb_mean, + new_p_binom, + pred_cnv, + max_allele_copy=5, + max_total_copy=6, + max_medploidy=4, + enforce_states={}, + EPS_BAF=0.05, +): n_states = len(new_log_mu) lambd = base_nb_mean / np.sum(base_nb_mean) - weight_per_state = np.array([ np.sum(lambd[pred_cnv == s]) for s in range(n_states)]) + weight_per_state = np.array([np.sum(lambd[pred_cnv == s]) for s in range(n_states)]) mu = np.exp(new_log_mu) # EPS_POINTS = 0.1 + def f(params, ploidy): # params of size (n_states, 2) - if np.any( np.sum(params, axis=1) == 0 ): + if np.any(np.sum(params, axis=1) == 0): return len(pred_cnv) * 1e6 - denom = weight_per_state.dot( np.sum(params, axis=1) ) + denom = weight_per_state.dot(np.sum(params, axis=1)) frac_rdr = np.sum(params, axis=1) / denom - frac_baf = params[:,0] / np.sum(params, axis=1) - points_per_state = np.bincount(pred_cnv, minlength=params.shape[0] ) + EPS_POINTS + frac_baf = params[:, 0] / np.sum(params, axis=1) + points_per_state = np.bincount(pred_cnv, minlength=params.shape[0]) + EPS_POINTS ### temp penalty ### mu_threshold = 0.3 - crucial_ordered_pairs_1 = (mu[:,None] - mu[None,:] > mu_threshold) * (np.sum(params, axis=1)[:,None] - np.sum(params, axis=1)[None,:] < 0) - crucial_ordered_pairs_2 = (mu[:,None] - mu[None,:] < -mu_threshold) * (np.sum(params, axis=1)[:,None] - np.sum(params, axis=1)[None,:] > 0) + crucial_ordered_pairs_1 = (mu[:, None] - mu[None, :] > mu_threshold) * ( + np.sum(params, axis=1)[:, None] - np.sum(params, axis=1)[None, :] < 0 + ) + crucial_ordered_pairs_2 = (mu[:, None] - mu[None, :] < -mu_threshold) * ( + np.sum(params, axis=1)[:, None] - np.sum(params, axis=1)[None, :] > 0 + ) # penalty on setting unbalanced states when BAF is close to 0.5 - if np.sum(params[:,0] == params[:,1]) > 0: - baf_threshold = max(EPS_BAF, np.max(np.abs(new_p_binom[(params[:,0]==params[:,1])] - 0.5))) + if np.sum(params[:, 0] == params[:, 1]) > 0: + baf_threshold = max( + EPS_BAF, + np.max(np.abs(new_p_binom[(params[:, 0] == params[:, 1])] - 0.5)), + ) else: baf_threshold = EPS_BAF - unbalanced_penalty = (params[:,0] != params[:,1]).dot(np.abs(new_p_binom - 0.5) < baf_threshold) + unbalanced_penalty = (params[:, 0] != params[:, 1]).dot( + np.abs(new_p_binom - 0.5) < baf_threshold + ) # penalty on ploidy - derived_ploidy = np.sum(params, axis=1).dot(points_per_state) / np.sum(points_per_state, axis=0) - return np.square(0.3 * (mu - frac_rdr)).dot(points_per_state) + np.square(new_p_binom - frac_baf).dot(points_per_state) + \ - np.sum(crucial_ordered_pairs_1) * len(pred_cnv) + np.sum(crucial_ordered_pairs_2) * len(pred_cnv) + np.sum(derived_ploidy > ploidy + 0.5) * len(pred_cnv) + \ - unbalanced_penalty * len(pred_cnv) + derived_ploidy = np.sum(params, axis=1).dot(points_per_state) / np.sum( + points_per_state, axis=0 + ) + return ( + np.square(0.3 * (mu - frac_rdr)).dot(points_per_state) + + np.square(new_p_binom - frac_baf).dot(points_per_state) + + np.sum(crucial_ordered_pairs_1) * len(pred_cnv) + + 
np.sum(crucial_ordered_pairs_2) * len(pred_cnv) + + np.sum(derived_ploidy > ploidy + 0.5) * len(pred_cnv) + + unbalanced_penalty * len(pred_cnv) + ) ### end temp penalty ### # return np.abs(mu - frac_rdr).dot(points_per_state) + 5 * np.abs(new_p_binom - frac_baf).dot(points_per_state) + def hill_climb(initial_params, ploidy, max_iter=10): best_obj = f(initial_params, ploidy) params = copy.copy(initial_params) @@ -204,29 +284,37 @@ def hill_climb(initial_params, ploidy, max_iter=10): if k in enforce_states: continue this_best_obj = best_obj - this_best_k = copy.copy(params[k,:]) + this_best_k = copy.copy(params[k, :]) for candi in candidates: - params[k,:] = candi + params[k, :] = candi obj = f(params, ploidy) if obj < this_best_obj: # print(k, candi, obj, this_best_obj, ploidy+1, 0.1 * np.maximum(0, np.sum(params[k,:]) - ploidy-1) * np.sum(pred_cnv==k)) this_best_obj = obj this_best_k = candi - increased = (increased | (this_best_obj < best_obj)) - params[k,:] = this_best_k + increased = increased | (this_best_obj < best_obj) + params[k, :] = this_best_k best_obj = this_best_obj if not increased: break return params, best_obj + # candidate integer copy states - candidates = np.array([ [i,j] for i in range(max_allele_copy + 1) for j in range(max_allele_copy+1) if (not (i == 0 and j == 0)) and (i + j <= max_total_copy)]) + candidates = np.array( + [ + [i, j] + for i in range(max_allele_copy + 1) + for j in range(max_allele_copy + 1) + if (not (i == 0 and j == 0)) and (i + j <= max_total_copy) + ] + ) # find the best copy number states starting from various ploidy best_obj = np.inf best_integer_copies = np.zeros((n_states, 2), dtype=int) - for ploidy in range(1, max_medploidy+1): + for ploidy in range(1, max_medploidy + 1): initial_params = np.ones((n_states, 2), dtype=int) * int(ploidy / 2) initial_params[:, 1] = ploidy - initial_params[:, 0] - for k,v in enforce_states.items(): + for k, v in enforce_states.items(): initial_params[k] = v params, obj = hill_climb(initial_params, ploidy) if obj < best_obj: @@ -235,10 +323,18 @@ def hill_climb(initial_params, ploidy, max_iter=10): return best_integer_copies, best_obj -def hill_climbing_integer_copynumber_joint(new_log_mu, base_nb_mean, new_p_binom, pred_cnv, max_allele_copy=5, max_total_copy=6, max_medploidy=4): +def hill_climbing_integer_copynumber_joint( + new_log_mu, + base_nb_mean, + new_p_binom, + pred_cnv, + max_allele_copy=5, + max_total_copy=6, + max_medploidy=4, +): """ Jointly infer copy numbers across multiple clones, given they share the same set of new_log_mu and new_p_binom parameters. 
- + Attributes: ---------- new_log_mu : array of size (n_states, n_clones) @@ -255,27 +351,55 @@ def hill_climbing_integer_copynumber_joint(new_log_mu, base_nb_mean, new_p_binom """ n_states = new_log_mu.shape[0] n_clones = base_nb_mean.shape[1] - lambd = np.sum(base_nb_mean,axis=1) / np.sum(base_nb_mean) - weight_per_state = np.array([[ np.sum(lambd[pred_cnv[:,c] == s]) for s in range(n_states)] for c in range(n_clones)]).T # size of (n_states, n_clones) + lambd = np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean) + weight_per_state = np.array( + [ + [np.sum(lambd[pred_cnv[:, c] == s]) for s in range(n_states)] + for c in range(n_clones) + ] + ).T # size of (n_states, n_clones) mu = np.exp(new_log_mu) + def f(params, ploidy): # params of size (n_states, 2) - if np.any( np.sum(params, axis=1) == 0 ): + if np.any(np.sum(params, axis=1) == 0): return len(pred_cnv) * 1e6 - denom = weight_per_state.T.dot( np.sum(params, axis=1) ) # size of (n_clones,) - frac_rdr = np.sum(params, axis=1).reshape(-1,1) / denom.reshape(1,-1) # size of (n_states, n_clones) - frac_baf = params[:,0] / np.sum(params, axis=1) - points_per_state = np.vstack([ np.bincount(pred_cnv[:,c], minlength=params.shape[0]) for c in range(n_clones) ]).T # size of (n_states, n_clones) + denom = weight_per_state.T.dot(np.sum(params, axis=1)) # size of (n_clones,) + frac_rdr = np.sum(params, axis=1).reshape(-1, 1) / denom.reshape( + 1, -1 + ) # size of (n_states, n_clones) + frac_baf = params[:, 0] / np.sum(params, axis=1) + points_per_state = np.vstack( + [ + np.bincount(pred_cnv[:, c], minlength=params.shape[0]) + for c in range(n_clones) + ] + ).T # size of (n_states, n_clones) ### temp penalty ### mu_threshold = 0.3 - crucial_ordered_pairs_1 = (mu[:,0][:,None] - mu[:,0][None,:] > mu_threshold) * (np.sum(params, axis=1)[:,None] - np.sum(params, axis=1)[None,:] < 0) - crucial_ordered_pairs_2 = (mu[:,0][:,None] - mu[:,0][None,:] < -mu_threshold) * (np.sum(params, axis=1)[:,None] - np.sum(params, axis=1)[None,:] > 0) + crucial_ordered_pairs_1 = ( + mu[:, 0][:, None] - mu[:, 0][None, :] > mu_threshold + ) * (np.sum(params, axis=1)[:, None] - np.sum(params, axis=1)[None, :] < 0) + crucial_ordered_pairs_2 = ( + mu[:, 0][:, None] - mu[:, 0][None, :] < -mu_threshold + ) * (np.sum(params, axis=1)[:, None] - np.sum(params, axis=1)[None, :] > 0) # penalty on ploidy - derived_ploidy = np.median(np.sum(params, axis=1).dot(points_per_state) / np.sum(points_per_state, axis=0)) - return np.sum(np.square(0.3 * (mu - frac_rdr) * points_per_state)) + np.sum(np.square((new_p_binom - frac_baf).reshape(-1,1) * points_per_state)) + \ - np.sum(crucial_ordered_pairs_1) * np.prod(pred_cnv.shape) + np.sum(crucial_ordered_pairs_2) * np.prod(pred_cnv.shape) + np.sum(derived_ploidy > ploidy + 0.5) * np.prod(pred_cnv.shape) + derived_ploidy = np.median( + np.sum(params, axis=1).dot(points_per_state) + / np.sum(points_per_state, axis=0) + ) + return ( + np.sum(np.square(0.3 * (mu - frac_rdr) * points_per_state)) + + np.sum( + np.square((new_p_binom - frac_baf).reshape(-1, 1) * points_per_state) + ) + + np.sum(crucial_ordered_pairs_1) * np.prod(pred_cnv.shape) + + np.sum(crucial_ordered_pairs_2) * np.prod(pred_cnv.shape) + + np.sum(derived_ploidy > ploidy + 0.5) * np.prod(pred_cnv.shape) + ) ### end temp penalty ### # return np.abs(mu - frac_rdr).dot(points_per_state) + 5 * np.abs(new_p_binom - frac_baf).dot(points_per_state) + def hill_climb(initial_params, ploidy, max_iter=10): best_obj = f(initial_params, ploidy) params = copy.copy(initial_params) @@ -284,29 
+408,37 @@ def hill_climb(initial_params, ploidy, max_iter=10): increased = False for k in range(params.shape[0]): this_best_obj = best_obj - this_best_k = copy.copy(params[k,:]) + this_best_k = copy.copy(params[k, :]) for candi in candidates: - params[k,:] = candi + params[k, :] = candi obj = f(params, ploidy) if obj < this_best_obj: # print(k, candi, obj, this_best_obj, ploidy+1, 0.1 * np.maximum(0, np.sum(params[k,:]) - ploidy-1) * np.sum(pred_cnv==k)) this_best_obj = obj this_best_k = candi - increased = (increased | (this_best_obj < best_obj)) - params[k,:] = this_best_k + increased = increased | (this_best_obj < best_obj) + params[k, :] = this_best_k best_obj = this_best_obj if not increased: break return params, best_obj + # candidate integer copy states - candidates = np.array([ [i,j] for i in range(max_allele_copy + 1) for j in range(max_allele_copy+1) if (not (i == 0 and j == 0)) and (i + j <= max_total_copy)]) + candidates = np.array( + [ + [i, j] + for i in range(max_allele_copy + 1) + for j in range(max_allele_copy + 1) + if (not (i == 0 and j == 0)) and (i + j <= max_total_copy) + ] + ) # find the best copy number states starting from various ploidy best_obj = np.inf best_integer_copies = np.zeros((n_states, 2), dtype=int) # fix the genomic bin with the median new_log_mu to have exactly ploidy genomes # bidx_med = np.argsort(np.concatenate([ new_log_mu[pred_cnv[:,c],c] for c in range(n_clones) ]))[ int(len(pred_cnv.flatten())/2) ] # idx_med = pred_cnv.flatten(order="F")[bidx_med] - for ploidy in range(1, max_medploidy+1): + for ploidy in range(1, max_medploidy + 1): initial_params = np.ones((n_states, 2), dtype=int) * int(ploidy / 2) initial_params[:, 1] = ploidy - initial_params[:, 0] params, obj = hill_climb(initial_params, ploidy) @@ -318,17 +450,19 @@ def hill_climb(initial_params, ploidy, max_iter=10): def get_genelevel_cnv_oneclone(A_copy, B_copy, x_gene_list): map_gene_bin = {} - for i,x in enumerate(x_gene_list): + for i, x in enumerate(x_gene_list): this_genes = [z for z in x.split(" ") if z != ""] for g in this_genes: map_gene_bin[g] = i gene_list = np.sort(np.array(list(map_gene_bin.keys()))) - gene_level_copies = np.zeros( (len(gene_list), 2), dtype=int ) - for i,g in enumerate(gene_list): + gene_level_copies = np.zeros((len(gene_list), 2), dtype=int) + for i, g in enumerate(gene_list): idx = map_gene_bin[g] gene_level_copies[i, 0] = A_copy[idx] gene_level_copies[i, 1] = B_copy[idx] - return pd.DataFrame({"A":gene_level_copies[:,0], "B":gene_level_copies[:,1]}, index=gene_list) + return pd.DataFrame( + {"A": gene_level_copies[:, 0], "B": gene_level_copies[:, 1]}, index=gene_list + ) def convert_copy_to_states(A_copy, B_copy): @@ -336,11 +470,11 @@ def convert_copy_to_states(A_copy, B_copy): tmp = tmp[~np.isnan(tmp)] base_ploidy = np.median(tmp) coarse_states = np.array(["neutral"] * A_copy.shape[0]) - coarse_states[ (A_copy + B_copy < base_ploidy) & (A_copy != B_copy) ] = "del" - coarse_states[ (A_copy + B_copy < base_ploidy) & (A_copy == B_copy) ] = "bdel" - coarse_states[ (A_copy + B_copy > base_ploidy) & (A_copy != B_copy) ] = "amp" - coarse_states[ (A_copy + B_copy > base_ploidy) & (A_copy == B_copy) ] = "bamp" - coarse_states[ (A_copy + B_copy == base_ploidy) & (A_copy != B_copy) ] = "loh" + coarse_states[(A_copy + B_copy < base_ploidy) & (A_copy != B_copy)] = "del" + coarse_states[(A_copy + B_copy < base_ploidy) & (A_copy == B_copy)] = "bdel" + coarse_states[(A_copy + B_copy > base_ploidy) & (A_copy != B_copy)] = "amp" + coarse_states[(A_copy + B_copy > 
base_ploidy) & (A_copy == B_copy)] = "bamp" + coarse_states[(A_copy + B_copy == base_ploidy) & (A_copy != B_copy)] = "loh" coarse_states[coarse_states == "neutral"] = "neu" return coarse_states @@ -677,4 +811,4 @@ def composite_hmm_eval_objective(base_nb_mean, total_bb_RD, new_log_mu, new_scal # except AttributeError: # print('Encountered an attribute error') -""" \ No newline at end of file +""" diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index 2611340..2a262aa 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -19,8 +19,9 @@ # whole inference ############################################################ + class hmm_nophasing(object): - def __init__(self, params="stmp", t=1-1e-4): + def __init__(self, params="stmp", t=1 - 1e-4): """ Attributes ---------- @@ -32,9 +33,12 @@ def __init__(self, params="stmp", t=1-1e-4): """ self.params = params self.t = t + # @staticmethod - def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus): + def compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ): """ Attributes ---------- @@ -58,7 +62,7 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, t taus : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. - + Returns ---------- log_emission : array, shape (n_states, n_obs, n_spots) @@ -74,20 +78,40 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, t for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution - idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: - nb_mean = base_nb_mean[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + nb_mean = base_nb_mean[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( + X[idx_nonzero_rdr, 0, s], n, p + ) # AF from BetaBinom distribution - idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: - log_emission_baf[i, idx_nonzero_baf, s] = scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], p_binom[i, s] * taus[i, s], (1-p_binom[i, s]) * taus[i, s]) + log_emission_baf[i, idx_nonzero_baf, s] = ( + scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + p_binom[i, s] * taus[i, s], + (1 - p_binom[i, s]) * taus[i, s], + ) + ) return log_emission_rdr, log_emission_baf + # @staticmethod - def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs): + def compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + **kwargs, + ): """ Attributes ---------- @@ -111,7 +135,7 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alpha taus : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. 
- + Returns ---------- log_emission : array, shape (n_states, n_obs, n_spots) @@ -127,27 +151,47 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alpha for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution - idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: # nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[s]) - nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[idx_nonzero_rdr,s]) + nb_mean = base_nb_mean[idx_nonzero_rdr, s] * ( + tumor_prop[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) + + 1 + - tumor_prop[idx_nonzero_rdr, s] + ) nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( + X[idx_nonzero_rdr, 0, s], n, p + ) # AF from BetaBinom distribution - idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: # mix_p_A = p_binom[i, s] * tumor_prop[s] + 0.5 * (1 - tumor_prop[s]) # mix_p_B = (1 - p_binom[i, s]) * tumor_prop[s] + 0.5 * (1 - tumor_prop[s]) - mix_p_A = p_binom[i, s] * tumor_prop[idx_nonzero_baf,s] + 0.5 * (1 - tumor_prop[idx_nonzero_baf,s]) - mix_p_B = (1 - p_binom[i, s]) * tumor_prop[idx_nonzero_baf,s] + 0.5 * (1 - tumor_prop[idx_nonzero_baf,s]) - log_emission_baf[i, idx_nonzero_baf, s] += scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_A * taus[i, s], mix_p_B * taus[i, s]) + mix_p_A = p_binom[i, s] * tumor_prop[idx_nonzero_baf, s] + 0.5 * ( + 1 - tumor_prop[idx_nonzero_baf, s] + ) + mix_p_B = (1 - p_binom[i, s]) * tumor_prop[ + idx_nonzero_baf, s + ] + 0.5 * (1 - tumor_prop[idx_nonzero_baf, s]) + log_emission_baf[ + i, idx_nonzero_baf, s + ] += scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + mix_p_A * taus[i, s], + mix_p_B * taus[i, s], + ) return log_emission_rdr, log_emission_baf + # @staticmethod - @njit - def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' + @njit + def forward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ): + """ Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. Input lengths: sum of lengths = n_observations. @@ -156,32 +200,43 @@ def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_site log_emission: n_states * n_observations * n_spots. Log probability. Output log_alpha: size n_states * n_observations. log alpha[j, t] = log P(o_1, ... o_t, q_t = j | lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = log_emission.shape[0] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" 
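The forward pass that follows is the standard log-space recursion, log_alpha[j, t] = logsumexp_i(log_alpha[i, t-1] + log_transmat[i, j]) + sum_s log_emission[j, t, s], run independently over each segment in lengths. As a rough vectorized sketch of the same recursion for a single segment (illustrative only, not part of this patch; the function name is made up):

    import numpy as np
    from scipy.special import logsumexp

    def forward_log_alpha_sketch(log_startprob, log_transmat, log_emission):
        # log_emission: (n_states, n_obs, n_spots); spots are summed, mirroring
        # the per-spot sum done inside forward_lattice.
        n_states, n_obs, _ = log_emission.shape
        log_em = log_emission.sum(axis=2)
        log_alpha = np.zeros((n_states, n_obs))
        log_alpha[:, 0] = log_startprob + log_em[:, 0]
        for t in range(1, n_obs):
            # logsumexp over the previous state i, for every next state j
            log_alpha[:, t] = logsumexp(
                log_alpha[:, t - 1][:, None] + log_transmat, axis=0
            ) + log_em[:, t]
        return log_alpha

The @njit version in the patch keeps explicit loops and the mylogsumexp / np_sum_ax_squeeze helpers instead of relying on scipy.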
# initialize log_alpha log_alpha = np.zeros((log_emission.shape[0], n_obs)) buf = np.zeros(log_emission.shape[0]) cumlen = 0 for le in lengths: # start prob - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. - log_alpha[:, cumlen] = log_startprob + np_sum_ax_squeeze(log_emission[:, cumlen, :], axis=1) + log_alpha[:, cumlen] = log_startprob + np_sum_ax_squeeze( + log_emission[:, cumlen, :], axis=1 + ) for t in np.arange(1, le): for j in np.arange(log_emission.shape[0]): for i in np.arange(log_emission.shape[0]): buf[i] = log_alpha[i, (cumlen + t - 1)] + log_transmat[i, j] - log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum(log_emission[j, (cumlen + t), :]) + log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum( + log_emission[j, (cumlen + t), :] + ) cumlen += le return log_alpha + # @staticmethod - @njit - def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' + @njit + def backward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ): + """ Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. Input X: size n_observations * n_components * n_spots. @@ -191,33 +246,61 @@ def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sit log_emission: n_states * n_observations * n_spots. Log probability. Output log_beta: size 2*n_states * n_observations. log beta[i, t] = log P(o_{t+1}, ..., o_T | q_t = i, lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = log_emission.shape[0] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" # initialize log_beta log_beta = np.zeros((log_emission.shape[0], n_obs)) buf = np.zeros(log_emission.shape[0]) cumlen = 0 for le in lengths: # start prob - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. 
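The concern raised in the comment above can be made concrete with a back-of-the-envelope check (a rough illustration, not part of this patch; the per-spot gain is a made-up number): log emissions are summed over spots, so their contribution grows with the number of spots, while the log transition penalty stays fixed at roughly -log(1 - t).

    import numpy as np

    t = 1 - 1e-4                     # default self-transition probability
    switch_penalty = -np.log(1 - t)  # ~9.2 nats to leave the current state
    per_spot_gain = 0.3              # hypothetical per-spot log-emission advantage
    print(int(np.ceil(switch_penalty / per_spot_gain)))  # ~31 spots already override the prior

Once a few dozen spots are pooled, even weak per-spot evidence for a different state can outweigh the sticky transition prior.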
log_beta[:, (cumlen + le - 1)] = 0 - for t in np.arange(le-2, -1, -1): + for t in np.arange(le - 2, -1, -1): for i in np.arange(log_emission.shape[0]): for j in np.arange(log_emission.shape[0]): - buf[j] = log_beta[j, (cumlen + t + 1)] + log_transmat[i, j] + np.sum(log_emission[j, (cumlen + t + 1), :]) + buf[j] = ( + log_beta[j, (cumlen + t + 1)] + + log_transmat[i, j] + + np.sum(log_emission[j, (cumlen + t + 1), :]) + ) log_beta[i, (cumlen + t)] = mylogsumexp(buf) cumlen += le return log_beta # - def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat=None, tumor_prop=None, tp_weight_by_mu=None, \ - fix_NB_dispersion=False, shared_NB_dispersion=False, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - is_diag=False, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=100, tol=1e-4, **kwargs): - ''' + def run_baum_welch_nb_bb( + self, + X, + lengths, + n_states, + base_nb_mean, + total_bb_RD, + log_sitewise_transmat=None, + tumor_prop=None, + tp_weight_by_mu=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + is_diag=False, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + max_iter=100, + tol=1e-4, + **kwargs, + ): + """ Input X: size n_observations * n_components * n_spots. lengths: sum of lengths = n_observations. @@ -226,41 +309,84 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, Intermediate log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. alpha: size of n_states. Dispersioon parameter of each HMM state. - ''' + """ n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 # initialize NB logmean shift and BetaBinom prob - log_mu = np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu - p_binom = np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom + log_mu = ( + np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T + if init_log_mu is None + else init_log_mu + ) + p_binom = ( + np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T + if init_p_binom is None + else init_p_binom + ) # initialize (inverse of) dispersion param in NB and BetaBinom - alphas = 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + alphas = ( + 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus # initialize start probability and emission probability - log_startprob = np.log( np.ones(n_states) / n_states ) + log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: - transmat = np.ones((n_states, n_states)) * (1-self.t) / (n_states-1) + transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: - log_transmat = np.zeros((1,1)) + log_transmat = np.zeros((1, 1)) # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) - unique_values_nb, mapping_matrices_nb = construct_unique_matrix(X[:,0,:], base_nb_mean) - unique_values_bb, mapping_matrices_bb = construct_unique_matrix(X[:,1,:], total_bb_RD) + unique_values_nb, mapping_matrices_nb = construct_unique_matrix( + X[:, 0, :], base_nb_mean + ) + unique_values_bb, 
mapping_matrices_bb = construct_unique_matrix( + X[:, 1, :], total_bb_RD + ) # EM algorithm for r in trange(max_iter): # E step if tumor_prop is None: - log_emission_rdr, log_emission_baf = hmm_nophasing.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission_rdr, log_emission_baf = ( + hmm_nophasing.compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ) + ) log_emission = log_emission_rdr + log_emission_baf else: - log_emission_rdr, log_emission_baf = hmm_nophasing.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop) + log_emission_rdr, log_emission_baf = ( + hmm_nophasing.compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + ) + ) log_emission = log_emission_rdr + log_emission_baf - log_alpha = hmm_nophasing.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) - log_beta = hmm_nophasing.backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_alpha = hmm_nophasing.forward_lattice( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) + log_beta = hmm_nophasing.backward_lattice( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, log_emission) + log_xi = compute_posterior_transition_nophasing( + log_alpha, log_beta, log_transmat, log_emission + ) # M step if "s" in self.params: new_log_startprob = update_startprob_nophasing(lengths, log_gamma) @@ -273,32 +399,75 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, new_log_transmat = log_transmat if "m" in self.params: if tumor_prop is None: - new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + new_log_mu, new_alphas = ( + update_emission_params_nb_nophasing_uniqvalues( + unique_values_nb, + mapping_matrices_nb, + log_gamma, + alphas, + start_log_mu=log_mu, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + ) + ) else: - new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + new_log_mu, new_alphas = ( + update_emission_params_nb_nophasing_uniqvalues_mix( + unique_values_nb, + mapping_matrices_nb, + log_gamma, + alphas, + tumor_prop, + start_log_mu=log_mu, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + ) + ) else: new_log_mu = log_mu new_alphas = alphas if "p" in self.params: if tumor_prop is None: - new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + new_p_binom, new_taus = ( + update_emission_params_bb_nophasing_uniqvalues( + unique_values_bb, + mapping_matrices_bb, + log_gamma, + taus, + start_p_binom=p_binom, + 
fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + ) + ) else: - new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, tumor_prop, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + new_p_binom, new_taus = ( + update_emission_params_bb_nophasing_uniqvalues_mix( + unique_values_bb, + mapping_matrices_bb, + log_gamma, + taus, + tumor_prop, + start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + ) + ) else: new_p_binom = p_binom new_taus = taus # check convergence - print( np.mean(np.abs( np.exp(new_log_startprob) - np.exp(log_startprob) )), \ - np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )), \ - np.mean(np.abs(new_log_mu - log_mu)),\ - np.mean(np.abs(new_p_binom - p_binom)) ) - print( np.hstack([new_log_mu, new_p_binom]) ) - if np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )) < tol and \ - np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol: + print( + np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), + np.mean(np.abs(new_log_mu - log_mu)), + np.mean(np.abs(new_p_binom - p_binom)), + ) + print(np.hstack([new_log_mu, new_p_binom])) + if ( + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol + and np.mean(np.abs(new_log_mu - log_mu)) < tol + and np.mean(np.abs(new_p_binom - p_binom)) < tol + ): break log_startprob = new_log_startprob log_transmat = new_log_transmat @@ -306,6 +475,12 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, alphas = new_alphas p_binom = new_p_binom taus = new_taus - return new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma - - + return ( + new_log_mu, + new_alphas, + new_p_binom, + new_taus, + new_log_startprob, + new_log_transmat, + log_gamma, + ) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index d5a9145..2563834 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -22,8 +22,9 @@ # whole inference ############################################################ + class hmm_nophasing_v2(object): - def __init__(self, params="stmp", t=1-1e-4): + def __init__(self, params="stmp", t=1 - 1e-4): """ Attributes ---------- @@ -35,9 +36,12 @@ def __init__(self, params="stmp", t=1-1e-4): """ self.params = params self.t = t + # @staticmethod - def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus): + def compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ): """ Attributes ---------- @@ -61,7 +65,7 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, t taus : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. 
- + Returns ---------- log_emission : array, shape (n_states, n_obs, n_spots) @@ -77,20 +81,40 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, t for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution - idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: - nb_mean = base_nb_mean[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + nb_mean = base_nb_mean[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( + X[idx_nonzero_rdr, 0, s], n, p + ) # AF from BetaBinom distribution - idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: - log_emission_baf[i, idx_nonzero_baf, s] = scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], p_binom[i, s] * taus[i, s], (1-p_binom[i, s]) * taus[i, s]) + log_emission_baf[i, idx_nonzero_baf, s] = ( + scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + p_binom[i, s] * taus[i, s], + (1 - p_binom[i, s]) * taus[i, s], + ) + ) return log_emission_rdr, log_emission_baf + # @staticmethod - def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs): + def compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + **kwargs, + ): """ Attributes ---------- @@ -114,7 +138,7 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alpha taus : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. 
- + Returns ---------- log_emission : array, shape (n_states, n_obs, n_spots) @@ -130,34 +154,63 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alpha for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution - idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: # nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[s]) - nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[idx_nonzero_rdr,s]) + nb_mean = base_nb_mean[idx_nonzero_rdr, s] * ( + tumor_prop[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) + + 1 + - tumor_prop[idx_nonzero_rdr, s] + ) nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( + X[idx_nonzero_rdr, 0, s], n, p + ) # AF from BetaBinom distribution if ("logmu_shift" in kwargs) and ("sample_length" in kwargs): this_weighted_tp = [] for c in range(len(kwargs["sample_length"])): range_s = np.sum(kwargs["sample_length"][:c]) - range_t = np.sum(kwargs["sample_length"][:(c+1)]) - this_weighted_tp.append( tumor_prop[range_s:range_t,s] * np.exp(log_mu[i, s] - kwargs["logmu_shift"][c,s]) / (tumor_prop[range_s:range_t,s] * np.exp(log_mu[i, s] - kwargs["logmu_shift"][c,s]) + 1 - tumor_prop[range_s:range_t,s]) ) + range_t = np.sum(kwargs["sample_length"][: (c + 1)]) + this_weighted_tp.append( + tumor_prop[range_s:range_t, s] + * np.exp(log_mu[i, s] - kwargs["logmu_shift"][c, s]) + / ( + tumor_prop[range_s:range_t, s] + * np.exp(log_mu[i, s] - kwargs["logmu_shift"][c, s]) + + 1 + - tumor_prop[range_s:range_t, s] + ) + ) this_weighted_tp = np.concatenate(this_weighted_tp) else: - this_weighted_tp = tumor_prop[:,s] - idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + this_weighted_tp = tumor_prop[:, s] + idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: - mix_p_A = p_binom[i, s] * this_weighted_tp[idx_nonzero_baf] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf]) - mix_p_B = (1 - p_binom[i, s]) * this_weighted_tp[idx_nonzero_baf] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf]) - log_emission_baf[i, idx_nonzero_baf, s] += scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_A * taus[i, s], mix_p_B * taus[i, s]) + mix_p_A = p_binom[i, s] * this_weighted_tp[ + idx_nonzero_baf + ] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf]) + mix_p_B = (1 - p_binom[i, s]) * this_weighted_tp[ + idx_nonzero_baf + ] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf]) + log_emission_baf[ + i, idx_nonzero_baf, s + ] += scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + mix_p_A * taus[i, s], + mix_p_B * taus[i, s], + ) return log_emission_rdr, log_emission_baf + # @staticmethod - @njit - def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' + @njit + def forward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ): + """ Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. Input lengths: sum of lengths = n_observations. 
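In compute_emission_probability_nb_betabinom_mix above, the spot-level tumor proportion is first converted to a read-level weight using the state's RDR shift, w = tp * exp(log_mu - logmu_shift) / (tp * exp(log_mu - logmu_shift) + 1 - tp), and the BetaBinomial success probability of the mixture is then p * w + 0.5 * (1 - w). A minimal single-bin sketch of that mixing (illustrative only, not part of this patch; names and example numbers are made up):

    import numpy as np

    def mixture_baf_sketch(p_binom, tumor_prop, log_mu=0.0, logmu_shift=0.0):
        # Read-weighted tumor proportion: tumor reads scale with exp(log_mu - logmu_shift).
        w = tumor_prop * np.exp(log_mu - logmu_shift)
        w = w / (w + 1.0 - tumor_prop)
        # Convex combination of the tumor BAF and the normal BAF of 0.5.
        mix_p_A = p_binom * w + 0.5 * (1.0 - w)
        mix_p_B = (1.0 - p_binom) * w + 0.5 * (1.0 - w)
        return mix_p_A, mix_p_B

    # e.g. a single-copy gain (tumor BAF ~ 1/3) in a spot with 60% tumor cells,
    # ignoring the RDR re-weighting (log_mu == logmu_shift):
    a, b = mixture_baf_sketch(1 / 3, 0.6)
    print(round(a, 2), round(b, 2))  # 0.4 0.6

This is what pulls observed BAFs toward 0.5 in low-purity spots, which is why the tumor proportion enters both the NB mean and the BetaBinomial probabilities in this emission model.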
@@ -166,32 +219,43 @@ def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_site log_emission: n_states * n_observations * n_spots. Log probability. Output log_alpha: size n_states * n_observations. log alpha[j, t] = log P(o_1, ... o_t, q_t = j | lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = log_emission.shape[0] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" # initialize log_alpha log_alpha = np.zeros((log_emission.shape[0], n_obs)) buf = np.zeros(log_emission.shape[0]) cumlen = 0 for le in lengths: # start prob - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. - log_alpha[:, cumlen] = log_startprob + np_sum_ax_squeeze(log_emission[:, cumlen, :], axis=1) + log_alpha[:, cumlen] = log_startprob + np_sum_ax_squeeze( + log_emission[:, cumlen, :], axis=1 + ) for t in np.arange(1, le): for j in np.arange(log_emission.shape[0]): for i in np.arange(log_emission.shape[0]): buf[i] = log_alpha[i, (cumlen + t - 1)] + log_transmat[i, j] - log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum(log_emission[j, (cumlen + t), :]) + log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum( + log_emission[j, (cumlen + t), :] + ) cumlen += le return log_alpha + # @staticmethod - @njit - def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' + @njit + def backward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ): + """ Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. Input X: size n_observations * n_components * n_spots. @@ -201,33 +265,60 @@ def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sit log_emission: n_states * n_observations * n_spots. Log probability. Output log_beta: size 2*n_states * n_observations. log beta[i, t] = log P(o_{t+1}, ..., o_T | q_t = i, lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = log_emission.shape[0] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" # initialize log_beta log_beta = np.zeros((log_emission.shape[0], n_obs)) buf = np.zeros(log_emission.shape[0]) cumlen = 0 for le in lengths: # start prob - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. 
# But adding too many spots may lead to a higher weight of the emission rather then transition prob. log_beta[:, (cumlen + le - 1)] = 0 - for t in np.arange(le-2, -1, -1): + for t in np.arange(le - 2, -1, -1): for i in np.arange(log_emission.shape[0]): for j in np.arange(log_emission.shape[0]): - buf[j] = log_beta[j, (cumlen + t + 1)] + log_transmat[i, j] + np.sum(log_emission[j, (cumlen + t + 1), :]) + buf[j] = ( + log_beta[j, (cumlen + t + 1)] + + log_transmat[i, j] + + np.sum(log_emission[j, (cumlen + t + 1), :]) + ) log_beta[i, (cumlen + t)] = mylogsumexp(buf) cumlen += le return log_beta # - def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat=None, tumor_prop=None, \ - fix_NB_dispersion=False, shared_NB_dispersion=False, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - is_diag=False, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=100, tol=1e-4, **kwargs): - ''' + def run_baum_welch_nb_bb( + self, + X, + lengths, + n_states, + base_nb_mean, + total_bb_RD, + log_sitewise_transmat=None, + tumor_prop=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + is_diag=False, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + max_iter=100, + tol=1e-4, + **kwargs, + ): + """ Input X: size n_observations * n_components * n_spots. lengths: sum of lengths = n_observations. @@ -236,52 +327,125 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, Intermediate log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. alpha: size of n_states. Dispersioon parameter of each HMM state. - ''' + """ n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 # initialize NB logmean shift and BetaBinom prob - log_mu = np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu - p_binom = np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom + log_mu = ( + np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T + if init_log_mu is None + else init_log_mu + ) + p_binom = ( + np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T + if init_p_binom is None + else init_p_binom + ) # initialize (inverse of) dispersion param in NB and BetaBinom - alphas = 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + alphas = ( + 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus # initialize start probability and emission probability - log_startprob = np.log( np.ones(n_states) / n_states ) + log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: - transmat = np.ones((n_states, n_states)) * (1-self.t) / (n_states-1) + transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: - log_transmat = np.zeros((1,1)) + log_transmat = np.zeros((1, 1)) # initialize log_gamma log_gamma = kwargs["log_gamma"] if "log_gamma" in kwargs else None # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) - unique_values_nb, mapping_matrices_nb = construct_unique_matrix(X[:,0,:], base_nb_mean) - unique_values_bb, mapping_matrices_bb = 
construct_unique_matrix(X[:,1,:], total_bb_RD) + unique_values_nb, mapping_matrices_nb = construct_unique_matrix( + X[:, 0, :], base_nb_mean + ) + unique_values_bb, mapping_matrices_bb = construct_unique_matrix( + X[:, 1, :], total_bb_RD + ) # EM algorithm for r in trange(max_iter): # E step if tumor_prop is None: - log_emission_rdr, log_emission_baf = hmm_nophasing_v2.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission_rdr, log_emission_baf = ( + hmm_nophasing_v2.compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ) + ) log_emission = log_emission_rdr + log_emission_baf else: # compute mu as adjusted RDR if ((not log_gamma is None) or (r > 0)) and ("m" in self.params): logmu_shift = [] for c in range(len(kwargs["sample_length"])): - this_pred_cnv = np.argmax(log_gamma[:,np.sum(kwargs["sample_length"][:c]):np.sum(kwargs["sample_length"][:(c+1)])], axis=0)%n_states - logmu_shift.append( scipy.special.logsumexp(log_mu[this_pred_cnv,:] + np.log(kwargs["lambd"]).reshape(-1,1), axis=0) ) + this_pred_cnv = ( + np.argmax( + log_gamma[ + :, + np.sum(kwargs["sample_length"][:c]) : np.sum( + kwargs["sample_length"][: (c + 1)] + ), + ], + axis=0, + ) + % n_states + ) + logmu_shift.append( + scipy.special.logsumexp( + log_mu[this_pred_cnv, :] + + np.log(kwargs["lambd"]).reshape(-1, 1), + axis=0, + ) + ) logmu_shift = np.vstack(logmu_shift) - log_emission_rdr, log_emission_baf = hmm_nophasing_v2.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, logmu_shift=logmu_shift, sample_length=kwargs["sample_length"]) + log_emission_rdr, log_emission_baf = ( + hmm_nophasing_v2.compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + logmu_shift=logmu_shift, + sample_length=kwargs["sample_length"], + ) + ) else: - log_emission_rdr, log_emission_baf = hmm_nophasing_v2.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop) + log_emission_rdr, log_emission_baf = ( + hmm_nophasing_v2.compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + ) + ) log_emission = log_emission_rdr + log_emission_baf - log_alpha = hmm_nophasing_v2.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) - log_beta = hmm_nophasing_v2.backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_alpha = hmm_nophasing_v2.forward_lattice( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) + log_beta = hmm_nophasing_v2.backward_lattice( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, log_emission) + log_xi = compute_posterior_transition_nophasing( + log_alpha, log_beta, log_transmat, log_emission + ) # M step if "s" in self.params: new_log_startprob = update_startprob_nophasing(lengths, log_gamma) @@ -294,42 +458,106 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, new_log_transmat = log_transmat if "m" in self.params: if tumor_prop is None: - new_log_mu, new_alphas = 
update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + new_log_mu, new_alphas = ( + update_emission_params_nb_nophasing_uniqvalues( + unique_values_nb, + mapping_matrices_nb, + log_gamma, + alphas, + start_log_mu=log_mu, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + ) + ) else: - new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + new_log_mu, new_alphas = ( + update_emission_params_nb_nophasing_uniqvalues_mix( + unique_values_nb, + mapping_matrices_nb, + log_gamma, + alphas, + tumor_prop, + start_log_mu=log_mu, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + ) + ) else: new_log_mu = log_mu new_alphas = alphas if "p" in self.params: if tumor_prop is None: - new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + new_p_binom, new_taus = ( + update_emission_params_bb_nophasing_uniqvalues( + unique_values_bb, + mapping_matrices_bb, + log_gamma, + taus, + start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + ) + ) else: # compute mu as adjusted RDR - if ("m" in self.params): + if "m" in self.params: mu = [] for c in range(len(kwargs["sample_length"])): - this_pred_cnv = np.argmax(log_gamma[:,np.sum(kwargs["sample_length"][:c]):np.sum(kwargs["sample_length"][:(c+1)])], axis=0)%n_states - mu.append( np.exp(new_log_mu[this_pred_cnv,:]) / np.sum(np.exp(new_log_mu[this_pred_cnv,:]) * kwargs["lambd"].reshape(-1,1), axis=0, keepdims=True) ) + this_pred_cnv = ( + np.argmax( + log_gamma[ + :, + np.sum(kwargs["sample_length"][:c]) : np.sum( + kwargs["sample_length"][: (c + 1)] + ), + ], + axis=0, + ) + % n_states + ) + mu.append( + np.exp(new_log_mu[this_pred_cnv, :]) + / np.sum( + np.exp(new_log_mu[this_pred_cnv, :]) + * kwargs["lambd"].reshape(-1, 1), + axis=0, + keepdims=True, + ) + ) mu = np.vstack(mu) - weighted_tp = (tumor_prop * mu) / (tumor_prop * mu + 1 - tumor_prop) + weighted_tp = (tumor_prop * mu) / ( + tumor_prop * mu + 1 - tumor_prop + ) else: weighted_tp = tumor_prop - new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, weighted_tp, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + new_p_binom, new_taus = ( + update_emission_params_bb_nophasing_uniqvalues_mix( + unique_values_bb, + mapping_matrices_bb, + log_gamma, + taus, + weighted_tp, + start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + ) + ) else: new_p_binom = p_binom new_taus = taus # check convergence - print( np.mean(np.abs( np.exp(new_log_startprob) - np.exp(log_startprob) )), \ - np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )), \ - np.mean(np.abs(new_log_mu - log_mu)),\ - np.mean(np.abs(new_p_binom - p_binom)) ) - print( np.hstack([new_log_mu, new_p_binom]) ) - if np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )) < tol and \ - 
np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol: + print( + np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), + np.mean(np.abs(new_log_mu - log_mu)), + np.mean(np.abs(new_p_binom - p_binom)), + ) + print(np.hstack([new_log_mu, new_p_binom])) + if ( + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol + and np.mean(np.abs(new_log_mu - log_mu)) < tol + and np.mean(np.abs(new_p_binom - p_binom)) < tol + ): break log_startprob = new_log_startprob log_transmat = new_log_transmat @@ -337,6 +565,12 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, alphas = new_alphas p_binom = new_p_binom taus = new_taus - return new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma - - + return ( + new_log_mu, + new_alphas, + new_p_binom, + new_taus, + new_log_startprob, + new_log_transmat, + log_gamma, + ) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 630651f..0d26b70 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -21,8 +21,9 @@ # whole inference ############################################################ + class hmm_sitewise(object): - def __init__(self, params="stmp", t=1-1e-4): + def __init__(self, params="stmp", t=1 - 1e-4): """ Attributes ---------- @@ -34,9 +35,12 @@ def __init__(self, params="stmp", t=1-1e-4): """ self.params = params self.t = t + # @staticmethod - def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus): + def compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ): """ Attributes ---------- @@ -60,7 +64,7 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, t taus : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. 
- + Returns ---------- log_emission : array, shape (2*n_states, n_obs, n_spots) @@ -76,22 +80,51 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, t for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution - idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: - nb_mean = base_nb_mean[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + nb_mean = base_nb_mean[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) - log_emission_rdr[i + n_states, idx_nonzero_rdr, s] = log_emission_rdr[i, idx_nonzero_rdr, s] + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( + X[idx_nonzero_rdr, 0, s], n, p + ) + log_emission_rdr[i + n_states, idx_nonzero_rdr, s] = ( + log_emission_rdr[i, idx_nonzero_rdr, s] + ) # AF from BetaBinom distribution - idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: - log_emission_baf[i, idx_nonzero_baf, s] = scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], p_binom[i, s] * taus[i, s], (1-p_binom[i, s]) * taus[i, s]) - log_emission_baf[i + n_states, idx_nonzero_baf, s] = scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], (1-p_binom[i, s]) * taus[i, s], p_binom[i, s] * taus[i, s]) + log_emission_baf[i, idx_nonzero_baf, s] = ( + scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + p_binom[i, s] * taus[i, s], + (1 - p_binom[i, s]) * taus[i, s], + ) + ) + log_emission_baf[i + n_states, idx_nonzero_baf, s] = ( + scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + (1 - p_binom[i, s]) * taus[i, s], + p_binom[i, s] * taus[i, s], + ) + ) return log_emission_rdr, log_emission_baf + # @staticmethod - def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs): + def compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + **kwargs, + ): """ Attributes ---------- @@ -115,7 +148,7 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alpha taus : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. 
- + Returns ---------- log_emission : array, shape (2*n_states, n_obs, n_spots) @@ -131,26 +164,55 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alpha for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution - idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: - nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[idx_nonzero_rdr,s]) + nb_mean = base_nb_mean[idx_nonzero_rdr, s] * ( + tumor_prop[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) + + 1 + - tumor_prop[idx_nonzero_rdr, s] + ) nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) - log_emission_rdr[i + n_states, idx_nonzero_rdr, s] = log_emission_rdr[i, idx_nonzero_rdr, s] + log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( + X[idx_nonzero_rdr, 0, s], n, p + ) + log_emission_rdr[i + n_states, idx_nonzero_rdr, s] = ( + log_emission_rdr[i, idx_nonzero_rdr, s] + ) # AF from BetaBinom distribution - idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: - mix_p_A = p_binom[i, s] * tumor_prop[idx_nonzero_baf,s] + 0.5 * (1 - tumor_prop[idx_nonzero_baf,s]) - mix_p_B = (1 - p_binom[i, s]) * tumor_prop[idx_nonzero_baf,s] + 0.5 * (1 - tumor_prop[idx_nonzero_baf,s]) - log_emission_baf[i, idx_nonzero_baf, s] += scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_A * taus[i, s], mix_p_B * taus[i, s]) - log_emission_baf[i + n_states, idx_nonzero_baf, s] += scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_B * taus[i, s], mix_p_A * taus[i, s]) + mix_p_A = p_binom[i, s] * tumor_prop[idx_nonzero_baf, s] + 0.5 * ( + 1 - tumor_prop[idx_nonzero_baf, s] + ) + mix_p_B = (1 - p_binom[i, s]) * tumor_prop[ + idx_nonzero_baf, s + ] + 0.5 * (1 - tumor_prop[idx_nonzero_baf, s]) + log_emission_baf[ + i, idx_nonzero_baf, s + ] += scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + mix_p_A * taus[i, s], + mix_p_B * taus[i, s], + ) + log_emission_baf[ + i + n_states, idx_nonzero_baf, s + ] += scipy.stats.betabinom.logpmf( + X[idx_nonzero_baf, 1, s], + total_bb_RD[idx_nonzero_baf, s], + mix_p_B * taus[i, s], + mix_p_A * taus[i, s], + ) return log_emission_rdr, log_emission_baf + # @staticmethod - @njit - def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' + @njit + def forward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ): + """ Note that n_states is the CNV states, and there are 2 * n_states of paired states for (CNV, phasing) pairs. Input lengths: sum of lengths = n_observations. @@ -160,11 +222,15 @@ def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_site log_sitewise_transmat: n_observations, the log transition probability of phase switch. Output log_alpha: size 2n_states * n_observations. log alpha[j, t] = log P(o_1, ... o_t, q_t = j | lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = int(np.ceil(log_emission.shape[0] / 2)) - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" 
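Aside (not part of the patch): in the *_mix variant above, each spot is modeled as a mixture of tumor and normal signal — the NB mean is scaled by tumor_prop * exp(log_mu) + (1 - tumor_prop), and the BAF success probability is pulled toward 0.5 in proportion to the normal fraction. A small worked example of those two mixtures (all values chosen only for illustration):

import numpy as np

base_mean = 100.0      # baseline expected count for a bin/spot (illustrative)
log_mu = np.log(1.5)   # tumor copy-number effect on expression (illustrative)
p_binom = 0.2          # tumor B-allele frequency for this state (illustrative)
tumor_prop = 0.6       # fraction of tumor reads in the spot

# NB mean: tumor fraction scaled by exp(log_mu), normal fraction stays at baseline
nb_mean = base_mean * (tumor_prop * np.exp(log_mu) + 1 - tumor_prop)
# BAF: tumor fraction at p_binom, normal fraction balanced at 0.5
mix_p_A = p_binom * tumor_prop + 0.5 * (1 - tumor_prop)
mix_p_B = (1 - p_binom) * tumor_prop + 0.5 * (1 - tumor_prop)

print(nb_mean)           # 100 * (0.6*1.5 + 0.4) = 130.0
print(mix_p_A, mix_p_B)  # 0.32, 0.68 (still sum to 1)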
- assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" log_sitewise_self_transmat = np.log(1 - np.exp(log_sitewise_transmat)) # initialize log_alpha log_alpha = np.zeros((log_emission.shape[0], n_obs)) @@ -172,25 +238,49 @@ def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_site cumlen = 0 for le in lengths: # start prob - combined_log_startprob = np.log(0.5) + np.append(log_startprob,log_startprob) - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + combined_log_startprob = np.log(0.5) + np.append( + log_startprob, log_startprob + ) + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. - log_alpha[:, cumlen] = combined_log_startprob + np_sum_ax_squeeze(log_emission[:, cumlen, :], axis=1) + log_alpha[:, cumlen] = combined_log_startprob + np_sum_ax_squeeze( + log_emission[:, cumlen, :], axis=1 + ) for t in np.arange(1, le): - phases_switch_mat = np.array([[log_sitewise_self_transmat[cumlen + t-1], log_sitewise_transmat[cumlen + t-1]], [log_sitewise_transmat[cumlen + t-1], log_sitewise_self_transmat[cumlen + t-1] ]]) - combined_transmat = np.kron( np.exp(phases_switch_mat), np.exp(log_transmat) ) + phases_switch_mat = np.array( + [ + [ + log_sitewise_self_transmat[cumlen + t - 1], + log_sitewise_transmat[cumlen + t - 1], + ], + [ + log_sitewise_transmat[cumlen + t - 1], + log_sitewise_self_transmat[cumlen + t - 1], + ], + ] + ) + combined_transmat = np.kron( + np.exp(phases_switch_mat), np.exp(log_transmat) + ) combined_transmat = np.log(combined_transmat) for j in np.arange(log_emission.shape[0]): for i in np.arange(log_emission.shape[0]): - buf[i] = log_alpha[i, (cumlen + t - 1)] + combined_transmat[i, j] - log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum(log_emission[j, (cumlen + t), :]) + buf[i] = ( + log_alpha[i, (cumlen + t - 1)] + combined_transmat[i, j] + ) + log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum( + log_emission[j, (cumlen + t), :] + ) cumlen += le return log_alpha + # @staticmethod @njit - def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' + def backward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ): + """ Note that n_states is the CNV states, and there are 2 * n_states of paired states for (CNV, phasing) pairs. Input X: size n_observations * n_components * n_spots. @@ -201,11 +291,15 @@ def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sit log_sitewise_transmat: n_observations, the log transition probability of phase switch. Output log_beta: size 2*n_states * n_observations. log beta[i, t] = log P(o_{t+1}, ..., o_T | q_t = i, lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = int(np.ceil(log_emission.shape[0] / 2)) - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" 
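Aside (not part of the patch): forward_lattice (and backward_lattice below) runs over 2*n_states paired (phasing, CNV) states and builds the per-site transition matrix as the Kronecker product of a 2x2 phase-switch matrix with the CNV transition matrix. A minimal sketch of that construction, with made-up probabilities:

import numpy as np

n_states = 3
t = 0.99                                        # CNV self-transition (illustrative)
transmat = np.full((n_states, n_states), (1 - t) / (n_states - 1))
np.fill_diagonal(transmat, t)

p_switch = 0.01                                 # phase-switch probability at this site
phase_mat = np.array([[1 - p_switch, p_switch],
                      [p_switch, 1 - p_switch]])

# 2*n_states x 2*n_states matrix over paired states: index i encodes
# phase = i // n_states and CNV state = i % n_states
combined_transmat = np.kron(phase_mat, transmat)

assert combined_transmat.shape == (2 * n_states, 2 * n_states)
assert np.allclose(combined_transmat.sum(axis=1), 1.0)   # rows stay stochastic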
+ assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" log_sitewise_self_transmat = np.log(1 - np.exp(log_sitewise_transmat)) # initialize log_beta log_beta = np.zeros((log_emission.shape[0], n_obs)) @@ -213,24 +307,60 @@ def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sit cumlen = 0 for le in lengths: # start prob - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. log_beta[:, (cumlen + le - 1)] = 0 - for t in np.arange(le-2, -1, -1): - phases_switch_mat = np.array([[log_sitewise_self_transmat[cumlen + t], log_sitewise_transmat[cumlen + t]], [log_sitewise_transmat[cumlen + t], log_sitewise_self_transmat[cumlen + t] ]]) - combined_transmat = np.kron( np.exp(phases_switch_mat), np.exp(log_transmat) ) + for t in np.arange(le - 2, -1, -1): + phases_switch_mat = np.array( + [ + [ + log_sitewise_self_transmat[cumlen + t], + log_sitewise_transmat[cumlen + t], + ], + [ + log_sitewise_transmat[cumlen + t], + log_sitewise_self_transmat[cumlen + t], + ], + ] + ) + combined_transmat = np.kron( + np.exp(phases_switch_mat), np.exp(log_transmat) + ) combined_transmat = np.log(combined_transmat) for i in np.arange(log_emission.shape[0]): for j in np.arange(log_emission.shape[0]): - buf[j] = log_beta[j, (cumlen + t + 1)] + combined_transmat[i, j] + np.sum(log_emission[j, (cumlen + t + 1), :]) + buf[j] = ( + log_beta[j, (cumlen + t + 1)] + + combined_transmat[i, j] + + np.sum(log_emission[j, (cumlen + t + 1), :]) + ) log_beta[i, (cumlen + t)] = mylogsumexp(buf) cumlen += le return log_beta + # - def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, tumor_prop=None, \ - fix_NB_dispersion=False, shared_NB_dispersion=False, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - is_diag=False, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=100, tol=1e-4): - ''' + def run_baum_welch_nb_bb( + self, + X, + lengths, + n_states, + base_nb_mean, + total_bb_RD, + log_sitewise_transmat, + tumor_prop=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + is_diag=False, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + max_iter=100, + tol=1e-4, + ): + """ Input X: size n_observations * n_components * n_spots. lengths: sum of lengths = n_observations. @@ -239,41 +369,84 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, Intermediate log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. alpha: size of n_states. Dispersioon parameter of each HMM state. 
- ''' + """ n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 # initialize NB logmean shift and BetaBinom prob - log_mu = np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu - p_binom = np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom + log_mu = ( + np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T + if init_log_mu is None + else init_log_mu + ) + p_binom = ( + np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T + if init_p_binom is None + else init_p_binom + ) # initialize (inverse of) dispersion param in NB and BetaBinom - alphas = 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + alphas = ( + 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas + ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus # initialize start probability and emission probability - log_startprob = np.log( np.ones(n_states) / n_states ) + log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: - transmat = np.ones((n_states, n_states)) * (1-self.t) / (n_states-1) + transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: - log_transmat = np.zeros((1,1)) + log_transmat = np.zeros((1, 1)) # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) - unique_values_nb, mapping_matrices_nb = construct_unique_matrix(X[:,0,:], base_nb_mean) - unique_values_bb, mapping_matrices_bb = construct_unique_matrix(X[:,1,:], total_bb_RD) + unique_values_nb, mapping_matrices_nb = construct_unique_matrix( + X[:, 0, :], base_nb_mean + ) + unique_values_bb, mapping_matrices_bb = construct_unique_matrix( + X[:, 1, :], total_bb_RD + ) # EM algorithm for r in trange(max_iter): # E step if tumor_prop is None: - log_emission_rdr, log_emission_baf = hmm_sitewise.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission_rdr, log_emission_baf = ( + hmm_sitewise.compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ) + ) log_emission = log_emission_rdr + log_emission_baf else: - log_emission_rdr, log_emission_baf = hmm_sitewise.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop) + log_emission_rdr, log_emission_baf = ( + hmm_sitewise.compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + tumor_prop, + ) + ) log_emission = log_emission_rdr + log_emission_baf - log_alpha = hmm_sitewise.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) - log_beta = hmm_sitewise.backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_alpha = hmm_sitewise.forward_lattice( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) + log_beta = hmm_sitewise.backward_lattice( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log_emission) + log_xi = compute_posterior_transition_sitewise( + log_alpha, 
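Aside (not part of the patch): construct_unique_matrix is a package helper whose exact return format is not shown in this hunk; the idea behind the speed-up noted above is that the weighted Beta-Binomial likelihood depends only on the distinct (B-allele count, total count) pairs, so duplicate observations can be collapsed and their posterior weights summed. A rough illustration of that idea with np.unique (not the actual helper):

import numpy as np

# toy (B-allele count, total count) observations for one clone/spot
b_counts = np.array([3, 5, 3, 0, 5, 3])
totals   = np.array([10, 12, 10, 0, 12, 10])
weights  = np.array([0.2, 0.1, 0.4, 0.3, 0.6, 0.5])   # e.g. posterior gamma of one state

pairs = np.stack([b_counts, totals], axis=1)
uniq, inverse = np.unique(pairs, axis=0, return_inverse=True)

# sum the weights of identical observations; each likelihood term is then
# evaluated once per unique pair instead of once per bin
collapsed_weights = np.bincount(inverse.ravel(), weights=weights, minlength=len(uniq))

print(uniq)               # 3 distinct (b, total) pairs
print(collapsed_weights)  # [0.3, 1.1, 0.7] in the order of `uniq`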
log_beta, log_transmat, log_emission + ) # M step if "s" in self.params: new_log_startprob = update_startprob_sitewise(lengths, log_gamma) @@ -288,32 +461,79 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, # new_log_mu, new_alphas = update_emission_params_nb_sitewise(X[:,0,:], log_gamma, base_nb_mean, alphas, start_log_mu=log_mu, \ # fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) if tumor_prop is None: - new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, alphas, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + new_log_mu, new_alphas = ( + update_emission_params_nb_sitewise_uniqvalues( + unique_values_nb, + mapping_matrices_nb, + log_gamma, + base_nb_mean, + alphas, + start_log_mu=log_mu, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + ) + ) else: - new_log_mu, new_alphas = update_emission_params_nb_sitewise_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, base_nb_mean, alphas, tumor_prop, start_log_mu=log_mu, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) + new_log_mu, new_alphas = ( + update_emission_params_nb_sitewise_uniqvalues_mix( + unique_values_nb, + mapping_matrices_nb, + log_gamma, + base_nb_mean, + alphas, + tumor_prop, + start_log_mu=log_mu, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + ) + ) else: new_log_mu = log_mu new_alphas = alphas if "p" in self.params: if tumor_prop is None: - new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + new_p_binom, new_taus = ( + update_emission_params_bb_sitewise_uniqvalues( + unique_values_bb, + mapping_matrices_bb, + log_gamma, + total_bb_RD, + taus, + start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + ) + ) else: - new_p_binom, new_taus = update_emission_params_bb_sitewise_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, total_bb_RD, taus, tumor_prop, start_p_binom=p_binom, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) + new_p_binom, new_taus = ( + update_emission_params_bb_sitewise_uniqvalues_mix( + unique_values_bb, + mapping_matrices_bb, + log_gamma, + total_bb_RD, + taus, + tumor_prop, + start_p_binom=p_binom, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + ) + ) else: new_p_binom = p_binom new_taus = taus # check convergence - print( np.mean(np.abs( np.exp(new_log_startprob) - np.exp(log_startprob) )), \ - np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )), \ - np.mean(np.abs(new_log_mu - log_mu)),\ - np.mean(np.abs(new_p_binom - p_binom)) ) - print( np.hstack([new_log_mu, new_p_binom]) ) - if np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )) < tol and \ - np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol: + print( + np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), + np.mean(np.abs(new_log_mu - log_mu)), + np.mean(np.abs(new_p_binom - p_binom)), + ) + print(np.hstack([new_log_mu, new_p_binom])) + if ( + 
np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol + and np.mean(np.abs(new_log_mu - log_mu)) < tol + and np.mean(np.abs(new_p_binom - p_binom)) < tol + ): break log_startprob = new_log_startprob log_transmat = new_log_transmat @@ -321,10 +541,30 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, alphas = new_alphas p_binom = new_p_binom taus = new_taus - return new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma + return ( + new_log_mu, + new_alphas, + new_p_binom, + new_taus, + new_log_startprob, + new_log_transmat, + log_gamma, + ) -def posterior_nb_bb_sitewise(X, lengths, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, log_startprob, log_transmat, log_sitewise_transmat): +def posterior_nb_bb_sitewise( + X, + lengths, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + log_startprob, + log_transmat, + log_sitewise_transmat, +): """ Attributes ---------- @@ -361,15 +601,35 @@ def posterior_nb_bb_sitewise(X, lengths, base_nb_mean, log_mu, alphas, total_bb_ log_sitewise_transmat : array, shape (n_observations) Log of phase switch probability of each gene (or bin). """ - log_emission_rdr, log_emission_baf = hmm_sitewise.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission_rdr, log_emission_baf = ( + hmm_sitewise.compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ) + ) log_emission = log_emission_rdr + log_emission_baf - log_alpha = hmm_sitewise.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) - log_beta = hmm_sitewise.backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_alpha = hmm_sitewise.forward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ) + log_beta = hmm_sitewise.backward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) return log_gamma -def loglikelihood_nb_bb_sitewise(X, lengths, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, log_startprob, log_transmat, log_sitewise_transmat): +def loglikelihood_nb_bb_sitewise( + X, + lengths, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + log_startprob, + log_transmat, + log_sitewise_transmat, +): """ Attributes ---------- @@ -406,85 +666,150 @@ def loglikelihood_nb_bb_sitewise(X, lengths, base_nb_mean, log_mu, alphas, total log_sitewise_transmat : array, shape (n_observations) Log of phase switch probability of each gene (or bin). """ - log_emission_rdr, log_emission_baf = hmm_sitewise.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission_rdr, log_emission_baf = ( + hmm_sitewise.compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ) + ) log_emission = log_emission_rdr + log_emission_baf - log_alpha = hmm_sitewise.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) - return np.sum(scipy.special.logsumexp(log_alpha[:,np.cumsum(lengths)-1], axis=0)), log_alpha - - -def viterbi_nb_bb_sitewise(X, lengths, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, log_startprob, log_transmat, log_sitewise_transmat): - ''' - Input - X: size n_observations * n_components * n_spots. 
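Aside (not part of the patch): loglikelihood_nb_bb_sitewise reads the total log-likelihood directly off the forward lattice — for each chromosome segment in `lengths`, log P(observations) is the logsumexp over states of log_alpha at that segment's last bin, and the segments add because they are modeled independently. A tiny sketch of that indexing with a stand-in forward lattice:

import numpy as np
from scipy.special import logsumexp

lengths = np.array([4, 3, 5])                 # bins per chromosome (toy)
n_paired_states, n_obs = 6, lengths.sum()
rng = np.random.default_rng(0)
log_alpha = rng.normal(size=(n_paired_states, n_obs))   # stand-in forward lattice

last_bins = np.cumsum(lengths) - 1            # [3, 6, 11]: last bin of each chromosome
llf = np.sum(logsumexp(log_alpha[:, last_bins], axis=0))

# equivalent per-chromosome form
llf_chrom = [logsumexp(log_alpha[:, end]) for end in last_bins]
assert np.isclose(llf, np.sum(llf_chrom))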
- lengths: sum of lengths = n_observations. - exposures: size of n_observations * n_spots. - base_prob: size of n_observations. The expression probability derived from normal spots. - log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. - alpha: size of n_states. Dispersioon parameter of each HMM state. - log_transmat: n_states * n_states. Transition probability after log transformation. - log_startprob: n_states. Start probability after log transformation. - Output -# log_prob: a scalar. - labels: size of n_observations. - Intermediate - log_emission: n_states * n_observations * n_spots. Log probability. - log_v: n_states * n_observations per chromosome. Log of viterbi DP table. v[i,t] = max_{q_1, ..., q_{t-1}} P(o_1, q_1, ..., o_{t-1}, q_{t-1}, o_t, q_t=i | lambda). - ''' + log_alpha = hmm_sitewise.forward_lattice( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat + ) + return ( + np.sum(scipy.special.logsumexp(log_alpha[:, np.cumsum(lengths) - 1], axis=0)), + log_alpha, + ) + + +def viterbi_nb_bb_sitewise( + X, + lengths, + base_nb_mean, + log_mu, + alphas, + total_bb_RD, + p_binom, + taus, + log_startprob, + log_transmat, + log_sitewise_transmat, +): + """ + Input + X: size n_observations * n_components * n_spots. + lengths: sum of lengths = n_observations. + exposures: size of n_observations * n_spots. + base_prob: size of n_observations. The expression probability derived from normal spots. + log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. + alpha: size of n_states. Dispersioon parameter of each HMM state. + log_transmat: n_states * n_states. Transition probability after log transformation. + log_startprob: n_states. Start probability after log transformation. + Output + # log_prob: a scalar. + labels: size of n_observations. + Intermediate + log_emission: n_states * n_observations * n_spots. Log probability. + log_v: n_states * n_observations per chromosome. Log of viterbi DP table. v[i,t] = max_{q_1, ..., q_{t-1}} P(o_1, q_1, ..., o_{t-1}, q_{t-1}, o_t, q_t=i | lambda). 
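Aside (not part of the patch): the Viterbi decoder below works on the same 2*n_states paired states; `labels` keeps the full (phase, CNV) index while `merged_labels` collapses the two phases onto the underlying CNV state. A one-line sketch of that decoding, assuming the indexing convention used throughout this file:

import numpy as np

n_states = 5
labels = np.array([0, 7, 4, 9, 2])      # toy paired-state labels in [0, 2*n_states)

phase = labels // n_states              # 0 = original phasing, 1 = switched
merged_labels = labels % n_states       # underlying CNV state, phases collapsed

print(phase)           # [0 1 0 1 0]
print(merged_labels)   # [0 2 4 4 2]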
+ """ n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] n_states = log_transmat.shape[0] log_sitewise_self_transmat = np.log(1 - np.exp(log_sitewise_transmat)) - log_emission_rdr, log_emission_baf = hmm_sitewise.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) + log_emission_rdr, log_emission_baf = ( + hmm_sitewise.compute_emission_probability_nb_betabinom( + X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus + ) + ) log_emission = log_emission_rdr + log_emission_baf # initialize viterbi DP table and backtracking table labels = np.array([]) merged_labels = np.array([]) cumlen = 0 for le in lengths: - log_v = np.zeros((2*n_states, le)) - bt = np.zeros((2*n_states, le)) + log_v = np.zeros((2 * n_states, le)) + bt = np.zeros((2 * n_states, le)) for t in np.arange(le): if cumlen == 0 and t == 0: - log_v[:, 0] = np.mean(log_emission[:,0,:], axis=1) + np.append(log_startprob,log_startprob) + np.log(0.5) + log_v[:, 0] = ( + np.mean(log_emission[:, 0, :], axis=1) + + np.append(log_startprob, log_startprob) + + np.log(0.5) + ) continue - for i in np.arange(2*n_states): + for i in np.arange(2 * n_states): if t > 0: - tmp = log_v[:, (t-1)] + np.append(log_transmat[:,i - n_states * int(i/n_states)], log_transmat[:,i - n_states * int(i/n_states)]) + np.sum(log_emission[i, (cumlen+t), :]) + tmp = ( + log_v[:, (t - 1)] + + np.append( + log_transmat[:, i - n_states * int(i / n_states)], + log_transmat[:, i - n_states * int(i / n_states)], + ) + + np.sum(log_emission[i, (cumlen + t), :]) + ) else: - tmp = np.append(log_startprob[i - n_states * int(i/n_states)], log_startprob[i - n_states * int(i/n_states)]) + np.sum(log_emission[i, (cumlen+t), :]) + tmp = np.append( + log_startprob[i - n_states * int(i / n_states)], + log_startprob[i - n_states * int(i / n_states)], + ) + np.sum(log_emission[i, (cumlen + t), :]) bt[i, t] = np.argmax(tmp) log_v[i, t] = np.max(tmp) # backtracking to get the sequence - chr_labels = [ np.argmax(log_v[:,-1]) ] - + chr_labels = [np.argmax(log_v[:, -1])] + if cumlen == 0: - for t2 in np.arange(le-1, 0, -1): - chr_labels.append( int(bt[chr_labels[-1],t2])) + for t2 in np.arange(le - 1, 0, -1): + chr_labels.append(int(bt[chr_labels[-1], t2])) else: - for t2 in np.arange(le-2, -1, -1): - chr_labels.append( int(bt[chr_labels[-1],t2])) + for t2 in np.arange(le - 2, -1, -1): + chr_labels.append(int(bt[chr_labels[-1], t2])) chr_labels = np.array(chr_labels[::-1]).astype(int) # merge two phases chr_merged_labels = copy.copy(chr_labels) - chr_merged_labels[chr_merged_labels >= n_states] = chr_merged_labels[chr_merged_labels >= n_states] - n_states - + chr_merged_labels[chr_merged_labels >= n_states] = ( + chr_merged_labels[chr_merged_labels >= n_states] - n_states + ) + if cumlen == 0: labels = chr_labels merged_labels = chr_merged_labels else: labels = np.append(labels, chr_labels) merged_labels = np.append(merged_labels, chr_merged_labels) - + cumlen += le return labels, merged_labels -def pipeline_baum_welch(output_prefix, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, tumor_prop=None, \ - hmmclass=hmm_sitewise, params="smp", t=1-1e-6, random_state=0, \ - in_log_space=True, only_minor=False, fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True, \ - init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, is_diag=True, max_iter=100, tol=1e-4, **kwargs): +def pipeline_baum_welch( + output_prefix, + X, + lengths, + n_states, + 
base_nb_mean, + total_bb_RD, + log_sitewise_transmat, + tumor_prop=None, + hmmclass=hmm_sitewise, + params="smp", + t=1 - 1e-6, + random_state=0, + in_log_space=True, + only_minor=False, + fix_NB_dispersion=False, + shared_NB_dispersion=True, + fix_BB_dispersion=False, + shared_BB_dispersion=True, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + is_diag=True, + max_iter=100, + tol=1e-4, + **kwargs, +): """ tumor_prop : array, (n_obs, n_spots) Probability of sequencing a tumor read. (tumor cell proportion weighted by ploidy) @@ -492,15 +817,26 @@ def pipeline_baum_welch(output_prefix, X, lengths, n_states, base_nb_mean, total """ # initialization n_spots = X.shape[2] - if ((init_log_mu is None) and ("m" in params)) or ((init_p_binom is None) and ("p" in params)): - tmp_log_mu, tmp_p_binom = initialization_by_gmm(n_states, X, base_nb_mean, total_bb_RD, params, random_state=random_state, in_log_space=in_log_space, only_minor=only_minor) + if ((init_log_mu is None) and ("m" in params)) or ( + (init_p_binom is None) and ("p" in params) + ): + tmp_log_mu, tmp_p_binom = initialization_by_gmm( + n_states, + X, + base_nb_mean, + total_bb_RD, + params, + random_state=random_state, + in_log_space=in_log_space, + only_minor=only_minor, + ) if (init_log_mu is None) and ("m" in params): init_log_mu = tmp_log_mu if (init_p_binom is None) and ("p" in params): init_p_binom = tmp_p_binom print(f"init_log_mu = {init_log_mu}") print(f"init_p_binom = {init_p_binom}") - + # fit HMM-NB-BetaBinom # new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat = hmmmodel.run_baum_welch_nb_bb(X, lengths, \ # n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, tumor_prop, \ @@ -509,34 +845,122 @@ def pipeline_baum_welch(output_prefix, X, lengths, n_states, base_nb_mean, total # is_diag=is_diag, init_log_mu=init_log_mu, init_p_binom=init_p_binom, init_alphas=init_alphas, init_taus=init_taus, \ # max_iter=max_iter, tol=tol) hmmmodel = hmmclass(params=params, t=t) - remain_kwargs = {k:v for k,v in kwargs.items() if k in ["lambd", "sample_length", "log_gamma"]} - new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma = hmmmodel.run_baum_welch_nb_bb(X, lengths, \ - n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, tumor_prop, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ - is_diag=is_diag, init_log_mu=init_log_mu, init_p_binom=init_p_binom, init_alphas=init_alphas, init_taus=init_taus, \ - max_iter=max_iter, tol=tol, **remain_kwargs) + remain_kwargs = { + k: v for k, v in kwargs.items() if k in ["lambd", "sample_length", "log_gamma"] + } + ( + new_log_mu, + new_alphas, + new_p_binom, + new_taus, + new_log_startprob, + new_log_transmat, + log_gamma, + ) = hmmmodel.run_baum_welch_nb_bb( + X, + lengths, + n_states, + base_nb_mean, + total_bb_RD, + log_sitewise_transmat, + tumor_prop, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=is_diag, + init_log_mu=init_log_mu, + init_p_binom=init_p_binom, + init_alphas=init_alphas, + init_taus=init_taus, + max_iter=max_iter, + tol=tol, + **remain_kwargs, + ) # likelihood if tumor_prop is None: - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom(X, base_nb_mean, new_log_mu, new_alphas, 
total_bb_RD, new_p_binom, new_taus) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + X, + base_nb_mean, + new_log_mu, + new_alphas, + total_bb_RD, + new_p_binom, + new_taus, + ) + ) log_emission = log_emission_rdr + log_emission_baf else: if ("m" in params) and ("sample_length" in kwargs): logmu_shift = [] for c in range(len(kwargs["sample_length"])): - this_pred_cnv = np.argmax(log_gamma[:,np.sum(kwargs["sample_length"][:c]):np.sum(kwargs["sample_length"][:(c+1)])], axis=0)%n_states - logmu_shift.append( scipy.special.logsumexp(new_log_mu[this_pred_cnv,:] + np.log(kwargs["lambd"]).reshape(-1,1), axis=0) ) + this_pred_cnv = ( + np.argmax( + log_gamma[ + :, + np.sum(kwargs["sample_length"][:c]) : np.sum( + kwargs["sample_length"][: (c + 1)] + ), + ], + axis=0, + ) + % n_states + ) + logmu_shift.append( + scipy.special.logsumexp( + new_log_mu[this_pred_cnv, :] + + np.log(kwargs["lambd"]).reshape(-1, 1), + axis=0, + ) + ) logmu_shift = np.vstack(logmu_shift) - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, new_log_mu, new_alphas, total_bb_RD, new_p_binom, new_taus, tumor_prop, logmu_shift=logmu_shift, sample_length=kwargs["sample_length"]) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + new_log_mu, + new_alphas, + total_bb_RD, + new_p_binom, + new_taus, + tumor_prop, + logmu_shift=logmu_shift, + sample_length=kwargs["sample_length"], + ) + ) else: - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, new_log_mu, new_alphas, total_bb_RD, new_p_binom, new_taus, tumor_prop) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + X, + base_nb_mean, + new_log_mu, + new_alphas, + total_bb_RD, + new_p_binom, + new_taus, + tumor_prop, + ) + ) # log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, new_log_mu, new_alphas, total_bb_RD, new_p_binom, new_taus, tumor_prop) log_emission = log_emission_rdr + log_emission_baf - log_alpha = hmmclass.forward_lattice(lengths, new_log_transmat, new_log_startprob, log_emission, log_sitewise_transmat) - llf = np.sum(scipy.special.logsumexp(log_alpha[:,np.cumsum(lengths)-1], axis=0)) + log_alpha = hmmclass.forward_lattice( + lengths, + new_log_transmat, + new_log_startprob, + log_emission, + log_sitewise_transmat, + ) + llf = np.sum(scipy.special.logsumexp(log_alpha[:, np.cumsum(lengths) - 1], axis=0)) - log_beta = hmmclass.backward_lattice(lengths, new_log_transmat, new_log_startprob, log_emission, log_sitewise_transmat) + log_beta = hmmclass.backward_lattice( + lengths, + new_log_transmat, + new_log_startprob, + log_emission, + log_sitewise_transmat, + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) pred = np.argmax(log_gamma, axis=0) pred_cnv = pred % n_states @@ -544,22 +968,48 @@ def pipeline_baum_welch(output_prefix, X, lengths, n_states, base_nb_mean, total # save results if not output_prefix is None: tmp = np.log10(1 - t) - np.savez(f"{output_prefix}_nstates{n_states}_{params}_{tmp:.0f}_seed{random_state}.npz", \ - new_log_mu=new_log_mu, new_alphas=new_alphas, new_p_binom=new_p_binom, new_taus=new_taus, \ - new_log_startprob=new_log_startprob, new_log_transmat=new_log_transmat, log_gamma=log_gamma, pred_cnv=pred_cnv, llf=llf) + np.savez( + 
f"{output_prefix}_nstates{n_states}_{params}_{tmp:.0f}_seed{random_state}.npz", + new_log_mu=new_log_mu, + new_alphas=new_alphas, + new_p_binom=new_p_binom, + new_taus=new_taus, + new_log_startprob=new_log_startprob, + new_log_transmat=new_log_transmat, + log_gamma=log_gamma, + pred_cnv=pred_cnv, + llf=llf, + ) else: - res = {"new_log_mu":new_log_mu, "new_alphas":new_alphas, "new_p_binom":new_p_binom, "new_taus":new_taus, \ - "new_log_startprob":new_log_startprob, "new_log_transmat":new_log_transmat, "log_gamma":log_gamma, "pred_cnv":pred_cnv, "llf":llf} + res = { + "new_log_mu": new_log_mu, + "new_alphas": new_alphas, + "new_p_binom": new_p_binom, + "new_taus": new_taus, + "new_log_startprob": new_log_startprob, + "new_log_transmat": new_log_transmat, + "log_gamma": log_gamma, + "pred_cnv": pred_cnv, + "llf": llf, + } return res -def eval_neymanpearson_bafonly(log_emission_baf_c1, pred_c1, log_emission_baf_c2, pred_c2, bidx, n_states, res, p): - assert log_emission_baf_c1.shape[0] == n_states or log_emission_baf_c1.shape[0] == 2 * n_states +def eval_neymanpearson_bafonly( + log_emission_baf_c1, pred_c1, log_emission_baf_c2, pred_c2, bidx, n_states, res, p +): + assert ( + log_emission_baf_c1.shape[0] == n_states + or log_emission_baf_c1.shape[0] == 2 * n_states + ) # likelihood under the corresponding state - llf_original = np.append(log_emission_baf_c1[pred_c1[bidx], bidx], log_emission_baf_c2[pred_c2[bidx], bidx]).reshape(-1,1) + llf_original = np.append( + log_emission_baf_c1[pred_c1[bidx], bidx], + log_emission_baf_c2[pred_c2[bidx], bidx], + ).reshape(-1, 1) # likelihood under the switched state if log_emission_baf_c1.shape[0] == 2 * n_states: - if (res["new_p_binom"][p[0],0] > 0.5) == (res["new_p_binom"][p[1],0] > 0.5): + if (res["new_p_binom"][p[0], 0] > 0.5) == (res["new_p_binom"][p[1], 0] > 0.5): switch_pred_c1 = n_states * (pred_c1 >= n_states) + (pred_c2 % n_states) switch_pred_c2 = n_states * (pred_c2 >= n_states) + (pred_c1 % n_states) else: @@ -568,19 +1018,40 @@ def eval_neymanpearson_bafonly(log_emission_baf_c1, pred_c1, log_emission_baf_c2 else: switch_pred_c1 = pred_c2 switch_pred_c2 = pred_c1 - llf_switch = np.append(log_emission_baf_c1[switch_pred_c1[bidx], bidx], log_emission_baf_c2[switch_pred_c2[bidx], bidx]).reshape(-1,1) + llf_switch = np.append( + log_emission_baf_c1[switch_pred_c1[bidx], bidx], + log_emission_baf_c2[switch_pred_c2[bidx], bidx], + ).reshape(-1, 1) # log likelihood difference return np.mean(llf_original) - np.mean(llf_switch) -def eval_neymanpearson_rdrbaf(log_emission_rdr_c1, log_emission_baf_c1, pred_c1, log_emission_rdr_c2, log_emission_baf_c2, pred_c2, bidx, n_states, res, p): - assert log_emission_baf_c1.shape[0] == n_states or log_emission_baf_c1.shape[0] == 2 * n_states +def eval_neymanpearson_rdrbaf( + log_emission_rdr_c1, + log_emission_baf_c1, + pred_c1, + log_emission_rdr_c2, + log_emission_baf_c2, + pred_c2, + bidx, + n_states, + res, + p, +): + assert ( + log_emission_baf_c1.shape[0] == n_states + or log_emission_baf_c1.shape[0] == 2 * n_states + ) # likelihood under the corresponding state - llf_original = np.append(log_emission_rdr_c1[pred_c1[bidx], bidx] + log_emission_baf_c1[pred_c1[bidx], bidx], \ - log_emission_rdr_c2[pred_c2[bidx], bidx] + log_emission_baf_c2[pred_c2[bidx], bidx]).reshape(-1,1) + llf_original = np.append( + log_emission_rdr_c1[pred_c1[bidx], bidx] + + log_emission_baf_c1[pred_c1[bidx], bidx], + log_emission_rdr_c2[pred_c2[bidx], bidx] + + log_emission_baf_c2[pred_c2[bidx], bidx], + ).reshape(-1, 1) # likelihood 
under the switched state if log_emission_baf_c1.shape[0] == 2 * n_states: - if (res["new_p_binom"][p[0],0] > 0.5) == (res["new_p_binom"][p[1],0] > 0.5): + if (res["new_p_binom"][p[0], 0] > 0.5) == (res["new_p_binom"][p[1], 0] > 0.5): switch_pred_c1 = n_states * (pred_c1 >= n_states) + (pred_c2 % n_states) switch_pred_c2 = n_states * (pred_c2 >= n_states) + (pred_c1 % n_states) else: @@ -589,162 +1060,378 @@ def eval_neymanpearson_rdrbaf(log_emission_rdr_c1, log_emission_baf_c1, pred_c1, else: switch_pred_c1 = pred_c2 switch_pred_c2 = pred_c1 - llf_switch = np.append(log_emission_rdr_c1[switch_pred_c1[bidx], bidx] + log_emission_baf_c1[switch_pred_c1[bidx], bidx], \ - log_emission_rdr_c2[switch_pred_c2[bidx], bidx] + log_emission_baf_c2[switch_pred_c2[bidx], bidx]).reshape(-1,1) + llf_switch = np.append( + log_emission_rdr_c1[switch_pred_c1[bidx], bidx] + + log_emission_baf_c1[switch_pred_c1[bidx], bidx], + log_emission_rdr_c2[switch_pred_c2[bidx], bidx] + + log_emission_baf_c2[switch_pred_c2[bidx], bidx], + ).reshape(-1, 1) # log likelihood difference return np.mean(llf_original) - np.mean(llf_switch) -def compute_neymanpearson_stats(X, base_nb_mean, total_bb_RD, res, params, tumor_prop, hmmclass): +def compute_neymanpearson_stats( + X, base_nb_mean, total_bb_RD, res, params, tumor_prop, hmmclass +): n_obs = X.shape[0] n_states = res["new_p_binom"].shape[0] n_clones = X.shape[2] lambd = np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean) # if tumor_prop is None: - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ - total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"]) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + base_nb_mean.flatten("F").reshape(-1, 1), + res["new_log_mu"], + res["new_alphas"], + total_bb_RD.flatten("F").reshape(-1, 1), + res["new_p_binom"], + res["new_taus"], + ) + ) else: if "m" in params: logmu_shift = [] for c in range(n_clones): - this_pred_cnv = np.argmax(res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0)%n_states - logmu_shift.append( scipy.special.logsumexp(res["new_log_mu"][this_pred_cnv,:] + np.log(lambd).reshape(-1,1), axis=0) ) + this_pred_cnv = ( + np.argmax( + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], axis=0 + ) + % n_states + ) + logmu_shift.append( + scipy.special.logsumexp( + res["new_log_mu"][this_pred_cnv, :] + + np.log(lambd).reshape(-1, 1), + axis=0, + ) + ) logmu_shift = np.vstack(logmu_shift) - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ - total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"], tumor_prop, logmu_shift=logmu_shift, sample_length=np.ones(n_clones,dtype=int)*n_obs) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.vstack( + [X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")] + ).T.reshape(-1, 2, 1), + base_nb_mean.flatten("F").reshape(-1, 1), + res["new_log_mu"], + res["new_alphas"], + total_bb_RD.flatten("F").reshape(-1, 1), + res["new_p_binom"], + res["new_taus"], + tumor_prop, 
+ logmu_shift=logmu_shift, + sample_length=np.ones(n_clones, dtype=int) * n_obs, + ) + ) else: - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ - total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"], tumor_prop) - log_emission_rdr = log_emission_rdr.reshape((log_emission_rdr.shape[0], n_obs, n_clones), order="F") - log_emission_baf = log_emission_baf.reshape((log_emission_baf.shape[0], n_obs, n_clones), order="F") - reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2],-1)) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.vstack( + [X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")] + ).T.reshape(-1, 2, 1), + base_nb_mean.flatten("F").reshape(-1, 1), + res["new_log_mu"], + res["new_alphas"], + total_bb_RD.flatten("F").reshape(-1, 1), + res["new_p_binom"], + res["new_taus"], + tumor_prop, + ) + ) + log_emission_rdr = log_emission_rdr.reshape( + (log_emission_rdr.shape[0], n_obs, n_clones), order="F" + ) + log_emission_baf = log_emission_baf.reshape( + (log_emission_baf.shape[0], n_obs, n_clones), order="F" + ) + reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states - all_test_statistics = {(c1, c2):[] for c1 in range(n_clones) for c2 in range(c1+1, n_clones)} + all_test_statistics = { + (c1, c2): [] for c1 in range(n_clones) for c2 in range(c1 + 1, n_clones) + } for c1 in range(n_clones): - for c2 in range(c1+1, n_clones): + for c2 in range(c1 + 1, n_clones): # unmergeable_bincount = 0 - unique_pair_states = [x for x in np.unique(reshaped_pred_cnv[np.array([c1,c2]), :], axis=1).T if x[0] != x[1]] + unique_pair_states = [ + x + for x in np.unique(reshaped_pred_cnv[np.array([c1, c2]), :], axis=1).T + if x[0] != x[1] + ] list_t_neymanpearson = [] for p in unique_pair_states: - bidx = np.where( (reshaped_pred_cnv[c1,:]==p[0]) & (reshaped_pred_cnv[c2,:]==p[1]) )[0] + bidx = np.where( + (reshaped_pred_cnv[c1, :] == p[0]) + & (reshaped_pred_cnv[c2, :] == p[1]) + )[0] if "m" in params and "p" in params: - t_neymanpearson = eval_neymanpearson_rdrbaf(log_emission_rdr[:,:,c1], log_emission_baf[:,:,c1], reshaped_pred[c1,:], log_emission_rdr[:,:,c2], log_emission_baf[:,:,c2], reshaped_pred[c2,:], bidx, n_states, res, p) + t_neymanpearson = eval_neymanpearson_rdrbaf( + log_emission_rdr[:, :, c1], + log_emission_baf[:, :, c1], + reshaped_pred[c1, :], + log_emission_rdr[:, :, c2], + log_emission_baf[:, :, c2], + reshaped_pred[c2, :], + bidx, + n_states, + res, + p, + ) elif "p" in params: - t_neymanpearson = eval_neymanpearson_bafonly(log_emission_baf[:,:,c1], reshaped_pred[c1,:], log_emission_baf[:,:,c2], reshaped_pred[c2,:], bidx, n_states, res, p) - all_test_statistics[(c1, c2)].append( (p[0], p[1], t_neymanpearson) ) - + t_neymanpearson = eval_neymanpearson_bafonly( + log_emission_baf[:, :, c1], + reshaped_pred[c1, :], + log_emission_baf[:, :, c2], + reshaped_pred[c2, :], + bidx, + n_states, + res, + p, + ) + all_test_statistics[(c1, c2)].append((p[0], p[1], t_neymanpearson)) + return all_test_statistics -def similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, threshold=2.0, minlength=10, topk=10, params="smp", tumor_prop=None, hmmclass=hmm_sitewise, **kwargs): +def similarity_components_rdrbaf_neymanpearson( + 
X, + base_nb_mean, + total_bb_RD, + res, + threshold=2.0, + minlength=10, + topk=10, + params="smp", + tumor_prop=None, + hmmclass=hmm_sitewise, + **kwargs, +): n_obs = X.shape[0] n_states = res["new_p_binom"].shape[0] n_clones = X.shape[2] G = nx.Graph() - G.add_nodes_from( np.arange(n_clones) ) + G.add_nodes_from(np.arange(n_clones)) # lambd = np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean) # if tumor_prop is None: - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ - total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"]) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + base_nb_mean.flatten("F").reshape(-1, 1), + res["new_log_mu"], + res["new_alphas"], + total_bb_RD.flatten("F").reshape(-1, 1), + res["new_p_binom"], + res["new_taus"], + ) + ) else: if "m" in params: logmu_shift = [] for c in range(n_clones): - this_pred_cnv = np.argmax(res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0)%n_states - logmu_shift.append( scipy.special.logsumexp(res["new_log_mu"][this_pred_cnv,:] + np.log(lambd).reshape(-1,1), axis=0) ) + this_pred_cnv = ( + np.argmax( + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], axis=0 + ) + % n_states + ) + logmu_shift.append( + scipy.special.logsumexp( + res["new_log_mu"][this_pred_cnv, :] + + np.log(lambd).reshape(-1, 1), + axis=0, + ) + ) logmu_shift = np.vstack(logmu_shift) - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ - total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"], tumor_prop, logmu_shift=logmu_shift, sample_length=np.ones(n_clones,dtype=int)*n_obs) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.vstack( + [X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")] + ).T.reshape(-1, 2, 1), + base_nb_mean.flatten("F").reshape(-1, 1), + res["new_log_mu"], + res["new_alphas"], + total_bb_RD.flatten("F").reshape(-1, 1), + res["new_p_binom"], + res["new_taus"], + tumor_prop, + logmu_shift=logmu_shift, + sample_length=np.ones(n_clones, dtype=int) * n_obs, + ) + ) else: - log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ - total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"], tumor_prop) - log_emission_rdr = log_emission_rdr.reshape((log_emission_rdr.shape[0], n_obs, n_clones), order="F") - log_emission_baf = log_emission_baf.reshape((log_emission_baf.shape[0], n_obs, n_clones), order="F") - reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2],-1)) + log_emission_rdr, log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.vstack( + [X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")] + ).T.reshape(-1, 2, 1), + base_nb_mean.flatten("F").reshape(-1, 1), + res["new_log_mu"], + res["new_alphas"], + total_bb_RD.flatten("F").reshape(-1, 1), + 
res["new_p_binom"], + res["new_taus"], + tumor_prop, + ) + ) + log_emission_rdr = log_emission_rdr.reshape( + (log_emission_rdr.shape[0], n_obs, n_clones), order="F" + ) + log_emission_baf = log_emission_baf.reshape( + (log_emission_baf.shape[0], n_obs, n_clones), order="F" + ) + reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states all_test_statistics = [] for c1 in range(n_clones): - for c2 in range(c1+1, n_clones): + for c2 in range(c1 + 1, n_clones): # unmergeable_bincount = 0 - unique_pair_states = [x for x in np.unique(reshaped_pred_cnv[np.array([c1,c2]), :], axis=1).T if x[0] != x[1]] + unique_pair_states = [ + x + for x in np.unique(reshaped_pred_cnv[np.array([c1, c2]), :], axis=1).T + if x[0] != x[1] + ] list_t_neymanpearson = [] for p in unique_pair_states: - bidx = np.where( (reshaped_pred_cnv[c1,:]==p[0]) & (reshaped_pred_cnv[c2,:]==p[1]) )[0] + bidx = np.where( + (reshaped_pred_cnv[c1, :] == p[0]) + & (reshaped_pred_cnv[c2, :] == p[1]) + )[0] if "m" in params and "p" in params: - t_neymanpearson = eval_neymanpearson_rdrbaf(log_emission_rdr[:,:,c1], log_emission_baf[:,:,c1], reshaped_pred[c1,:], log_emission_rdr[:,:,c2], log_emission_baf[:,:,c2], reshaped_pred[c2,:], bidx, n_states, res, p) + t_neymanpearson = eval_neymanpearson_rdrbaf( + log_emission_rdr[:, :, c1], + log_emission_baf[:, :, c1], + reshaped_pred[c1, :], + log_emission_rdr[:, :, c2], + log_emission_baf[:, :, c2], + reshaped_pred[c2, :], + bidx, + n_states, + res, + p, + ) elif "p" in params: - t_neymanpearson = eval_neymanpearson_bafonly(log_emission_baf[:,:,c1], reshaped_pred[c1,:], log_emission_baf[:,:,c2], reshaped_pred[c2,:], bidx, n_states, res, p) + t_neymanpearson = eval_neymanpearson_bafonly( + log_emission_baf[:, :, c1], + reshaped_pred[c1, :], + log_emission_baf[:, :, c2], + reshaped_pred[c2, :], + bidx, + n_states, + res, + p, + ) print(c1, c2, p, len(bidx), t_neymanpearson) - all_test_statistics.append( [c1, c2, p, t_neymanpearson] ) + all_test_statistics.append([c1, c2, p, t_neymanpearson]) if len(bidx) >= minlength: list_t_neymanpearson.append(t_neymanpearson) - if len(list_t_neymanpearson) == 0 or np.max(list_t_neymanpearson) < threshold: - max_v = np.max(list_t_neymanpearson) if len(list_t_neymanpearson) > 0 else 1e-3 - G.add_weighted_edges_from([ (c1, c2, max_v) ]) + if ( + len(list_t_neymanpearson) == 0 + or np.max(list_t_neymanpearson) < threshold + ): + max_v = ( + np.max(list_t_neymanpearson) + if len(list_t_neymanpearson) > 0 + else 1e-3 + ) + G.add_weighted_edges_from([(c1, c2, max_v)]) # maximal cliques cliques = [] for x in nx.find_cliques(G): this_len = len(x) - this_weights = np.sum([G.get_edge_data(a,b)["weight"] for a in x for b in x if a != b]) / 2 - cliques.append( (x, this_len, this_weights) ) - cliques.sort(key = lambda x:(-x[1],x[2]) ) + this_weights = ( + np.sum([G.get_edge_data(a, b)["weight"] for a in x for b in x if a != b]) + / 2 + ) + cliques.append((x, this_len, this_weights)) + cliques.sort(key=lambda x: (-x[1], x[2])) covered_nodes = set() merging_groups = [] for c in cliques: if len(set(c[0]) & covered_nodes) == 0: - merging_groups.append( list(c[0]) ) + merging_groups.append(list(c[0])) covered_nodes = covered_nodes | set(c[0]) for c in range(n_clones): if not (c in covered_nodes): - merging_groups.append( [c] ) + merging_groups.append([c]) covered_nodes.add(c) - merging_groups.sort(key = lambda x:np.min(x)) + merging_groups.sort(key=lambda x: np.min(x)) # clone assignment after merging map_clone_id = 
{} - for i,x in enumerate(merging_groups): + for i, x in enumerate(merging_groups): for z in x: map_clone_id[z] = i new_assignment = np.array([map_clone_id[x] for x in res["new_assignment"]]) merged_res = copy.copy(res) merged_res["new_assignment"] = new_assignment merged_res["total_llf"] = np.NAN - merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) - merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) + merged_res["pred_cnv"] = np.concatenate( + [ + res["pred_cnv"][(c[0] * n_obs) : (c[0] * n_obs + n_obs)] + for c in merging_groups + ] + ) + merged_res["log_gamma"] = np.hstack( + [ + res["log_gamma"][:, (c[0] * n_obs) : (c[0] * n_obs + n_obs)] + for c in merging_groups + ] + ) return merging_groups, merged_res -def combine_similar_states_across_clones(X, base_nb_mean, total_bb_RD, res, params="smp", tumor_prop=None, hmmclass=hmm_sitewise, merge_threshold=0.1, **kwargs): +def combine_similar_states_across_clones( + X, + base_nb_mean, + total_bb_RD, + res, + params="smp", + tumor_prop=None, + hmmclass=hmm_sitewise, + merge_threshold=0.1, + **kwargs, +): n_clones = X.shape[2] n_obs = X.shape[0] n_states = res["new_p_binom"].shape[0] - reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2],-1)) + reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states # - all_test_statistics = compute_neymanpearson_stats(X, base_nb_mean, total_bb_RD, res, params, tumor_prop, hmmclass) + all_test_statistics = compute_neymanpearson_stats( + X, base_nb_mean, total_bb_RD, res, params, tumor_prop, hmmclass + ) # make the pair of states consistent between clone c1 and clone c2 if their t_neymanpearson test statistics is small for c1 in range(n_clones): - for c2 in range(c1+1, n_clones): + for c2 in range(c1 + 1, n_clones): list_t_neymanpearson = all_test_statistics[(c1, c2)] for p1, p2, t_neymanpearson in list_t_neymanpearson: if t_neymanpearson < merge_threshold: - c_keep = c1 if np.sum(total_bb_RD[:,c1]) > np.sum(total_bb_RD[:,c2]) else c2 + c_keep = ( + c1 + if np.sum(total_bb_RD[:, c1]) > np.sum(total_bb_RD[:, c2]) + else c2 + ) c_change = c2 if c_keep == c1 else c1 - bidx = np.where( (reshaped_pred_cnv[c1,:]==p1) & (reshaped_pred_cnv[c2,:]==p2) )[0] - res['pred_cnv'][(c_change*n_obs):(c_change*n_obs+n_obs)][bidx] = res['pred_cnv'][(c_keep*n_obs):(c_keep*n_obs+n_obs)][bidx] - print(f"Merging states {[p1,p2]} in clone {c1} and clone {c2}. NP statistics = {t_neymanpearson}") + bidx = np.where( + (reshaped_pred_cnv[c1, :] == p1) + & (reshaped_pred_cnv[c2, :] == p2) + )[0] + res["pred_cnv"][(c_change * n_obs) : (c_change * n_obs + n_obs)][ + bidx + ] = res["pred_cnv"][(c_keep * n_obs) : (c_keep * n_obs + n_obs)][ + bidx + ] + print( + f"Merging states {[p1,p2]} in clone {c1} and clone {c2}. 
NP statistics = {t_neymanpearson}" + ) return res - # def similarity_components_rdrbaf_neymanpearson_posterior(X, base_nb_mean, total_bb_RD, res, threshold=2.0, minlength=10, topk=10, params="smp", tumor_prop=None, hmmclass=hmm_sitewise): # n_obs = X.shape[0] # n_states = res["new_p_binom"].shape[0] @@ -755,7 +1442,7 @@ def combine_similar_states_across_clones(X, base_nb_mean, total_bb_RD, res, para # def eval_neymanpearson_bafonly(log_emission_baf_c1, log_gamma_c1, log_emission_baf_c2, log_gamma_c2, bidx, n_states, res, p): # assert log_emission_baf_c1.shape[0] == n_states or log_emission_baf_c1.shape[0] == 2 * n_states # # likelihood under the corresponding state -# llf_original = np.append(scipy.special.logsumexp(log_emission_baf_c1[:, bidx] + log_gamma_c1[:, bidx], axis=0), +# llf_original = np.append(scipy.special.logsumexp(log_emission_baf_c1[:, bidx] + log_gamma_c1[:, bidx], axis=0), # scipy.special.logsumexp(log_emission_baf_c2[:, bidx] + log_gamma_c2[:, bidx], axis=0)) # # likelihood under the switched state # if log_emission_baf_c1.shape[0] == 2 * n_states: @@ -773,7 +1460,7 @@ def combine_similar_states_across_clones(X, base_nb_mean, total_bb_RD, res, para # else: # switch_log_gamma_c1 = log_gamma_c2 # switch_log_gamma_c2 = log_gamma_c1 -# llf_switch = np.append(scipy.special.logsumexp(log_emission_baf_c1[:, bidx] + switch_log_gamma_c1[:, bidx], axis=0), +# llf_switch = np.append(scipy.special.logsumexp(log_emission_baf_c1[:, bidx] + switch_log_gamma_c1[:, bidx], axis=0), # scipy.special.logsumexp(log_emission_baf_c2[:, bidx] + switch_log_gamma_c2[:, bidx], axis=0)) # # log likelihood difference # return np.mean(llf_original) - np.mean(llf_switch) diff --git a/src/calicost/hmm_NB_sharedstates.py b/src/calicost/hmm_NB_sharedstates.py index a265810..8722ef3 100644 --- a/src/calicost/hmm_NB_sharedstates.py +++ b/src/calicost/hmm_NB_sharedstates.py @@ -20,8 +20,8 @@ def convert_params(mean, std): See https://mathworld.wolfram.com/NegativeBinomialDistribution.html """ - p = mean/std**2 - n = mean*p/(1.0 - p) + p = mean / std**2 + n = mean * p / (1.0 - p) return n, p @@ -31,6 +31,7 @@ def __init__(self, endog, exog, weights, exposure, seed=0, **kwds): self.weights = weights self.exposure = exposure self.seed = seed + # def nloglikeobs(self, params): nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure @@ -39,19 +40,20 @@ def nloglikeobs(self, params): llf = scipy.stats.nbinom.logpmf(self.endog, n, p) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): - self.exog_names.append('alpha') + self.exog_names.append("alpha") if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - - return super(Weighted_NegativeBinomial, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) + + return super(Weighted_NegativeBinomial, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class ConstrainedNBHMM(BaseHMM): @@ -84,18 +86,34 @@ class ConstrainedNBHMM(BaseHMM): hmmmodel.fit( X ) hmmmodel.predict( X ) """ - def __init__(self, n_components=1, shared_dispersion=False, - startprob_prior=1.0, transmat_prior=1.0, - algorithm="viterbi", random_state=None, - n_iter=10, tol=1e-2, verbose=False, - params="stma", - init_params=""): - BaseHMM.__init__(self, n_components, - startprob_prior=startprob_prior, - 
transmat_prior=transmat_prior, algorithm=algorithm, - random_state=random_state, n_iter=n_iter, - tol=tol, params=params, verbose=verbose, - init_params=init_params) + + def __init__( + self, + n_components=1, + shared_dispersion=False, + startprob_prior=1.0, + transmat_prior=1.0, + algorithm="viterbi", + random_state=None, + n_iter=10, + tol=1e-2, + verbose=False, + params="stma", + init_params="", + ): + BaseHMM.__init__( + self, + n_components, + startprob_prior=startprob_prior, + transmat_prior=transmat_prior, + algorithm=algorithm, + random_state=random_state, + n_iter=n_iter, + tol=tol, + params=params, + verbose=verbose, + init_params=init_params, + ) self.shared_dispersion = shared_dispersion # initialize CNV's effect self.log_mu = np.linspace(-0.1, 0.1, self.n_components) @@ -105,8 +123,13 @@ def __init__(self, n_components=1, shared_dispersion=False, # initialize start probability and transition probability self.startprob_ = np.ones(self.n_components) / self.n_components t = 0.9 - self.transmat_ = np.ones((self.n_components, self.n_components)) * (1-t) / (self.n_components-1) + self.transmat_ = ( + np.ones((self.n_components, self.n_components)) + * (1 - t) + / (self.n_components - 1) + ) np.fill_diagonal(self.transmat_, t) + # def _compute_log_likelihood(self, X): """ @@ -131,16 +154,21 @@ def _compute_log_likelihood(self, X): nb_std = np.sqrt(nb_mean + self.alphas[i] * nb_mean**2) # nb_std = np.sqrt(nb_mean + self.alphas[i,:].reshape(-1,1) * nb_mean**2) n, p = convert_params(nb_mean, nb_std) - log_prob[:,:,i] = scipy.stats.nbinom.logpmf(X[:, :n_cells], n, p) + log_prob[:, :, i] = scipy.stats.nbinom.logpmf(X[:, :n_cells], n, p) return log_prob.mean(axis=1) + # def _initialize_sufficient_statistics(self): stats = super()._initialize_sufficient_statistics() return stats + # - def _accumulate_sufficient_statistics(self, stats, X, lattice, posteriors, fwdlattice, bwdlattice): + def _accumulate_sufficient_statistics( + self, stats, X, lattice, posteriors, fwdlattice, bwdlattice + ): super()._accumulate_sufficient_statistics( - stats, X, lattice, posteriors, fwdlattice, bwdlattice) + stats, X, lattice, posteriors, fwdlattice, bwdlattice + ) """ Update sufficient statistics from a given sample. Parameters @@ -160,62 +188,85 @@ def _accumulate_sufficient_statistics(self, stats, X, lattice, posteriors, fwdla fwdlattice, bwdlattice : array, shape (n_genes, n_components) forward and backward probabilities. 
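        Notes
        -----
        When 't' is in self.params, the per-position state occupancy is recovered
        directly from the lattices: gamma is the softmax of fwdlattice + bwdlattice
        over states at each position, and stats['denoms'][i] = sum_t gamma[t, i] is
        the expected number of positions occupied by state i. _do_mstep later uses
        these totals when re-estimating the shared diagonal of the transition matrix
        through form_transition_matrix.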
""" - if 'm' in self.params or 'a' in self.params: - stats['post'] = posteriors - stats['obs'] = X - if 't' in self.params: + if "m" in self.params or "a" in self.params: + stats["post"] = posteriors + stats["obs"] = X + if "t" in self.params: # for each ij, recover sum_t xi_ij from the inferred transition matrix bothlattice = fwdlattice + bwdlattice - loggamma = (bothlattice.T - logsumexp(bothlattice, axis = 1)).T + loggamma = (bothlattice.T - logsumexp(bothlattice, axis=1)).T # denominator for each ij is the sum of gammas over i - denoms = np.sum(np.exp(loggamma), axis = 0) + denoms = np.sum(np.exp(loggamma), axis=0) # transpose to perform row-wise multiplication - stats['denoms'] = denoms + stats["denoms"] = denoms + # def _do_mstep(self, stats): - n_genes = stats['obs'].shape[0] - n_cells = int(stats['obs'].shape[1] / 2) - base_nb_mean = stats['obs'][:, n_cells:] + n_genes = stats["obs"].shape[0] + n_cells = int(stats["obs"].shape[1] / 2) + base_nb_mean = stats["obs"][:, n_cells:] super()._do_mstep(stats) - if 'm' in self.params and 'a' in self.params: + if "m" in self.params and "a" in self.params: # NB regression fit dispersion and CNV's effect simultaneously if not self.shared_dispersion: for i in range(self.n_components): - model = Weighted_NegativeBinomial(stats['obs'][:, :n_cells].flatten(), \ - np.ones(n_genes*n_cells).reshape(-1,1), \ - weights=np.repeat(stats['post'][:,i], n_cells), exposure=base_nb_mean.flatten()) + model = Weighted_NegativeBinomial( + stats["obs"][:, :n_cells].flatten(), + np.ones(n_genes * n_cells).reshape(-1, 1), + weights=np.repeat(stats["post"][:, i], n_cells), + exposure=base_nb_mean.flatten(), + ) res = model.fit(disp=0, maxiter=500) self.log_mu[i] = res.params[0] self.alphas[i] = res.params[-1] # self.alphas[i,:] = res.params[-1] else: all_states_nb_mean = np.tile(base_nb_mean.flatten(), self.n_components) - all_states_y = np.tile(stats['obs'][:, :n_cells].flatten(), self.n_components) - all_states_weights = np.concatenate([np.repeat(stats['post'][:,i], n_cells) for i in range(self.n_components)]) - all_states_features = np.zeros((self.n_components*n_genes*n_cells, self.n_components)) + all_states_y = np.tile( + stats["obs"][:, :n_cells].flatten(), self.n_components + ) + all_states_weights = np.concatenate( + [ + np.repeat(stats["post"][:, i], n_cells) + for i in range(self.n_components) + ] + ) + all_states_features = np.zeros( + (self.n_components * n_genes * n_cells, self.n_components) + ) for i in np.arange(self.n_components): - all_states_features[(i*n_genes*n_cells):((i+1)*n_genes*n_cells), i] = 1 - model = Weighted_NegativeBinomial(all_states_y, all_states_features, weights=all_states_weights, exposure=all_states_nb_mean) + all_states_features[ + (i * n_genes * n_cells) : ((i + 1) * n_genes * n_cells), i + ] = 1 + model = Weighted_NegativeBinomial( + all_states_y, + all_states_features, + weights=all_states_weights, + exposure=all_states_nb_mean, + ) res = model.fit(disp=0, maxiter=500) self.log_mu = res.params[:-1] self.alphas[:] = res.params[-1] # self.alphas[:,:] = res.params[-1] # print(res.params) - elif 'm' in self.params: + elif "m" in self.params: # NB regression fit CNV's effect only for i in range(self.n_components): - model = sm.GLM(stats['obs'].flatten(), np.ones(self.n_genes*self.n_cells).reshape(-1,1), \ - family=sm.families.NegativeBinomial(alpha=self.alphas[i]), \ - exposure=base_nb_mean.flatten()) + model = sm.GLM( + stats["obs"].flatten(), + np.ones(self.n_genes * self.n_cells).reshape(-1, 1), + 
family=sm.families.NegativeBinomial(alpha=self.alphas[i]), + exposure=base_nb_mean.flatten(), + ) # model = sm.GLM(stats['obs'][:, :n_cells].flatten(), np.ones(n_genes*n_cells).reshape(-1,1), \ # family=sm.families.NegativeBinomial(alpha=np.repeat(self.alphas[i], n_cells)), \ # exposure=base_nb_mean.flatten(), var_weights=np.repeat(stats['post'][:,i], n_cells)) res = model.fit(disp=0, maxiter=500) self.log_mu[i] = res.params[0] - if 't' in self.params: + if "t" in self.params: # following copied from Matt's code - denoms = stats['denoms'] + denoms = stats["denoms"] x = (self.transmat_.T * denoms).T # numerator is the sum of ii elements @@ -224,17 +275,18 @@ def _do_mstep(self, stats): denom = np.sum(x) # (this is the same as sum_i gamma_i) - #assert np.isclose(denom, np.sum(denoms)) + # assert np.isclose(denom, np.sum(denoms)) + + stats["diag"] = num / denom + self.transmat_ = self.form_transition_matrix(stats["diag"]) - stats['diag'] = num / denom - self.transmat_ = self.form_transition_matrix(stats['diag']) # def form_transition_matrix(self, diag): tol = 1e-10 diag = np.clip(diag, tol, 1 - tol) - + offdiag = (1 - diag) / (self.n_components - 1) - transmat_ = np.diag([diag - offdiag] * self.n_components) + transmat_ = np.diag([diag - offdiag] * self.n_components) transmat_ += offdiag - #assert np.all(transmat_ > 0), (diag, offdiag, transmat_) - return transmat_ \ No newline at end of file + # assert np.all(transmat_ > 0), (diag, offdiag, transmat_) + return transmat_ diff --git a/src/calicost/hmm_gaussian.py b/src/calicost/hmm_gaussian.py index b053610..0570a8f 100644 --- a/src/calicost/hmm_gaussian.py +++ b/src/calicost/hmm_gaussian.py @@ -14,6 +14,7 @@ # E step related ############################################################ + @njit def np_max_ax_squeeze(arr, axis=0): assert arr.ndim == 2 @@ -34,11 +35,11 @@ def np_max_ax_keep(arr, axis=0): assert arr.ndim == 2 assert axis in [0, 1] if axis == 0: - result = np.zeros( (1, arr.shape[1]) ) + result = np.zeros((1, arr.shape[1])) for i in range(result.shape[1]): result[:, i] = np.max(arr[:, i]) else: - result = np.zeros( (arr.shape[0], 1) ) + result = np.zeros((arr.shape[0], 1)) for i in range(result.shape[0]): result[i, :] = np.max(arr[i, :]) return result @@ -64,11 +65,11 @@ def np_sum_ax_keep(arr, axis=0): assert arr.ndim == 2 assert axis in [0, 1] if axis == 0: - result = np.zeros( (1, arr.shape[1]) ) + result = np.zeros((1, arr.shape[1])) for i in range(result.shape[1]): result[:, i] = np.sum(arr[:, i]) else: - result = np.zeros( (arr.shape[0], 1) ) + result = np.zeros((arr.shape[0], 1)) for i in range(result.shape[0]): result[i, :] = np.sum(arr[i, :]) return result @@ -88,26 +89,27 @@ def np_mean_ax_squeeze(arr, axis=0): result[i] = np.mean(arr[i, :]) return result + @njit def np_mean_ax_keep(arr, axis=0): assert arr.ndim == 2 assert axis in [0, 1] if axis == 0: - result = np.zeros( (1, arr.shape[1]) ) + result = np.zeros((1, arr.shape[1])) for i in range(result.shape[1]): result[:, i] = np.mean(arr[:, i]) else: - result = np.zeros( (arr.shape[0], 1) ) + result = np.zeros((arr.shape[0], 1)) for i in range(result.shape[0]): result[i, :] = np.mean(arr[i, :]) return result -@njit +@njit def mylogsumexp(a): # get max a_max = np.max(a) - if (np.isinf(a_max)): + if np.isinf(a_max): return a_max # exponential tmp = np.exp(a - a_max) @@ -117,7 +119,7 @@ def mylogsumexp(a): return s + a_max -@njit +@njit def mylogsumexp_ax_keep(a, axis): # get max a_max = np_max_ax_keep(a, axis=axis) @@ -133,7 +135,6 @@ def mylogsumexp_ax_keep(a, axis): 
return s + a_max - def compute_emission_probability_gaussian(X, rdr_mean, rdr_std, p_mean, p_std): """ Attributes @@ -158,7 +159,7 @@ def compute_emission_probability_gaussian(X, rdr_mean, rdr_std, p_mean, p_std): p_std : array, shape (n_states, n_spots) Over-dispersion of Beta Binomial distribution in HMM per state per spot. - + Returns ---------- log_emission : array, shape (2*n_states, n_obs, n_spots) @@ -173,19 +174,27 @@ def compute_emission_probability_gaussian(X, rdr_mean, rdr_std, p_mean, p_std): for i in np.arange(n_states): for s in np.arange(n_spots): # expression from Gaussian distribution - if np.any(X[:,0,s] > 0): - log_emission[i, :, s] = scipy.stats.norm.logpdf(X[:, 0, s], loc=rdr_mean[i,s], scale=rdr_std[i,s]) + if np.any(X[:, 0, s] > 0): + log_emission[i, :, s] = scipy.stats.norm.logpdf( + X[:, 0, s], loc=rdr_mean[i, s], scale=rdr_std[i, s] + ) log_emission[i + n_states, :, s] = log_emission[i, :, s] # BAF from Gaussian distribution - if np.any(X[:,1,s] > 0): - log_emission[i, :, s] += scipy.stats.norm.logpdf(X[:,1,s], loc=p_mean[i, s], scale=p_std[i,s]) - log_emission[i + n_states, :, s] += scipy.stats.norm.logpdf(X[:,1,s], loc=1-p_mean[i, s], scale=p_std[i,s]) + if np.any(X[:, 1, s] > 0): + log_emission[i, :, s] += scipy.stats.norm.logpdf( + X[:, 1, s], loc=p_mean[i, s], scale=p_std[i, s] + ) + log_emission[i + n_states, :, s] += scipy.stats.norm.logpdf( + X[:, 1, s], loc=1 - p_mean[i, s], scale=p_std[i, s] + ) return log_emission -@njit -def forward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' +@njit +def forward_lattice_sitewise( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat +): + """ Note that n_states is the CNV states, and there are 2 * n_states of paired states for (CNV, phasing) pairs. Input lengths: sum of lengths = n_observations. @@ -195,11 +204,15 @@ def forward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat: n_observations, the log transition probability of phase switch. Output log_alpha: size 2n_states * n_observations. log alpha[j, t] = log P(o_1, ... o_t, q_t = j | lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = int(np.ceil(log_emission.shape[0] / 2)) - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" log_sitewise_self_transmat = np.log(1 - np.exp(log_sitewise_transmat)) # initialize log_alpha log_alpha = np.zeros((log_emission.shape[0], n_obs)) @@ -207,25 +220,42 @@ def forward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission, cumlen = 0 for le in lengths: # start prob - combined_log_startprob = np.log(0.5) + np.append(log_startprob,log_startprob) - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + combined_log_startprob = np.log(0.5) + np.append(log_startprob, log_startprob) + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. 
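# --- Illustrative aside (not part of the original patch) -----------------------
# Summing iid per-spot log emissions, as the recursion below does, grows linearly
# with the number of spots, so with many spots the emission term can swamp the
# phase-switch and transition terms. Averaging over spots instead (the route taken
# by ConstrainedNBHMM._compute_log_likelihood in hmm_NB_sharedstates.py via
# log_prob.mean(axis=1)) keeps the term on a per-spot scale. All arrays here are
# random stand-ins.
import numpy as np

rng = np.random.default_rng(0)
log_emission_spots = rng.normal(loc=-2.0, scale=0.5, size=(6, 100, 30))  # (2*n_states, n_obs, n_spots)
summed = log_emission_spots.sum(axis=2)     # scale grows with n_spots
averaged = log_emission_spots.mean(axis=2)  # stays on a per-spot scale
print(np.abs(summed).mean() / np.abs(averaged).mean())  # roughly n_spots (= 30) times larger
# --------------------------------------------------------------------------------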
- log_alpha[:, cumlen] = combined_log_startprob + np_sum_ax_squeeze(log_emission[:, cumlen, :], axis=1) + log_alpha[:, cumlen] = combined_log_startprob + np_sum_ax_squeeze( + log_emission[:, cumlen, :], axis=1 + ) for t in np.arange(1, le): - phases_switch_mat = np.array([[log_sitewise_self_transmat[cumlen + t-1], log_sitewise_transmat[cumlen + t-1]], [log_sitewise_transmat[cumlen + t-1], log_sitewise_self_transmat[cumlen + t-1] ]]) - combined_transmat = np.kron( np.exp(phases_switch_mat), np.exp(log_transmat) ) + phases_switch_mat = np.array( + [ + [ + log_sitewise_self_transmat[cumlen + t - 1], + log_sitewise_transmat[cumlen + t - 1], + ], + [ + log_sitewise_transmat[cumlen + t - 1], + log_sitewise_self_transmat[cumlen + t - 1], + ], + ] + ) + combined_transmat = np.kron(np.exp(phases_switch_mat), np.exp(log_transmat)) combined_transmat = np.log(combined_transmat) for j in np.arange(log_emission.shape[0]): for i in np.arange(log_emission.shape[0]): buf[i] = log_alpha[i, (cumlen + t - 1)] + combined_transmat[i, j] - log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum(log_emission[j, (cumlen + t), :]) + log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum( + log_emission[j, (cumlen + t), :] + ) cumlen += le return log_alpha -@njit -def backward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): - ''' +@njit +def backward_lattice_sitewise( + lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat +): + """ Note that n_states is the CNV states, and there are 2 * n_states of paired states for (CNV, phasing) pairs. Input X: size n_observations * n_components * n_spots. @@ -236,11 +266,15 @@ def backward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission log_sitewise_transmat: n_observations, the log transition probability of phase switch. Output log_beta: size 2*n_states * n_observations. log beta[i, t] = log P(o_{t+1}, ..., o_T | q_t = i, lambda). - ''' + """ n_obs = log_emission.shape[1] n_states = int(np.ceil(log_emission.shape[0] / 2)) - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" - assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the first dimension of X!" + assert ( + len(log_startprob) == n_states + ), "Length of startprob_ must be equal to the first dimension of log_transmat!" log_sitewise_self_transmat = np.log(1 - np.exp(log_sitewise_transmat)) # initialize log_beta log_beta = np.zeros((log_emission.shape[0], n_obs)) @@ -248,29 +282,44 @@ def backward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission cumlen = 0 for le in lengths: # start prob - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. 
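# --- Illustrative aside (not part of the original patch) -----------------------
# The recursion below combines phasing and CNV state into one transition matrix
# via a Kronecker product: a 2x2 phase-switch matrix with the n_states x n_states
# CNV transition matrix, i.e. phase and CNV state are assumed to switch
# independently between adjacent bins. A quick check that the combined matrix has
# the expected shape and stays row-stochastic (all numbers here are made up):
import numpy as np

n_states = 3
t = 0.9
cnv_transmat = np.full((n_states, n_states), (1 - t) / (n_states - 1))
np.fill_diagonal(cnv_transmat, t)
p_switch = 0.05  # hypothetical phase-switch probability at one bin boundary
phase_switch_mat = np.array([[1 - p_switch, p_switch],
                             [p_switch, 1 - p_switch]])
combined = np.kron(phase_switch_mat, cnv_transmat)
assert combined.shape == (2 * n_states, 2 * n_states)
assert np.allclose(combined.sum(axis=1), 1.0)
# --------------------------------------------------------------------------------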
log_beta[:, (cumlen + le - 1)] = 0 - for t in np.arange(le-2, -1, -1): - phases_switch_mat = np.array([[log_sitewise_self_transmat[cumlen + t], log_sitewise_transmat[cumlen + t]], [log_sitewise_transmat[cumlen + t], log_sitewise_self_transmat[cumlen + t] ]]) - combined_transmat = np.kron( np.exp(phases_switch_mat), np.exp(log_transmat) ) + for t in np.arange(le - 2, -1, -1): + phases_switch_mat = np.array( + [ + [ + log_sitewise_self_transmat[cumlen + t], + log_sitewise_transmat[cumlen + t], + ], + [ + log_sitewise_transmat[cumlen + t], + log_sitewise_self_transmat[cumlen + t], + ], + ] + ) + combined_transmat = np.kron(np.exp(phases_switch_mat), np.exp(log_transmat)) combined_transmat = np.log(combined_transmat) for i in np.arange(log_emission.shape[0]): for j in np.arange(log_emission.shape[0]): - buf[j] = log_beta[j, (cumlen + t + 1)] + combined_transmat[i, j] + np.sum(log_emission[j, (cumlen + t + 1), :]) + buf[j] = ( + log_beta[j, (cumlen + t + 1)] + + combined_transmat[i, j] + + np.sum(log_emission[j, (cumlen + t + 1), :]) + ) log_beta[i, (cumlen + t)] = mylogsumexp(buf) cumlen += le return log_beta def compute_posterior_obs(log_alpha, log_beta): - ''' + """ Input log_alpha: output from forward_lattice_gaussian. size n_states * n_observations. alpha[j, t] = P(o_1, ... o_t, q_t = j | lambda). log_beta: output from backward_lattice_gaussian. size n_states * n_observations. beta[i, t] = P(o_{t+1}, ..., o_T | q_t = i, lambda). Output: log_gamma: size n_states * n_observations. gamma[i,t] = P(q_t = i | O, lambda). gamma[i, t] propto alpha[i,t] * beta[i,t] - ''' + """ n_states = log_alpha.shape[0] n_obs = log_alpha.shape[1] # initial log_gamma @@ -280,15 +329,17 @@ def compute_posterior_obs(log_alpha, log_beta): # for t in np.arange(n_obs): # log_gamma[j, t] = log_alpha[j, t] + log_beta[j, t] log_gamma = log_alpha + log_beta - if np.any( np.sum(log_gamma, axis=0) == 0 ): + if np.any(np.sum(log_gamma, axis=0) == 0): raise Exception("Sum of posterior probability is zero for some observations!") log_gamma -= scipy.special.logsumexp(log_gamma, axis=0) return log_gamma @njit -def compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log_emission): - ''' +def compute_posterior_transition_sitewise( + log_alpha, log_beta, log_transmat, log_emission +): + """ Input log_alpha: output from forward_lattice_gaussian. size n_states * n_observations. alpha[j, t] = P(o_1, ... o_t, q_t = j | lambda). log_beta: output from backward_lattice_gaussian. size n_states * n_observations. beta[i, t] = P(o_{t+1}, ..., o_T | q_t = i, lambda). @@ -296,20 +347,28 @@ def compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log log_emission: n_states * n_observations * n_spots. Log probability. Output: log_xi: size n_states * n_states * (n_observations-1). xi[i,j,t] = P(q_t=i, q_{t+1}=j | O, lambda) - ''' + """ n_states = int(log_alpha.shape[0] / 2) n_obs = log_alpha.shape[1] # initialize log_xi - log_xi = np.zeros((2*n_states, 2*n_states, n_obs-1)) + log_xi = np.zeros((2 * n_states, 2 * n_states, n_obs - 1)) # compute log_xi - for i in np.arange(2*n_states): - for j in np.arange(2*n_states): - for t in np.arange(n_obs-1): - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + for i in np.arange(2 * n_states): + for j in np.arange(2 * n_states): + for t in np.arange(n_obs - 1): + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. 
# But adding too many spots may lead to a higher weight of the emission rather then transition prob. - log_xi[i, j, t] = log_alpha[i, t] + log_transmat[i - n_states * int(i/n_states), j - n_states * int(j/n_states)] + np.sum(log_emission[j, t+1, :]) + log_beta[j, t+1] + log_xi[i, j, t] = ( + log_alpha[i, t] + + log_transmat[ + i - n_states * int(i / n_states), + j - n_states * int(j / n_states), + ] + + np.sum(log_emission[j, t + 1, :]) + + log_beta[j, t + 1] + ) # normalize - for t in np.arange(n_obs-1): + for t in np.arange(n_obs - 1): log_xi[:, :, t] -= mylogsumexp(log_xi[:, :, t]) return log_xi @@ -319,16 +378,18 @@ def compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log ############################################################ @njit def update_startprob_sitewise(lengths, log_gamma): - ''' + """ Input lengths: sum of lengths = n_observations. log_gamma: size 2 * n_states * n_observations. gamma[i,t] = P(q_t = i | O, lambda). Output log_startprob: n_states. Start probability after loog transformation. - ''' + """ n_states = int(log_gamma.shape[0] / 2) n_obs = log_gamma.shape[1] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the second dimension of log_gamma!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the second dimension of log_gamma!" # indices of the start of sequences, given that the length of each sequence is in lengths cumlen = 0 indices_start = [] @@ -341,7 +402,7 @@ def update_startprob_sitewise(lengths, log_gamma): # compute log_startprob of 2 * n_states log_startprob = mylogsumexp_ax_keep(log_gamma[:, indices_start], axis=1) # merge (CNV state, phase A) and (CNV state, phase B) - log_startprob = log_startprob.flatten().reshape(2,-1) + log_startprob = log_startprob.flatten().reshape(2, -1) log_startprob = mylogsumexp_ax_keep(log_startprob, axis=0) # normalize such that startprob sums to 1 log_startprob -= mylogsumexp(log_startprob) @@ -349,20 +410,28 @@ def update_startprob_sitewise(lengths, log_gamma): def update_transition_sitewise(log_xi, is_diag=False): - ''' + """ Input log_xi: size (2*n_states) * (2*n_states) * n_observations. xi[i,j,t] = P(q_t=i, q_{t+1}=j | O, lambda) Output log_transmat: n_states * n_states. Transition probability after log transformation. 
- ''' + """ n_states = int(log_xi.shape[0] / 2) n_obs = log_xi.shape[2] # initialize log_transmat log_transmat = np.zeros((n_states, n_states)) for i in np.arange(n_states): for j in np.arange(n_states): - log_transmat[i, j] = scipy.special.logsumexp( np.concatenate([log_xi[i, j, :], log_xi[i+n_states, j, :], \ - log_xi[i, j+n_states, :], log_xi[i + n_states, j + n_states, :]]) ) + log_transmat[i, j] = scipy.special.logsumexp( + np.concatenate( + [ + log_xi[i, j, :], + log_xi[i + n_states, j, :], + log_xi[i, j + n_states, :], + log_xi[i + n_states, j + n_states, :], + ] + ) + ) # row normalize log_transmat if not is_diag: for i in np.arange(n_states): @@ -372,7 +441,7 @@ def update_transition_sitewise(log_xi, is_diag=False): diagsum = scipy.special.logsumexp(np.diag(log_transmat)) totalsum = scipy.special.logsumexp(log_transmat) t = diagsum - totalsum - rest = np.log( (1 - np.exp(t)) / (n_states-1) ) + rest = np.log((1 - np.exp(t)) / (n_states - 1)) log_transmat = np.ones(log_transmat.shape) * rest np.fill_diagonal(log_transmat, t) return log_transmat @@ -384,7 +453,7 @@ def weighted_gaussian_fitting(x, weights): weights : 1d array """ mu = weights.dot(x) / np.sum(weights) - v = weights.dot( np.square(x - mu) ) / np.sum(weights) + v = weights.dot(np.square(x - mu)) / np.sum(weights) std = np.sqrt(v) return mu, std @@ -399,15 +468,16 @@ def weighted_gaussian_fitting_sharestd(X, Weights): mus = np.zeros(n_clusters) ssr = np.zeros(X.shape) for i in range(n_clusters): - mus[i] = Weights[:,i].dot(X[:,i]) / np.sum(Weights[:,i]) - ssr[:,i] = np.square(X[:,i] - mus[i]) + mus[i] = Weights[:, i].dot(X[:, i]) / np.sum(Weights[:, i]) + ssr[:, i] = np.square(X[:, i] - mus[i]) v = Weights.flatten().dot(ssr.flatten()) / np.sum(Weights) stds = np.ones(n_clusters) * np.sqrt(v) return mus, stds -def update_emission_params_rdr_sitewise(X_rdr, log_gamma, rdr_std, \ - start_rdr_mean=None, shared_rdr_std=False): +def update_emission_params_rdr_sitewise( + X_rdr, log_gamma, rdr_std, start_rdr_mean=None, shared_rdr_std=False +): """ Attributes ---------- @@ -423,26 +493,41 @@ def update_emission_params_rdr_sitewise(X_rdr, log_gamma, rdr_std, \ n_spots = X_rdr.shape[1] n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) - new_rdr_mean = copy.copy(start_rdr_mean) if not start_rdr_mean is None else np.ones((n_states, n_spots)) + new_rdr_mean = ( + copy.copy(start_rdr_mean) + if not start_rdr_mean is None + else np.ones((n_states, n_spots)) + ) new_rdr_std = copy.copy(rdr_std) # expression signal by NB distribution if not shared_rdr_std: for s in range(n_spots): for i in range(n_states): - mu, std = weighted_gaussian_fitting( X_rdr[:,s], gamma[i,:]+gamma[i+n_states,:] ) + mu, std = weighted_gaussian_fitting( + X_rdr[:, s], gamma[i, :] + gamma[i + n_states, :] + ) new_rdr_mean[i, s] = mu - new_rdr_std[i,s] = std + new_rdr_std[i, s] = std else: for s in range(n_spots): - mus, stds = weighted_gaussian_fitting_sharestd( np.vstack([ X_rdr[:,s] for i in range(n_states) ]).T, \ - (gamma[:n_states, :] + gamma[n_states:, :]).T ) - new_rdr_mean[:,s] = mus - new_rdr_std[:,s] = stds + mus, stds = weighted_gaussian_fitting_sharestd( + np.vstack([X_rdr[:, s] for i in range(n_states)]).T, + (gamma[:n_states, :] + gamma[n_states:, :]).T, + ) + new_rdr_mean[:, s] = mus + new_rdr_std[:, s] = stds return new_rdr_mean, new_rdr_std -def update_emission_params_baf_sitewise(X_baf, log_gamma, p_std, \ - start_p_mean=None, shared_p_std=False, min_binom_prob=0.01, max_binom_prob=0.99): +def update_emission_params_baf_sitewise( + 
X_baf, + log_gamma, + p_std, + start_p_mean=None, + shared_p_std=False, + min_binom_prob=0.01, + max_binom_prob=0.99, +): """ Attributes ---------- @@ -459,23 +544,32 @@ def update_emission_params_baf_sitewise(X_baf, log_gamma, p_std, \ n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) # initialization - new_p_mean = copy.copy(start_p_mean) if not start_p_mean is None else np.ones((n_states, n_spots)) * 0.5 + new_p_mean = ( + copy.copy(start_p_mean) + if not start_p_mean is None + else np.ones((n_states, n_spots)) * 0.5 + ) new_p_std = copy.copy(p_std) if not shared_p_std: for s in np.arange(X_baf.shape[1]): for i in range(n_states): - mu, std = weighted_gaussian_fitting( np.append(X_baf[:,s], 1-X_baf[:,s]), np.append(gamma[i,:], gamma[i+n_states,:]) ) + mu, std = weighted_gaussian_fitting( + np.append(X_baf[:, s], 1 - X_baf[:, s]), + np.append(gamma[i, :], gamma[i + n_states, :]), + ) new_p_mean[i, s] = mu new_p_std[i, s] = std else: for s in np.arange(X_baf.shape[1]): - concat_X_baf = np.append(X_baf[:,s], 1-X_baf[:,s]) - concat_gamma = np.hstack([gamma[:n_states,:], gamma[n_states:, :]]) - mus, stds = weighted_gaussian_fitting_sharestd( np.vstack([ concat_X_baf for i in range(n_states) ]).T, concat_gamma.T) - new_p_mean[:,s] = mus - new_p_std[:,s] = stds - new_p_mean[new_p_mean[:,s] < min_binom_prob, s] = min_binom_prob - new_p_mean[new_p_mean[:,s] > max_binom_prob, s] = max_binom_prob + concat_X_baf = np.append(X_baf[:, s], 1 - X_baf[:, s]) + concat_gamma = np.hstack([gamma[:n_states, :], gamma[n_states:, :]]) + mus, stds = weighted_gaussian_fitting_sharestd( + np.vstack([concat_X_baf for i in range(n_states)]).T, concat_gamma.T + ) + new_p_mean[:, s] = mus + new_p_std[:, s] = stds + new_p_mean[new_p_mean[:, s] < min_binom_prob, s] = min_binom_prob + new_p_mean[new_p_mean[:, s] > max_binom_prob, s] = max_binom_prob return new_p_mean, new_p_std @@ -483,8 +577,9 @@ def update_emission_params_baf_sitewise(X_baf, log_gamma, p_std, \ # whole inference ############################################################ + class hmm_gaussian_sitewise(object): - def __init__(self, params="stmp", t=1-1e-4): + def __init__(self, params="stmp", t=1 - 1e-4): """ Attributes ---------- @@ -496,11 +591,25 @@ def __init__(self, params="stmp", t=1-1e-4): """ self.params = params self.t = t + # - def run_baum_welch_nb_bb_sitewise(self, X, lengths, n_states, log_sitewise_transmat, \ - shared_rdr_std=False, shared_p_std=False, \ - is_diag=False, init_rdr_mean=None, init_p_mean=None, init_rdr_std=None, init_p_std=None, max_iter=100, tol=1e-4): - ''' + def run_baum_welch_nb_bb_sitewise( + self, + X, + lengths, + n_states, + log_sitewise_transmat, + shared_rdr_std=False, + shared_p_std=False, + is_diag=False, + init_rdr_mean=None, + init_p_mean=None, + init_rdr_std=None, + init_p_std=None, + max_iter=100, + tol=1e-4, + ): + """ Input X: size n_observations * n_components * n_spots. lengths: sum of lengths = n_observations. @@ -509,33 +618,59 @@ def run_baum_welch_nb_bb_sitewise(self, X, lengths, n_states, log_sitewise_trans Intermediate log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. alpha: size of n_states. Dispersioon parameter of each HMM state. 
- ''' + """ n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 # initialize NB logmean shift and BetaBinom prob - rdr_mean = np.vstack([np.linspace(0.5, 3, n_states) for r in range(n_spots)]).T if init_rdr_mean is None else init_rdr_mean - p_mean = np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_mean is None else init_p_mean + rdr_mean = ( + np.vstack([np.linspace(0.5, 3, n_states) for r in range(n_spots)]).T + if init_rdr_mean is None + else init_rdr_mean + ) + p_mean = ( + np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T + if init_p_mean is None + else init_p_mean + ) # initialize (inverse of) dispersion param in NB and BetaBinom - rdr_std = 0.5 * np.ones((n_states, n_spots)) if init_rdr_std is None else init_rdr_std + rdr_std = ( + 0.5 * np.ones((n_states, n_spots)) if init_rdr_std is None else init_rdr_std + ) p_std = 0.1 * np.ones((n_states, n_spots)) if init_p_std is None else init_p_std # initialize start probability and emission probability - log_startprob = np.log( np.ones(n_states) / n_states ) + log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: - transmat = np.ones((n_states, n_states)) * (1-self.t) / (n_states-1) + transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: - log_transmat = np.zeros((1,1)) + log_transmat = np.zeros((1, 1)) # EM algorithm for r in trange(max_iter): # E step - log_emission = compute_emission_probability_gaussian(X, rdr_mean, rdr_std, p_mean, p_std) - log_alpha = forward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) - log_beta = backward_lattice_sitewise(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) + log_emission = compute_emission_probability_gaussian( + X, rdr_mean, rdr_std, p_mean, p_std + ) + log_alpha = forward_lattice_sitewise( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) + log_beta = backward_lattice_sitewise( + lengths, + log_transmat, + log_startprob, + log_emission, + log_sitewise_transmat, + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log_emission) + log_xi = compute_posterior_transition_sitewise( + log_alpha, log_beta, log_transmat, log_emission + ) # M step if "s" in self.params: new_log_startprob = update_startprob_sitewise(lengths, log_gamma) @@ -547,24 +682,40 @@ def run_baum_welch_nb_bb_sitewise(self, X, lengths, n_states, log_sitewise_trans else: new_log_transmat = log_transmat if "m" in self.params: - new_rdr_mean, new_rdr_std = update_emission_params_rdr_sitewise(X[:,0,:], log_gamma, rdr_std, start_rdr_mean=rdr_mean, shared_rdr_std=shared_rdr_std) + new_rdr_mean, new_rdr_std = update_emission_params_rdr_sitewise( + X[:, 0, :], + log_gamma, + rdr_std, + start_rdr_mean=rdr_mean, + shared_rdr_std=shared_rdr_std, + ) else: new_rdr_mean = rdr_mean new_rdr_std = rdr_std if "p" in self.params: - new_p_mean, new_p_std = update_emission_params_baf_sitewise(X[:,1,:], log_gamma, p_std, start_p_mean=p_mean, \ - shared_p_std=shared_p_std) + new_p_mean, new_p_std = update_emission_params_baf_sitewise( + X[:, 1, :], + log_gamma, + p_std, + start_p_mean=p_mean, + shared_p_std=shared_p_std, + ) else: new_p_mean = p_mean new_p_std = p_std # check convergence - print( np.mean(np.abs( np.exp(new_log_startprob) - np.exp(log_startprob) )), \ - 
np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )), \ - np.mean(np.abs(new_rdr_mean - rdr_mean)),\ - np.mean(np.abs(new_p_mean - p_mean)) ) - print( np.hstack([new_rdr_mean, new_p_mean]) ) - if np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )) < tol and \ - np.mean(np.abs(new_rdr_mean - rdr_mean)) < tol and np.mean(np.abs(new_p_mean - p_mean)) < tol: + print( + np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), + np.mean(np.abs(new_rdr_mean - rdr_mean)), + np.mean(np.abs(new_p_mean - p_mean)), + ) + print(np.hstack([new_rdr_mean, new_p_mean])) + if ( + np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol + and np.mean(np.abs(new_rdr_mean - rdr_mean)) < tol + and np.mean(np.abs(new_p_mean - p_mean)) < tol + ): break log_startprob = new_log_startprob log_transmat = new_log_transmat @@ -572,7 +723,14 @@ def run_baum_welch_nb_bb_sitewise(self, X, lengths, n_states, log_sitewise_trans rdr_std = new_rdr_std p_mean = new_p_mean p_std = new_p_std - return new_rdr_mean, new_rdr_std, new_p_mean, new_p_std, new_log_startprob, new_log_transmat + return ( + new_rdr_mean, + new_rdr_std, + new_p_mean, + new_p_std, + new_log_startprob, + new_log_transmat, + ) # def posterior_nb_bb_sitewise(X, lengths, rdr_mean, rdr_std, p_mean, p_std, log_startprob, log_transmat, log_sitewise_transmat): @@ -705,7 +863,7 @@ def run_baum_welch_nb_bb_sitewise(self, X, lengths, n_states, log_sitewise_trans # log_v[i, t] = np.max(tmp) # # backtracking to get the sequence # chr_labels = [ np.argmax(log_v[:,-1]) ] - + # if cumlen == 0: # for t2 in np.arange(le-1, 0, -1): # chr_labels.append( int(bt[chr_labels[-1],t2])) @@ -717,51 +875,57 @@ def run_baum_welch_nb_bb_sitewise(self, X, lengths, n_states, log_sitewise_trans # # merge two phases # chr_merged_labels = copy.copy(chr_labels) # chr_merged_labels[chr_merged_labels >= n_states] = chr_merged_labels[chr_merged_labels >= n_states] - n_states - + # if cumlen == 0: # labels = chr_labels # merged_labels = chr_merged_labels # else: # labels = np.append(labels, chr_labels) # merged_labels = np.append(merged_labels, chr_merged_labels) - + # cumlen += le # return labels, merged_labels from sklearn.mixture import GaussianMixture -def initialization_gaussianhmm_by_gmm(n_states, X, params, random_state=None, min_binom_prob=0.1, max_binom_prob=0.9): + + +def initialization_gaussianhmm_by_gmm( + n_states, X, params, random_state=None, min_binom_prob=0.1, max_binom_prob=0.9 +): # prepare gmm input of RDR and BAF separately X_gmm_rdr = None X_gmm_baf = None if "m" in params: - X_gmm_rdr = np.vstack([ X[:,0,s] for s in range(X.shape[2]) ]).T + X_gmm_rdr = np.vstack([X[:, 0, s] for s in range(X.shape[2])]).T if "p" in params: - X_gmm_baf = np.vstack([ X[:,1,s] for s in range(X.shape[2]) ]).T + X_gmm_baf = np.vstack([X[:, 1, s] for s in range(X.shape[2])]).T X_gmm_baf[X_gmm_baf < min_binom_prob] = min_binom_prob X_gmm_baf[X_gmm_baf > max_binom_prob] = max_binom_prob # combine RDR and BAF if ("m" in params) and ("p" in params): - indexes = np.where(X_gmm_baf[:,0] > 0.5)[0] - X_gmm_baf[indexes,:] = 1 - X_gmm_baf[indexes,:] + indexes = np.where(X_gmm_baf[:, 0] > 0.5)[0] + X_gmm_baf[indexes, :] = 1 - X_gmm_baf[indexes, :] X_gmm = np.hstack([X_gmm_rdr, X_gmm_baf]) elif "m" in params: X_gmm = X_gmm_rdr elif "p" in params: - indexes = np.where(X_gmm_baf[:,0] > 0.5)[0] - X_gmm_baf[indexes,:] = 1 - X_gmm_baf[indexes,:] + indexes = np.where(X_gmm_baf[:, 0] > 0.5)[0] + 
X_gmm_baf[indexes, :] = 1 - X_gmm_baf[indexes, :] X_gmm = X_gmm_baf assert not np.any(np.isnan(X_gmm)) # run GMM if random_state is None: gmm = GaussianMixture(n_components=n_states, max_iter=1).fit(X_gmm) else: - gmm = GaussianMixture(n_components=n_states, max_iter=1, random_state=random_state).fit(X_gmm) + gmm = GaussianMixture( + n_components=n_states, max_iter=1, random_state=random_state + ).fit(X_gmm) # turn gmm fitted parameters to HMM rdr_mean and p_mean parameters if ("m" in params) and ("p" in params): - gmm_rdr_mean = gmm.means_[:,:X.shape[2]] - gmm_p_mean = gmm.means_[:, X.shape[2]:] + gmm_rdr_mean = gmm.means_[:, : X.shape[2]] + gmm_p_mean = gmm.means_[:, X.shape[2] :] elif "m" in params: gmm_rdr_mean = gmm.means_ gmm_p_mean = None @@ -771,37 +935,97 @@ def initialization_gaussianhmm_by_gmm(n_states, X, params, random_state=None, mi return gmm_rdr_mean, gmm_p_mean -def pipeline_gaussian_baum_welch(X, lengths, n_states, log_sitewise_transmat, params="smp", t=1-1e-6, random_state=0, \ - shared_rdr_std=True, shared_p_std=True, init_rdr_mean=None, init_p_mean=None, init_rdr_std=None, init_p_std=None, \ - is_diag=True, max_iter=100, tol=1e-4): +def pipeline_gaussian_baum_welch( + X, + lengths, + n_states, + log_sitewise_transmat, + params="smp", + t=1 - 1e-6, + random_state=0, + shared_rdr_std=True, + shared_p_std=True, + init_rdr_mean=None, + init_p_mean=None, + init_rdr_std=None, + init_p_std=None, + is_diag=True, + max_iter=100, + tol=1e-4, +): # initialization n_spots = X.shape[2] - if ((init_rdr_mean is None) and ("m" in params)) or ((init_p_mean is None) and ("p" in params)): - tmp_rdr_mean, tmp_p_mean = initialization_gaussianhmm_by_gmm(n_states, X, params, random_state=random_state) + if ((init_rdr_mean is None) and ("m" in params)) or ( + (init_p_mean is None) and ("p" in params) + ): + tmp_rdr_mean, tmp_p_mean = initialization_gaussianhmm_by_gmm( + n_states, X, params, random_state=random_state + ) if (init_rdr_mean is None) and ("m" in params): init_rdr_mean = tmp_rdr_mean if (init_p_mean is None) and ("p" in params): init_p_mean = tmp_p_mean print(f"init_log_mu = {init_rdr_mean}") print(f"init_p_mean = {init_p_mean}") - + # fit HMM-NB-BetaBinom hmmmodel = hmm_gaussian_sitewise(params=params, t=t) - new_rdr_mean, new_rdr_std, new_p_mean, new_p_std, new_log_startprob, new_log_transmat = hmmmodel.run_baum_welch_nb_bb_sitewise(X, lengths, \ - n_states, log_sitewise_transmat, shared_rdr_std=shared_rdr_std, shared_p_std=shared_p_std, is_diag=is_diag, \ - init_rdr_mean=init_rdr_mean, init_p_mean=init_p_mean, init_rdr_std=init_rdr_std, init_p_std=init_p_std, max_iter=max_iter, tol=tol) - + ( + new_rdr_mean, + new_rdr_std, + new_p_mean, + new_p_std, + new_log_startprob, + new_log_transmat, + ) = hmmmodel.run_baum_welch_nb_bb_sitewise( + X, + lengths, + n_states, + log_sitewise_transmat, + shared_rdr_std=shared_rdr_std, + shared_p_std=shared_p_std, + is_diag=is_diag, + init_rdr_mean=init_rdr_mean, + init_p_mean=init_p_mean, + init_rdr_std=init_rdr_std, + init_p_std=init_p_std, + max_iter=max_iter, + tol=tol, + ) + # likelihood, posterior and prediction - log_emission = compute_emission_probability_gaussian(X, new_rdr_mean, new_rdr_std, new_p_mean, new_p_std) - log_alpha = forward_lattice_sitewise(lengths, new_log_transmat, new_log_startprob, log_emission, log_sitewise_transmat) - log_beta = backward_lattice_sitewise(lengths, new_log_transmat, new_log_startprob, log_emission, log_sitewise_transmat) + log_emission = compute_emission_probability_gaussian( + X, new_rdr_mean, 
new_rdr_std, new_p_mean, new_p_std + ) + log_alpha = forward_lattice_sitewise( + lengths, + new_log_transmat, + new_log_startprob, + log_emission, + log_sitewise_transmat, + ) + log_beta = backward_lattice_sitewise( + lengths, + new_log_transmat, + new_log_startprob, + log_emission, + log_sitewise_transmat, + ) log_gamma = compute_posterior_obs(log_alpha, log_beta) pred = np.argmax(log_gamma, axis=0) pred_cnv = pred % n_states - llf = np.sum(scipy.special.logsumexp(log_alpha[:,np.cumsum(lengths)-1], axis=0)) + llf = np.sum(scipy.special.logsumexp(log_alpha[:, np.cumsum(lengths) - 1], axis=0)) # save results - res = {"new_rdr_mean":new_rdr_mean, "new_rdr_std":new_rdr_std, "new_p_mean":new_p_mean, "new_p_std":new_p_std, \ - "new_log_startprob":new_log_startprob, "new_log_transmat":new_log_transmat, "log_gamma":log_gamma, "pred_cnv":pred_cnv, "llf":llf} + res = { + "new_rdr_mean": new_rdr_mean, + "new_rdr_std": new_rdr_std, + "new_p_mean": new_p_mean, + "new_p_std": new_p_std, + "new_log_startprob": new_log_startprob, + "new_log_transmat": new_log_transmat, + "log_gamma": log_gamma, + "pred_cnv": pred_cnv, + "llf": llf, + } return res - diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index e8e862f..ccc8f0c 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -26,7 +26,21 @@ # Pure clone ############################################################ -def hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, res, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): + +def hmrf_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): """ Choosing clones by Iterated Conditional Modes (Forward-backward version): for which the emission probability is given by the posterior probability of all HMM states at each bin. 
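# --- Illustrative aside (not part of the original patch) -----------------------
# A minimal sketch of one Iterated Conditional Modes update for a single spot i,
# mirroring the w_node / w_edge logic in the reassignment functions of this file.
# node_llf stands in for single_llf[i, :], and neighbors / edge_weights for the
# nonzero entries of adjacency_mat[i, :]; these names are hypothetical.
import numpy as np

def icm_update(node_llf, log_persample_weight, neighbors, edge_weights, assignment, spatial_weight):
    n_clones = node_llf.shape[0]
    w_node = node_llf + log_persample_weight   # emission term + per-sample clone weight
    w_edge = np.zeros(n_clones)
    for j, a_ij in zip(neighbors, edge_weights):
        if assignment[j] >= 0:                 # neighbors vote for their current clone
            w_edge[assignment[j]] += a_ij
    return int(np.argmax(w_node + spatial_weight * w_edge))

# Two neighbors currently in clone 2 pull spot i to clone 2 even though clone 1
# has the best node potential:
print(icm_update(np.array([-10.0, -9.0, -9.5]), np.zeros(3),
                 neighbors=[0, 1], edge_weights=[1.0, 1.0],
                 assignment=np.array([2, 2, -1]), spatial_weight=2.0))  # -> 2
# --------------------------------------------------------------------------------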
@@ -40,47 +54,103 @@ def hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_R n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] n_states = res["new_p_binom"].shape[0] - single_llf = np.zeros((N, n_clones)) # node potential + single_llf = np.zeros((N, n_clones)) # node potential new_assignment = copy.copy(prev_assignment) # posterior = np.zeros((N, n_clones)) for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] for c in range(n_clones): - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"][:,c:(c+1)], res["new_alphas"][:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"][:,c:(c+1)], res["new_taus"][:,c:(c+1)]) - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,i:(i+1)] > 0) / np.sum(single_base_nb_mean[:,i:(i+1)] > 0) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"][:, c : (c + 1)], + res["new_alphas"][:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"][:, c : (c + 1)], + res["new_taus"][:, c : (c + 1)], + ) + ) + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) + / np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + single_llf[i, c] = ratio_nonzeros * np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) else: - single_llf[i,c] = np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) - - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: if new_assignment[j] >= 0: - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + 
spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, res, pred, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): +def aggr_hmrf_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + pred, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): """ Choosing clones by Iterated Conditional Modes (Viterbi version): for which the emission probability of each spot is a single of HMM state sequence. @@ -97,40 +167,81 @@ def aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, re posterior = np.zeros((N, n_clones)) for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] # idx = np.append(idx, np.array([i])) for c in range(n_clones): - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"][:,c:(c+1)], res["new_alphas"][:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"][:,c:(c+1)], res["new_taus"][:,c:(c+1)]) - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,idx] > 0) / np.sum(single_base_nb_mean[:,idx] > 0) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"][:, c : (c + 1)], + res["new_alphas"][:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"][:, c : (c + 1)], + res["new_taus"][:, c : (c + 1)], + ) + ) + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, idx] > 0) + / np.sum(single_base_nb_mean[:, idx] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) + single_llf[i, c] = ratio_nonzeros * np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0]) else: - single_llf[i,c] = np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) - - w_node = single_llf[i,:] - w_node += 
log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0]) + + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: if new_assignment[j] >= 0: - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def hmrf_reassignment_posterior_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, res, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): +def hmrf_reassignment_posterior_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): """ Input format assumption: the RDR/BAF vector is shared across all clones <- using only BAF signals, or running for each initial clone """ @@ -144,39 +255,99 @@ def hmrf_reassignment_posterior_concatenate(single_X, single_base_nb_mean, singl posterior = np.zeros((N, n_clones)) for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"], res["new_alphas"], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"], res["new_taus"]) + idx = smooth_mat[i, :].nonzero()[1] + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"], + res["new_alphas"], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"], + res["new_taus"], + ) + ) for c in range(n_clones): - if np.sum(single_base_nb_mean[:,i:(i+1)] > 0) > 0 and np.sum(single_total_bb_RD[:,i:(i+1)] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,i:(i+1)] > 0) / np.sum(single_base_nb_mean[:,i:(i+1)] > 0) + if ( + np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) > 0 + and np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) + / np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) + ) # ratio_nonzeros = 1.0 
* np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + single_llf[i, c] = ratio_nonzeros * np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) else: - single_llf[i,c] = np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + for j in adjacency_mat[i, :].nonzero()[1]: + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def aggr_hmrf_reassignment_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, res, pred, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): +def aggr_hmrf_reassignment_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + pred, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): """ HMRF assign spots to tumor clones. 
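# --- Illustrative aside (not part of the original patch) -----------------------
# The "total log likelihood log P(X | Z) + log P(Z)" these functions report is,
# up to constants, the sum of each spot's node potential under its assigned clone
# plus spatial_weight times the number of agreeing neighbor pairs (with a
# symmetric adjacency matrix each pair is counted once per direction). A compact
# re-statement with toy inputs:
import numpy as np
import scipy.sparse

def hmrf_objective(single_llf, adjacency_mat, assignment, spatial_weight):
    N = single_llf.shape[0]
    total = np.sum(single_llf[np.arange(N), assignment])
    for i in range(N):
        nbrs = adjacency_mat[i, :].nonzero()[1]
        total += spatial_weight * np.sum(assignment[nbrs] == assignment[i])
    return total

adj = scipy.sparse.csr_matrix(np.array([[0, 1], [1, 0]]))
# two spots in the same clone: node terms (-1.0 - 1.5) plus 2 * spatial_weight
print(hmrf_objective(np.array([[-1.0, -2.0], [-1.5, -0.5]]), adj,
                     np.array([0, 0]), spatial_weight=2.0))  # -> 1.5
# --------------------------------------------------------------------------------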
@@ -230,43 +401,79 @@ def aggr_hmrf_reassignment_concatenate(single_X, single_base_nb_mean, single_tot posterior = np.zeros((N, n_clones)) for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] # idx = np.append(idx, np.array([i])) - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"], res["new_alphas"], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"], res["new_taus"]) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"], + res["new_alphas"], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"], + res["new_taus"], + ) + ) for c in range(n_clones): - this_pred = pred[(c*n_obs):(c*n_obs+n_obs)] - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,idx] > 0) / np.sum(single_base_nb_mean[:,idx] > 0) + this_pred = pred[(c * n_obs) : (c * n_obs + n_obs)] + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, idx] > 0) + / np.sum(single_base_nb_mean[:, idx] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum(tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) + single_llf[i, c] = ratio_nonzeros * np.sum( + tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) else: - single_llf[i,c] = np.sum(tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] # new_assignment[i] = np.argmax( w_node ) w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + for j in adjacency_mat[i, :].nonzero()[1]: + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, 
single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def merge_by_minspots(assignment, res, single_total_bb_RD, min_spots_thresholds=50, min_umicount_thresholds=0, single_tumor_prop=None, threshold=0.5): +def merge_by_minspots( + assignment, + res, + single_total_bb_RD, + min_spots_thresholds=50, + min_umicount_thresholds=0, + single_tumor_prop=None, + threshold=0.5, +): n_clones = len(np.unique(assignment)) if n_clones == 1: - merged_groups = [ [assignment[0]] ] + merged_groups = [[assignment[0]]] return merged_groups, res n_obs = int(len(res["pred_cnv"]) / n_clones) @@ -277,19 +484,44 @@ def merge_by_minspots(assignment, res, single_total_bb_RD, min_spots_thresholds= tmp_single_tumor_prop = single_tumor_prop unique_assignment = np.unique(new_assignment) # find entries in unique_assignment such that either min_spots_thresholds or min_umicount_thresholds is not satisfied - failed_clones = [ c for c in unique_assignment if (np.sum(new_assignment[tmp_single_tumor_prop > threshold] == c) < min_spots_thresholds) or \ - (np.sum(single_total_bb_RD[:, (new_assignment == c)&(tmp_single_tumor_prop > threshold)]) < min_umicount_thresholds) ] + failed_clones = [ + c + for c in unique_assignment + if ( + np.sum(new_assignment[tmp_single_tumor_prop > threshold] == c) + < min_spots_thresholds + ) + or ( + np.sum( + single_total_bb_RD[ + :, (new_assignment == c) & (tmp_single_tumor_prop > threshold) + ] + ) + < min_umicount_thresholds + ) + ] # find the remaining unique_assigment that satisfies both thresholds - successful_clones = [ c for c in unique_assignment if not c in failed_clones ] + successful_clones = [c for c in unique_assignment if not c in failed_clones] # initial merging groups: each successful clone is its own group merging_groups = [[i] for i in successful_clones] # for each failed clone, assign them to the closest successful clone if len(failed_clones) > 0: for c in failed_clones: - idx_max = np.argmax([np.sum(single_total_bb_RD[:, (new_assignment == c_prime)&(tmp_single_tumor_prop > threshold)]) for c_prime in successful_clones]) + idx_max = np.argmax( + [ + np.sum( + single_total_bb_RD[ + :, + (new_assignment == c_prime) + & (tmp_single_tumor_prop > threshold), + ] + ) + for c_prime in successful_clones + ] + ) merging_groups[idx_max].append(c) map_clone_id = {} - for i,x in enumerate(merging_groups): + for i, x in enumerate(merging_groups): for z in x: map_clone_id[z] = i new_assignment = np.array([map_clone_id[x] for x in new_assignment]) @@ -309,16 +541,55 @@ def merge_by_minspots(assignment, res, single_total_bb_RD, min_spots_thresholds= merged_res = copy.copy(res) merged_res["new_assignment"] = new_assignment merged_res["total_llf"] = np.NAN - merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) - merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) + merged_res["pred_cnv"] = np.concatenate( + [ + res["pred_cnv"][(c[0] * n_obs) : (c[0] * n_obs + n_obs)] + for c in merging_groups + ] + ) + merged_res["log_gamma"] = np.hstack( + [ + res["log_gamma"][:, (c[0] * n_obs) : (c[0] * n_obs + n_obs)] + for c in merging_groups + ] + ) return merging_groups, merged_res -def hmrf_pipeline(outdir, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states, \ - log_sitewise_transmat, coords=None, smooth_mat=None, adjacency_mat=None, sample_ids=None, max_iter_outer=5, nodepotential="max", \ - 
hmmclass=hmm_sitewise, params="stmp", t=1-1e-6, random_state=0, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None,\ - fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True, \ - is_diag=True, max_iter=100, tol=1e-4, unit_xsquared=9, unit_ysquared=3, spatial_weight=1.0): +def hmrf_pipeline( + outdir, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + n_states, + log_sitewise_transmat, + coords=None, + smooth_mat=None, + adjacency_mat=None, + sample_ids=None, + max_iter_outer=5, + nodepotential="max", + hmmclass=hmm_sitewise, + params="stmp", + t=1 - 1e-6, + random_state=0, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + fix_NB_dispersion=False, + shared_NB_dispersion=True, + fix_BB_dispersion=False, + shared_BB_dispersion=True, + is_diag=True, + max_iter=100, + tol=1e-4, + unit_xsquared=9, + unit_ysquared=3, + spatial_weight=1.0, +): n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) # spot adjacency matric @@ -331,14 +602,25 @@ def hmrf_pipeline(outdir, single_X, lengths, single_base_nb_mean, single_total_b else: unique_sample_ids = np.unique(sample_ids) n_samples = len(unique_sample_ids) - tmp_map_index = {unique_sample_ids[i]:i for i in range(len(unique_sample_ids))} - sample_ids = np.array([ tmp_map_index[x] for x in sample_ids]) + tmp_map_index = {unique_sample_ids[i]: i for i in range(len(unique_sample_ids))} + sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) # pseudobulk - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index + ) # initialize HMM parameters by GMM if (init_log_mu is None) or (init_p_binom is None): - init_log_mu, init_p_binom = initialization_by_gmm(n_states, X, base_nb_mean, total_bb_RD, params, random_state=random_state, in_log_space=False, only_minor=False) + init_log_mu, init_p_binom = initialization_by_gmm( + n_states, + X, + base_nb_mean, + total_bb_RD, + params, + random_state=random_state, + in_log_space=False, + only_minor=False, + ) # initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu @@ -352,32 +634,74 @@ def hmrf_pipeline(outdir, single_X, lengths, single_base_nb_mean, single_total_b last_alphas = init_alphas last_taus = init_taus last_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c # HMM for r in range(max_iter_outer): if not Path(f"{outdir}/round{r}_nstates{n_states}_{params}.npz").exists(): ##### initialize with the parameters of last iteration ##### - res = pipeline_baum_welch(None, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, \ - hmmclass=hmmclass, params=params, t=t, random_state=random_state, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ - is_diag=is_diag, init_log_mu=last_log_mu, init_p_binom=last_p_binom, init_alphas=last_alphas, init_taus=last_taus, max_iter=max_iter, tol=tol) + res = pipeline_baum_welch( + None, + X, + lengths, + n_states, + base_nb_mean, + total_bb_RD, + 
log_sitewise_transmat, + hmmclass=hmmclass, + params=params, + t=t, + random_state=random_state, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=is_diag, + init_log_mu=last_log_mu, + init_p_binom=last_p_binom, + init_alphas=last_alphas, + init_taus=last_taus, + max_iter=max_iter, + tol=tol, + ) pred = np.argmax(res["log_gamma"], axis=0) # clone assignmment if nodepotential == "max": - new_assignment, single_llf, total_llf = aggr_hmrf_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, res, pred, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = aggr_hmrf_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + pred, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) elif nodepotential == "weighted_sum": - new_assignment, single_llf, total_llf = hmrf_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, res, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = hmrf_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) else: raise Exception("Unknown mode for nodepotential!") # handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) - re_indexing = {c:i for i,c in enumerate(remaining_clones)} + re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) # res["prev_assignment"] = last_assignment @@ -391,18 +715,49 @@ def hmrf_pipeline(outdir, single_X, lengths, single_base_nb_mean, single_total_b res = np.load(f"{outdir}/round{r}_nstates{n_states}_{params}.npz") # regroup to pseudobulk - clone_index = [np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"]))] - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index) + clone_index = [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ] + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index + ) # update last parameter if "mp" in params: - print("outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu-res["new_log_mu"])), np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) + print( + "outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( + r, + res["total_llf"], + np.mean(np.abs(last_log_mu - res["new_log_mu"])), + np.mean(np.abs(last_p_binom - res["new_p_binom"])), + ) + ) elif "m" in params: - print("outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu-res["new_log_mu"])) )) + print( + "outer iteration {}: total_llf = {}, difference between NB 
parameters = {}".format( + r, + res["total_llf"], + np.mean(np.abs(last_log_mu - res["new_log_mu"])), + ) + ) elif "p" in params: - print("outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) - print("outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) )) - if adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1: + print( + "outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( + r, + res["total_llf"], + np.mean(np.abs(last_p_binom - res["new_p_binom"])), + ) + ) + print( + "outer iteration {}: ARI between assignment = {}".format( + r, adjusted_rand_score(last_assignment, res["new_assignment"]) + ) + ) + if ( + adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 + or len(np.unique(res["new_assignment"])) == 1 + ): break last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] @@ -412,16 +767,52 @@ def hmrf_pipeline(outdir, single_X, lengths, single_base_nb_mean, single_total_b log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] - this_persample_weight = np.bincount(res["new_assignment"][index], minlength=X.shape[2]) / len(index) - log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) - log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) - - -def hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states, \ - log_sitewise_transmat, coords=None, smooth_mat=None, adjacency_mat=None, sample_ids=None, max_iter_outer=5, nodepotential="max", hmmclass=hmm_sitewise, \ - params="stmp", t=1-1e-6, random_state=0, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None,\ - fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True, \ - is_diag=True, max_iter=100, tol=1e-4, unit_xsquared=9, unit_ysquared=3, spatial_weight=1.0): + this_persample_weight = np.bincount( + res["new_assignment"][index], minlength=X.shape[2] + ) / len(index) + log_persample_weights[:, sidx] = np.where( + this_persample_weight > 0, np.log(this_persample_weight), -50 + ) + log_persample_weights[:, sidx] = log_persample_weights[ + :, sidx + ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) + + +def hmrf_concatenate_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + n_states, + log_sitewise_transmat, + coords=None, + smooth_mat=None, + adjacency_mat=None, + sample_ids=None, + max_iter_outer=5, + nodepotential="max", + hmmclass=hmm_sitewise, + params="stmp", + t=1 - 1e-6, + random_state=0, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + fix_NB_dispersion=False, + shared_NB_dispersion=True, + fix_BB_dispersion=False, + shared_BB_dispersion=True, + is_diag=True, + max_iter=100, + tol=1e-4, + unit_xsquared=9, + unit_ysquared=3, + spatial_weight=1.0, +): n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) # checking input @@ -434,15 +825,27 @@ def hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_ else: unique_sample_ids = 
np.unique(sample_ids) n_samples = len(unique_sample_ids) - tmp_map_index = {unique_sample_ids[i]:i for i in range(len(unique_sample_ids))} - sample_ids = np.array([ tmp_map_index[x] for x in sample_ids]) + tmp_map_index = {unique_sample_ids[i]: i for i in range(len(unique_sample_ids))} + sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) # pseudobulk - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index + ) # initialize HMM parameters by GMM if (init_log_mu is None) or (init_p_binom is None): - init_log_mu, init_p_binom = initialization_by_gmm(n_states, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), params, random_state=random_state, in_log_space=False, only_minor=False) + init_log_mu, init_p_binom = initialization_by_gmm( + n_states, + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + params, + random_state=random_state, + in_log_space=False, + only_minor=False, + ) # initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu @@ -456,51 +859,112 @@ def hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_ last_alphas = init_alphas last_taus = init_taus last_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c # HMM for r in range(max_iter_outer): # assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. 
When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should contain two keys: "num_iterations" and f"round_-1_assignment" for clone initialization - allres = np.load(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True) + allres = np.load( + f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True + ) allres = dict(allres) if allres["num_iterations"] > r: - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} - else: - res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), n_states, \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), \ - hmmclass=hmmclass, params=params, t=t, random_state=random_state, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ - is_diag=is_diag, init_log_mu=last_log_mu, init_p_binom=last_p_binom, init_alphas=last_alphas, init_taus=last_taus, max_iter=max_iter, tol=tol) + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } + else: + res = pipeline_baum_welch( + None, + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + np.tile(lengths, X.shape[2]), + n_states, + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + np.tile(log_sitewise_transmat, X.shape[2]), + hmmclass=hmmclass, + params=params, + t=t, + random_state=random_state, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=is_diag, + init_log_mu=last_log_mu, + init_p_binom=last_p_binom, + init_alphas=last_alphas, + init_taus=last_taus, + max_iter=max_iter, + tol=tol, + ) pred = np.argmax(res["log_gamma"], axis=0) # HMRF clone assignmment if nodepotential == "max": - new_assignment, single_llf, total_llf = aggr_hmrf_reassignment_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, res, pred, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = ( + aggr_hmrf_reassignment_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + pred, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + 
log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) + ) elif nodepotential == "weighted_sum": - new_assignment, single_llf, total_llf = hmrf_reassignment_posterior_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, res, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = ( + hmrf_reassignment_posterior_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) + ) else: raise Exception("Unknown mode for nodepotential!") # handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) - re_indexing = {c:i for i,c in enumerate(remaining_clones)} + re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) - concat_idx = np.concatenate([ np.arange(c*n_obs, c*n_obs+n_obs) for c in remaining_clones ]) - res["log_gamma"] = res["log_gamma"][:,concat_idx] + concat_idx = np.concatenate( + [np.arange(c * n_obs, c * n_obs + n_obs) for c in remaining_clones] + ) + res["log_gamma"] = res["log_gamma"][:, concat_idx] res["pred_cnv"] = res["pred_cnv"][concat_idx] # res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf # append to allres - for k,v in res.items(): + for k, v in res.items(): if k == "prev_assignment": allres[f"round{r-1}_assignment"] = v elif k == "new_assignment": @@ -511,18 +975,44 @@ def hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_ np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) # # regroup to pseudobulk - clone_index = [np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"]))] - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index) + clone_index = [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ] + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index + ) # if "mp" in params: - print("outer iteration {}: difference between parameters = {}, {}".format( r, np.mean(np.abs(last_log_mu-res["new_log_mu"])), np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) + print( + "outer iteration {}: difference between parameters = {}, {}".format( + r, + np.mean(np.abs(last_log_mu - res["new_log_mu"])), + np.mean(np.abs(last_p_binom - res["new_p_binom"])), + ) + ) elif "m" in params: - print("outer iteration {}: difference between NB parameters = {}".format( r, np.mean(np.abs(last_log_mu-res["new_log_mu"])) )) + print( + "outer iteration {}: difference between NB parameters = {}".format( + r, np.mean(np.abs(last_log_mu - res["new_log_mu"])) + ) + ) elif "p" in params: - print("outer iteration {}: difference between BetaBinom parameters = {}".format( r, np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) - print("outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) )) + print( + "outer iteration {}: difference between BetaBinom parameters = {}".format( + r, np.mean(np.abs(last_p_binom - 
res["new_p_binom"])) + ) + ) + print( + "outer iteration {}: ARI between assignment = {}".format( + r, adjusted_rand_score(last_assignment, res["new_assignment"]) + ) + ) # if np.all( last_assignment == res["new_assignment"] ): - if adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1: + if ( + adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 + or len(np.unique(res["new_assignment"])) == 1 + ): break last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] @@ -532,17 +1022,38 @@ def hmrf_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_ log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] - this_persample_weight = np.bincount(res["new_assignment"][index], minlength=X.shape[2]) / len(index) - log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) - log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) - + this_persample_weight = np.bincount( + res["new_assignment"][index], minlength=X.shape[2] + ) / len(index) + log_persample_weights[:, sidx] = np.where( + this_persample_weight > 0, np.log(this_persample_weight), -50 + ) + log_persample_weights[:, sidx] = log_persample_weights[ + :, sidx + ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) ############################################################ # Normal-tumor clone mixture ############################################################ -def aggr_hmrfmix_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, pred, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): + +def aggr_hmrfmix_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + pred, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] @@ -555,46 +1066,98 @@ def aggr_hmrfmix_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, posterior = np.zeros((N, n_clones)) # for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] for c in range(n_clones): - if np.sum(single_base_nb_mean[:,idx] > 0) > 0: - mu = np.exp(res["new_log_mu"][(pred%n_states),:]) / np.sum(np.exp(res["new_log_mu"][(pred%n_states),:]) * lambd) - weighted_tp = (np.mean(single_tumor_prop[idx]) * mu) / (np.mean(single_tumor_prop[idx]) * mu + 1 - np.mean(single_tumor_prop[idx])) + if np.sum(single_base_nb_mean[:, idx] > 0) > 0: + mu = np.exp(res["new_log_mu"][(pred % n_states), :]) / np.sum( + np.exp(res["new_log_mu"][(pred % n_states), :]) * lambd + ) + weighted_tp = (np.mean(single_tumor_prop[idx]) * mu) / ( + np.mean(single_tumor_prop[idx]) * mu + + 1 + - np.mean(single_tumor_prop[idx]) + ) else: - weighted_tp = np.repeat(np.mean(single_tumor_prop[idx]), single_X.shape[0]) - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"][:,c:(c+1)], 
res["new_alphas"][:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"][:,c:(c+1)], res["new_taus"][:,c:(c+1)], np.ones((n_obs,1)) * np.mean(single_tumor_prop[idx]), weighted_tp.reshape(-1,1) ) - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,idx] > 0) / np.sum(single_base_nb_mean[:,idx] > 0) + weighted_tp = np.repeat( + np.mean(single_tumor_prop[idx]), single_X.shape[0] + ) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"][:, c : (c + 1)], + res["new_alphas"][:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"][:, c : (c + 1)], + res["new_taus"][:, c : (c + 1)], + np.ones((n_obs, 1)) * np.mean(single_tumor_prop[idx]), + weighted_tp.reshape(-1, 1), + ) + ) + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, idx] > 0) + / np.sum(single_base_nb_mean[:, idx] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) + single_llf[i, c] = ratio_nonzeros * np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0]) else: - single_llf[i,c] = np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) + single_llf[i, c] = np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0]) # - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: if new_assignment[j] >= 0: # w_edge[new_assignment[j]] += 1 - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def hmrfmix_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, smooth_mat, adjacency_mat, prev_assignment, sample_ids, 
log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): +def hmrfmix_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] @@ -607,53 +1170,136 @@ def hmrfmix_reassignment_posterior(single_X, single_base_nb_mean, single_total_b posterior = np.zeros((N, n_clones)) for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] for c in range(n_clones): if np.sum(single_base_nb_mean) > 0: - this_pred_cnv = res["pred_cnv"][:,c] - logmu_shift = np.array( scipy.special.logsumexp(res["new_log_mu"][this_pred_cnv,c] + np.log(lambd), axis=0) ) - kwargs = {"logmu_shift":logmu_shift.reshape(1,1), "sample_length":np.array([n_obs])} + this_pred_cnv = res["pred_cnv"][:, c] + logmu_shift = np.array( + scipy.special.logsumexp( + res["new_log_mu"][this_pred_cnv, c] + np.log(lambd), axis=0 + ) + ) + kwargs = { + "logmu_shift": logmu_shift.reshape(1, 1), + "sample_length": np.array([n_obs]), + } else: kwargs = {} - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"][:,c:(c+1)], res["new_alphas"][:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"][:,c:(c+1)], res["new_taus"][:,c:(c+1)], np.ones((n_obs,1)) * np.mean(single_tumor_prop[idx]), **kwargs ) - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,i:(i+1)] > 0) / np.sum(single_base_nb_mean[:,i:(i+1)] > 0) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"][:, c : (c + 1)], + res["new_alphas"][:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"][:, c : (c + 1)], + res["new_taus"][:, c : (c + 1)], + np.ones((n_obs, 1)) * np.mean(single_tumor_prop[idx]), + **kwargs, + ) + ) + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) + / np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + single_llf[i, c] = ratio_nonzeros * np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) else: - single_llf[i,c] = np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + \ - np.sum( 
scipy.special.logsumexp(tmp_log_emission_baf[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) - - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: if new_assignment[j] >= 0: # w_edge[new_assignment[j]] += 1 - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \ - coords=None, smooth_mat=None, adjacency_mat=None, sample_ids=None, max_iter_outer=5, nodepotential="max", hmmclass=hmm_sitewise, params="stmp", t=1-1e-6, random_state=0, \ - init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None,\ - fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True, \ - is_diag=True, max_iter=100, tol=1e-4, unit_xsquared=9, unit_ysquared=3, spatial_weight=1.0/6, tumorprop_threshold=0.5): +def hmrfmix_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + initial_clone_index, + n_states, + log_sitewise_transmat, + coords=None, + smooth_mat=None, + adjacency_mat=None, + sample_ids=None, + max_iter_outer=5, + nodepotential="max", + hmmclass=hmm_sitewise, + params="stmp", + t=1 - 1e-6, + random_state=0, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + fix_NB_dispersion=False, + shared_NB_dispersion=True, + fix_BB_dispersion=False, + shared_BB_dispersion=True, + is_diag=True, + max_iter=100, + tol=1e-4, + unit_xsquared=9, + unit_ysquared=3, + spatial_weight=1.0 / 6, + tumorprop_threshold=0.5, +): n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) # spot adjacency matric @@ -666,15 +1312,32 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin else: unique_sample_ids = np.unique(sample_ids) n_samples = len(unique_sample_ids) - tmp_map_index = {unique_sample_ids[i]:i for i in range(len(unique_sample_ids))} - sample_ids = np.array([ tmp_map_index[x] for x in sample_ids]) + tmp_map_index = {unique_sample_ids[i]: i for i in 
range(len(unique_sample_ids))} + sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) # pseudobulk - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=tumorprop_threshold) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + single_tumor_prop, + threshold=tumorprop_threshold, + ) # initialize HMM parameters by GMM if (init_log_mu is None) or (init_p_binom is None): - init_log_mu, init_p_binom = initialization_by_gmm(n_states, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), params, random_state=random_state, in_log_space=False, only_minor=False) + init_log_mu, init_p_binom = initialization_by_gmm( + n_states, + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + params, + random_state=random_state, + in_log_space=False, + only_minor=False, + ) # initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu @@ -688,27 +1351,69 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin last_alphas = init_alphas last_taus = init_taus last_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c n_clones = len(initial_clone_index) # HMM for r in range(max_iter_outer): - allres = np.load(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True) + allres = np.load( + f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True + ) allres = dict(allres) if allres["num_iterations"] > r: - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } else: - res = {"new_log_mu":[], "new_alphas":[], "new_p_binom":[], "new_taus":[], "new_log_startprob":[], "new_log_transmat":[], "log_gamma":[], "pred_cnv":[], "llf":[]} + res = { + "new_log_mu": [], + "new_alphas": [], + "new_p_binom": [], + "new_taus": [], + "new_log_startprob": [], + "new_log_transmat": [], + 
"log_gamma": [], + "pred_cnv": [], + "llf": [], + } for c in range(n_clones): - tmpres = pipeline_baum_welch(None, X[:,:,c:(c+1)], lengths, n_states, base_nb_mean[:,c:(c+1)], total_bb_RD[:,c:(c+1)], log_sitewise_transmat, np.repeat(tumor_prop[c], X.shape[0]).reshape(-1,1), \ - hmmclass=hmmclass, params=params, t=t, \ - random_state=random_state, fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ - is_diag=is_diag, init_log_mu=last_log_mu[:,c:(c+1)], init_p_binom=last_p_binom[:,c:(c+1)], init_alphas=last_alphas[:,c:(c+1)], init_taus=last_taus[:,c:(c+1)], max_iter=max_iter, tol=tol) + tmpres = pipeline_baum_welch( + None, + X[:, :, c : (c + 1)], + lengths, + n_states, + base_nb_mean[:, c : (c + 1)], + total_bb_RD[:, c : (c + 1)], + log_sitewise_transmat, + np.repeat(tumor_prop[c], X.shape[0]).reshape(-1, 1), + hmmclass=hmmclass, + params=params, + t=t, + random_state=random_state, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=is_diag, + init_log_mu=last_log_mu[:, c : (c + 1)], + init_p_binom=last_p_binom[:, c : (c + 1)], + init_alphas=last_alphas[:, c : (c + 1)], + init_taus=last_taus[:, c : (c + 1)], + max_iter=max_iter, + tol=tol, + ) pred = np.argmax(tmpres["log_gamma"], axis=0) for k in res.keys(): res[k] = [res[k], tmpres[k]] @@ -723,18 +1428,43 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin # clone assignmment if nodepotential == "max": - new_assignment, single_llf, total_llf = aggr_hmrfmix_reassignment(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, pred, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = aggr_hmrfmix_reassignment( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + pred, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) elif nodepotential == "weighted_sum": - new_assignment, single_llf, total_llf = hmrfmix_reassignment_posterior(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = hmrfmix_reassignment_posterior( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) else: raise Exception("Unknown mode for nodepotential!") # handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) - re_indexing = {c:i for i,c in enumerate(remaining_clones)} + re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) # res["prev_assignment"] = last_assignment @@ -742,7 +1472,7 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin res["total_llf"] = total_llf # append to allres - for k,v in res.items(): + for k, v in res.items(): 
if k == "prev_assignment": allres[f"round{r-1}_assignment"] = v elif k == "new_assignment": @@ -753,19 +1483,55 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) # regroup to pseudobulk - clone_index = [np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"]))] - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, clone_index, single_tumor_prop, threshold=tumorprop_threshold) + clone_index = [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ] + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + threshold=tumorprop_threshold, + ) # update last parameter if "mp" in params: - print("outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu-res["new_log_mu"])), np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) + print( + "outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( + r, + res["total_llf"], + np.mean(np.abs(last_log_mu - res["new_log_mu"])), + np.mean(np.abs(last_p_binom - res["new_p_binom"])), + ) + ) elif "m" in params: - print("outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu-res["new_log_mu"])) )) + print( + "outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( + r, + res["total_llf"], + np.mean(np.abs(last_log_mu - res["new_log_mu"])), + ) + ) elif "p" in params: - print("outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) - print("outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) )) + print( + "outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( + r, + res["total_llf"], + np.mean(np.abs(last_p_binom - res["new_p_binom"])), + ) + ) + print( + "outer iteration {}: ARI between assignment = {}".format( + r, adjusted_rand_score(last_assignment, res["new_assignment"]) + ) + ) # if np.all( last_assignment == res["new_assignment"] ): - if adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1: + if ( + adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 + or len(np.unique(res["new_assignment"])) == 1 + ): break last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] @@ -775,12 +1541,32 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] - this_persample_weight = np.bincount(res["new_assignment"][index], minlength=X.shape[2]) / len(index) - log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) - log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) - - -def hmrfmix_reassignment_posterior_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, smooth_mat, adjacency_mat, 
prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): + this_persample_weight = np.bincount( + res["new_assignment"][index], minlength=X.shape[2] + ) / len(index) + log_persample_weights[:, sidx] = np.where( + this_persample_weight > 0, np.log(this_persample_weight), -50 + ) + log_persample_weights[:, sidx] = log_persample_weights[ + :, sidx + ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) + + +def hmrfmix_reassignment_posterior_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = np.max(prev_assignment) + 1 @@ -792,52 +1578,128 @@ def hmrfmix_reassignment_posterior_concatenate(single_X, single_base_nb_mean, si if np.sum(single_base_nb_mean) > 0: logmu_shift = [] for c in range(n_clones): - this_pred_cnv = np.argmax(res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0)%n_states - logmu_shift.append( scipy.special.logsumexp(res["new_log_mu"][this_pred_cnv,:] + np.log(lambd).reshape(-1,1), axis=0) ) + this_pred_cnv = ( + np.argmax( + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], axis=0 + ) + % n_states + ) + logmu_shift.append( + scipy.special.logsumexp( + res["new_log_mu"][this_pred_cnv, :] + np.log(lambd).reshape(-1, 1), + axis=0, + ) + ) logmu_shift = np.vstack(logmu_shift) - kwargs = {"logmu_shift":logmu_shift, "sample_length":np.ones(n_clones,dtype=int) * n_obs} + kwargs = { + "logmu_shift": logmu_shift, + "sample_length": np.ones(n_clones, dtype=int) * n_obs, + } else: kwargs = {} # posterior = np.zeros((N, n_clones)) for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] for c in range(n_clones): - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"], res["new_alphas"], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"], res["new_taus"], np.ones((n_obs,1)) * np.mean(single_tumor_prop[idx]), **kwargs ) - - if np.sum(single_base_nb_mean[:,i:(i+1)] > 0) > 0 and np.sum(single_total_bb_RD[:,i:(i+1)] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,i:(i+1)] > 0) / np.sum(single_base_nb_mean[:,i:(i+1)] > 0) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"], + res["new_alphas"], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"], + res["new_taus"], + np.ones((n_obs, 1)) * np.mean(single_tumor_prop[idx]), + **kwargs, + ) + ) + + if ( + np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) > 0 + and np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) + / np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:, :, 0] + 
res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + single_llf[i, c] = ratio_nonzeros * np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) else: - single_llf[i,c] = np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + \ - np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + + res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)], + axis=0, + ) + ) + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: # w_edge[new_assignment[j]] += 1 - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def aggr_hmrfmix_reassignment_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, pred, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False): +def aggr_hmrfmix_reassignment_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + pred, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, + return_posterior=False, +): N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = int(len(pred) / n_obs) @@ -850,50 +1712,120 @@ def aggr_hmrfmix_reassignment_concatenate(single_X, single_base_nb_mean, single_ posterior = np.zeros((N, n_clones)) # for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] for c in range(n_clones): - this_pred = pred[(c*n_obs):(c*n_obs+n_obs)] - if np.sum(single_base_nb_mean[:,idx] > 0) > 0: - mu = np.exp(res["new_log_mu"][(this_pred%n_states),:]) / 
np.sum(np.exp(res["new_log_mu"][(this_pred%n_states),:]) * lambd) - weighted_tp = (np.mean(single_tumor_prop[idx]) * mu) / (np.mean(single_tumor_prop[idx]) * mu + 1 - np.mean(single_tumor_prop[idx])) + this_pred = pred[(c * n_obs) : (c * n_obs + n_obs)] + if np.sum(single_base_nb_mean[:, idx] > 0) > 0: + mu = np.exp(res["new_log_mu"][(this_pred % n_states), :]) / np.sum( + np.exp(res["new_log_mu"][(this_pred % n_states), :]) * lambd + ) + weighted_tp = (np.mean(single_tumor_prop[idx]) * mu) / ( + np.mean(single_tumor_prop[idx]) * mu + + 1 + - np.mean(single_tumor_prop[idx]) + ) else: - weighted_tp = np.repeat(np.mean(single_tumor_prop[idx]), single_X.shape[0]) - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"], res["new_alphas"], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"], res["new_taus"], np.ones((n_obs,1)) * np.mean(single_tumor_prop[idx]), weighted_tp.reshape(-1,1) ) - - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,idx] > 0) / np.sum(single_base_nb_mean[:,idx] > 0) + weighted_tp = np.repeat( + np.mean(single_tumor_prop[idx]), single_X.shape[0] + ) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"], + res["new_alphas"], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"], + res["new_taus"], + np.ones((n_obs, 1)) * np.mean(single_tumor_prop[idx]), + weighted_tp.reshape(-1, 1), + ) + ) + + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, idx] > 0) + / np.sum(single_base_nb_mean[:, idx] > 0) + ) # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum(tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) + single_llf[i, c] = ratio_nonzeros * np.sum( + tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) else: - single_llf[i,c] = np.sum(tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) - w_node = single_llf[i,:] - w_node += log_persample_weights[:,sample_ids[i]] + single_llf[i, c] = np.sum( + tmp_log_emission_rdr[this_pred, np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[this_pred, np.arange(n_obs), 0]) + w_node = single_llf[i, :] + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: # w_edge[new_assignment[j]] += 1 - w_edge[new_assignment[j]] += adjacency_mat[i,j] - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) + w_edge[new_assignment[j]] += adjacency_mat[i, j] + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) # - posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + posterior[i, :] = np.exp( + w_node + + 
spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) # # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) if return_posterior: return new_assignment, single_llf, total_llf, posterior else: return new_assignment, single_llf, total_llf -def hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \ - coords=None, smooth_mat=None, adjacency_mat=None, sample_ids=None, max_iter_outer=5, nodepotential="max", hmmclass=hmm_sitewise, params="stmp", t=1-1e-6, random_state=0, \ - init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None,\ - fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True, \ - is_diag=True, max_iter=100, tol=1e-4, unit_xsquared=9, unit_ysquared=3, spatial_weight=1.0/6, tumorprop_threshold=0.5): +def hmrfmix_concatenate_pipeline( + outdir, + prefix, + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + initial_clone_index, + n_states, + log_sitewise_transmat, + coords=None, + smooth_mat=None, + adjacency_mat=None, + sample_ids=None, + max_iter_outer=5, + nodepotential="max", + hmmclass=hmm_sitewise, + params="stmp", + t=1 - 1e-6, + random_state=0, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + fix_NB_dispersion=False, + shared_NB_dispersion=True, + fix_BB_dispersion=False, + shared_BB_dispersion=True, + is_diag=True, + max_iter=100, + tol=1e-4, + unit_xsquared=9, + unit_ysquared=3, + spatial_weight=1.0 / 6, + tumorprop_threshold=0.5, +): n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) # spot adjacency matric @@ -906,17 +1838,34 @@ def hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_ else: unique_sample_ids = np.unique(sample_ids) n_samples = len(unique_sample_ids) - tmp_map_index = {unique_sample_ids[i]:i for i in range(len(unique_sample_ids))} - sample_ids = np.array([ tmp_map_index[x] for x in sample_ids]) + tmp_map_index = {unique_sample_ids[i]: i for i in range(len(unique_sample_ids))} + sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * (-np.log(n_clones)) # pseudobulk - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=tumorprop_threshold) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + single_tumor_prop, + threshold=tumorprop_threshold, + ) # baseline proportion of UMI counts lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) # initialize HMM parameters by GMM if (init_log_mu is None) or (init_p_binom is None): - init_log_mu, init_p_binom = initialization_by_gmm(n_states, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), params, random_state=random_state, in_log_space=False, 
only_minor=False) + init_log_mu, init_p_binom = initialization_by_gmm( + n_states, + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + params, + random_state=random_state, + in_log_space=False, + only_minor=False, + ) # initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu @@ -930,56 +1879,120 @@ def hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_ last_alphas = init_alphas last_taus = init_taus last_assignment = np.zeros(single_X.shape[2], dtype=int) - for c,idx in enumerate(initial_clone_index): + for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c # HMM for r in range(max_iter_outer): # assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should contain two keys: "num_iterations" and f"round_-1_assignment" for clone initialization - allres = np.load(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True) + allres = np.load( + f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True + ) allres = dict(allres) if allres["num_iterations"] > r: - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } else: - sample_length = np.ones(X.shape[2],dtype=int) * X.shape[0] - remain_kwargs = {"sample_length":sample_length, "lambd":lambd} + sample_length = np.ones(X.shape[2], dtype=int) * X.shape[0] + remain_kwargs = {"sample_length": sample_length, "lambd": lambd} if f"round{r-1}_log_gamma" in allres: remain_kwargs["log_gamma"] = allres[f"round{r-1}_log_gamma"] - res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), n_states, \ - # base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), tumor_prop, \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), np.repeat(tumor_prop, X.shape[0]).reshape(-1,1), \ - hmmclass=hmmclass, params=params, t=t, random_state=random_state, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ - 
is_diag=is_diag, init_log_mu=last_log_mu, init_p_binom=last_p_binom, init_alphas=last_alphas, init_taus=last_taus, max_iter=max_iter, tol=tol, **remain_kwargs) + res = pipeline_baum_welch( + None, + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + np.tile(lengths, X.shape[2]), + n_states, # base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), tumor_prop, \ + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + np.tile(log_sitewise_transmat, X.shape[2]), + np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1), + hmmclass=hmmclass, + params=params, + t=t, + random_state=random_state, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=is_diag, + init_log_mu=last_log_mu, + init_p_binom=last_p_binom, + init_alphas=last_alphas, + init_taus=last_taus, + max_iter=max_iter, + tol=tol, + **remain_kwargs, + ) pred = np.argmax(res["log_gamma"], axis=0) # clone assignmment if nodepotential == "max": - new_assignment, single_llf, total_llf = aggr_hmrfmix_reassignment_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, pred, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = ( + aggr_hmrfmix_reassignment_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + pred, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) + ) elif nodepotential == "weighted_sum": - new_assignment, single_llf, total_llf = hmrfmix_reassignment_posterior_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, \ - smooth_mat, adjacency_mat, last_assignment, sample_ids, log_persample_weights, spatial_weight=spatial_weight, hmmclass=hmmclass) + new_assignment, single_llf, total_llf = ( + hmrfmix_reassignment_posterior_concatenate( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + res, + smooth_mat, + adjacency_mat, + last_assignment, + sample_ids, + log_persample_weights, + spatial_weight=spatial_weight, + hmmclass=hmmclass, + ) + ) else: raise Exception("Unknown mode for nodepotential!") # handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) - re_indexing = {c:i for i,c in enumerate(remaining_clones)} + re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) - concat_idx = np.concatenate([ np.arange(c*n_obs, c*n_obs+n_obs) for c in remaining_clones ]) - res["log_gamma"] = res["log_gamma"][:,concat_idx] + concat_idx = np.concatenate( + [np.arange(c * n_obs, c * n_obs + n_obs) for c in remaining_clones] + ) + res["log_gamma"] = res["log_gamma"][:, concat_idx] res["pred_cnv"] = res["pred_cnv"][concat_idx] # add to results res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf # append to allres - for k,v in res.items(): + for k, v in res.items(): if k == "prev_assignment": allres[f"round{r-1}_assignment"] = v elif k == "new_assignment": @@ -990,18 
+2003,49 @@ def hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_ np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) # # regroup to pseudobulk - clone_index = [np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"]))] - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, clone_index, single_tumor_prop, threshold=tumorprop_threshold) + clone_index = [ + np.where(res["new_assignment"] == c)[0] + for c in np.sort(np.unique(res["new_assignment"])) + ] + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + threshold=tumorprop_threshold, + ) # if "mp" in params: - print("outer iteration {}: difference between parameters = {}, {}".format( r, np.mean(np.abs(last_log_mu-res["new_log_mu"])), np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) + print( + "outer iteration {}: difference between parameters = {}, {}".format( + r, + np.mean(np.abs(last_log_mu - res["new_log_mu"])), + np.mean(np.abs(last_p_binom - res["new_p_binom"])), + ) + ) elif "m" in params: - print("outer iteration {}: difference between NB parameters = {}".format( r, np.mean(np.abs(last_log_mu-res["new_log_mu"])) )) + print( + "outer iteration {}: difference between NB parameters = {}".format( + r, np.mean(np.abs(last_log_mu - res["new_log_mu"])) + ) + ) elif "p" in params: - print("outer iteration {}: difference between BetaBinom parameters = {}".format( r, np.mean(np.abs(last_p_binom-res["new_p_binom"])) )) - print("outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) )) + print( + "outer iteration {}: difference between BetaBinom parameters = {}".format( + r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) + ) + ) + print( + "outer iteration {}: ARI between assignment = {}".format( + r, adjusted_rand_score(last_assignment, res["new_assignment"]) + ) + ) # if np.all( last_assignment == res["new_assignment"] ): - if adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1: + if ( + adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 + or len(np.unique(res["new_assignment"])) == 1 + ): break last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] @@ -1011,15 +2055,37 @@ def hmrfmix_concatenate_pipeline(outdir, prefix, single_X, lengths, single_base_ log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] - this_persample_weight = np.bincount(res["new_assignment"][index], minlength=X.shape[2]) / len(index) - log_persample_weights[:, sidx] = np.where(this_persample_weight > 0, np.log(this_persample_weight), -50) - log_persample_weights[:, sidx] = log_persample_weights[:, sidx] - scipy.special.logsumexp(log_persample_weights[:, sidx]) + this_persample_weight = np.bincount( + res["new_assignment"][index], minlength=X.shape[2] + ) / len(index) + log_persample_weights[:, sidx] = np.where( + this_persample_weight > 0, np.log(this_persample_weight), -50 + ) + log_persample_weights[:, sidx] = log_persample_weights[ + :, sidx + ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) ############################################################ # Final posterior using integer copy numbers 
############################################################ -def clonelabel_posterior_withinteger(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, state_cnv, res, pred, smooth_mat, adjacency_mat, prev_assignment, sample_ids, base_nb_mean, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise): +def clonelabel_posterior_withinteger( + single_X, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + state_cnv, + res, + pred, + smooth_mat, + adjacency_mat, + prev_assignment, + sample_ids, + base_nb_mean, + log_persample_weights, + spatial_weight, + hmmclass=hmm_sitewise, +): """ single_X : array, (n_obs, 2, n_spots) @@ -1046,74 +2112,156 @@ def clonelabel_posterior_withinteger(single_X, single_base_nb_mean, single_total N = single_X.shape[2] n_obs = single_X.shape[0] # clone IDs - tmp_clone_ids = np.array([x[5:].split(" ")[0] for x in state_cnv.columns if x[:5] == "clone"]) - clone_ids = np.array([x for i,x in enumerate(tmp_clone_ids) if i == 0 or x != tmp_clone_ids[i-1]]) + tmp_clone_ids = np.array( + [x[5:].split(" ")[0] for x in state_cnv.columns if x[:5] == "clone"] + ) + clone_ids = np.array( + [x for i, x in enumerate(tmp_clone_ids) if i == 0 or x != tmp_clone_ids[i - 1]] + ) n_clones = len(clone_ids) n_states = state_cnv.shape[0] # parameter based on integer copy numbers - lambd = base_nb_mean / np.sum(base_nb_mean, axis=0, keepdims=True) if n_clones == base_nb_mean.shape[1] else base_nb_mean[:,1:] / np.sum(base_nb_mean[:,1:], axis=0, keepdims=True) + lambd = ( + base_nb_mean / np.sum(base_nb_mean, axis=0, keepdims=True) + if n_clones == base_nb_mean.shape[1] + else base_nb_mean[:, 1:] / np.sum(base_nb_mean[:, 1:], axis=0, keepdims=True) + ) log_mu_icn = np.zeros((n_states, n_clones)) - for c,cid in enumerate(clone_ids): - log_mu_icn[:,c] = np.log( (state_cnv[f"clone{cid} A"] + state_cnv[f"clone{cid} B"]) / lambd[:,c].dot( (state_cnv[f"clone{cid} A"] + state_cnv[f"clone{cid} B"])[pred[:,c]] ) ) - p_binom_icn = np.array([ state_cnv[f"clone{cid} A"] / (state_cnv[f"clone{cid} A"] + state_cnv[f"clone{cid} B"]) for cid in clone_ids ]).T + for c, cid in enumerate(clone_ids): + log_mu_icn[:, c] = np.log( + (state_cnv[f"clone{cid} A"] + state_cnv[f"clone{cid} B"]) + / lambd[:, c].dot( + (state_cnv[f"clone{cid} A"] + state_cnv[f"clone{cid} B"])[pred[:, c]] + ) + ) + p_binom_icn = np.array( + [ + state_cnv[f"clone{cid} A"] + / (state_cnv[f"clone{cid} A"] + state_cnv[f"clone{cid} B"]) + for cid in clone_ids + ] + ).T # handle 0 in p_binom_icn if n_clones == res["new_p_binom"].shape[1]: - p_binom_icn[((p_binom_icn == 0) | (p_binom_icn == 1))] = res["new_p_binom"][((p_binom_icn == 0) | (p_binom_icn == 1))] + p_binom_icn[((p_binom_icn == 0) | (p_binom_icn == 1))] = res["new_p_binom"][ + ((p_binom_icn == 0) | (p_binom_icn == 1)) + ] elif n_clones + 1 == res["new_p_binom"].shape[1]: - p_binom_icn[((p_binom_icn == 0) | (p_binom_icn == 1))] = res["new_p_binom"][:,1:][((p_binom_icn == 0) | (p_binom_icn == 1))] + p_binom_icn[((p_binom_icn == 0) | (p_binom_icn == 1))] = res["new_p_binom"][ + :, 1: + ][((p_binom_icn == 0) | (p_binom_icn == 1))] # over-dispersion - new_alphas = copy.copy(res["new_alphas"]) if n_clones == res["new_p_binom"].shape[1] else copy.copy(res["new_alphas"][:,1:]) - new_alphas[:,:] = np.max(new_alphas) - new_taus = copy.copy(res["new_taus"]) if n_clones == res["new_p_binom"].shape[1] else copy.copy(res["new_taus"][:,1:]) - new_taus[:,:] = np.min(new_taus) + new_alphas = ( + copy.copy(res["new_alphas"]) + if n_clones == 
res["new_p_binom"].shape[1] + else copy.copy(res["new_alphas"][:, 1:]) + ) + new_alphas[:, :] = np.max(new_alphas) + new_taus = ( + copy.copy(res["new_taus"]) + if n_clones == res["new_p_binom"].shape[1] + else copy.copy(res["new_taus"][:, 1:]) + ) + new_taus[:, :] = np.min(new_taus) # result variables single_llf_rdr = np.zeros((N, n_clones)) single_llf_baf = np.zeros((N, n_clones)) single_llf = np.zeros((N, n_clones)) - df_posterior = pd.DataFrame({k:np.zeros(N) for k in [f"post_BAF_clone_{cid}" for cid in clone_ids] + [f"post_RDR_clone_{cid}" for cid in clone_ids] + \ - [f"post_nodellf_clone_{cid}" for cid in clone_ids] + [f"post_combine_clone_{cid}" for cid in clone_ids] }) + df_posterior = pd.DataFrame( + { + k: np.zeros(N) + for k in [f"post_BAF_clone_{cid}" for cid in clone_ids] + + [f"post_RDR_clone_{cid}" for cid in clone_ids] + + [f"post_nodellf_clone_{cid}" for cid in clone_ids] + + [f"post_combine_clone_{cid}" for cid in clone_ids] + } + ) # for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] + idx = smooth_mat[i, :].nonzero()[1] if not (single_tumor_prop is None): idx = idx[~np.isnan(single_tumor_prop[idx])] for c in range(n_clones): if single_tumor_prop is None: - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), log_mu_icn[:,c:(c+1)], new_alphas[:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), p_binom_icn[:,c:(c+1)], new_taus[:,c:(c+1)] ) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + log_mu_icn[:, c : (c + 1)], + new_alphas[:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + p_binom_icn[:, c : (c + 1)], + new_taus[:, c : (c + 1)], + ) + ) else: - tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), log_mu_icn[:,c:(c+1)], new_alphas[:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), p_binom_icn[:,c:(c+1)], new_taus[:,c:(c+1)], np.repeat(np.mean(single_tumor_prop[idx]), single_X.shape[0]).reshape(-1,1) ) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + hmmclass.compute_emission_probability_nb_betabinom_mix( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + log_mu_icn[:, c : (c + 1)], + new_alphas[:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + p_binom_icn[:, c : (c + 1)], + new_taus[:, c : (c + 1)], + np.repeat( + np.mean(single_tumor_prop[idx]), single_X.shape[0] + ).reshape(-1, 1), + ) + ) assert not np.any(np.isnan(tmp_log_emission_rdr)) assert not np.any(np.isnan(tmp_log_emission_baf)) # !!! tmp_log_emission_baf may be NAN # Because LoH leads to Beta-binomial p = 0 or 1, but both A and B alleles are observed in the data, which leads to Nan. # We don't directly model the erroneous measurements associated with LoH. 
# - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,idx] > 0) / np.sum(single_base_nb_mean[:,idx] > 0) - single_llf_rdr[i,c] = ratio_nonzeros * np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) - single_llf_baf[i,c] = np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) - single_llf[i,c] = single_llf_rdr[i,c] + single_llf_baf[i,c] + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(single_total_bb_RD[:, idx] > 0) + / np.sum(single_base_nb_mean[:, idx] > 0) + ) + single_llf_rdr[i, c] = ratio_nonzeros * np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + single_llf_baf[i, c] = np.sum( + tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0] + ) + single_llf[i, c] = single_llf_rdr[i, c] + single_llf_baf[i, c] else: - single_llf_rdr[i,c] = np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) - single_llf_baf[i,c] = np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) - single_llf[i,c] = single_llf_rdr[i,c] + single_llf_baf[i,c] - - w_node = copy.copy(single_llf[i,:]) - w_node += log_persample_weights[:,sample_ids[i]] + single_llf_rdr[i, c] = np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + single_llf_baf[i, c] = np.sum( + tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0] + ) + single_llf[i, c] = single_llf_rdr[i, c] + single_llf_baf[i, c] + + w_node = copy.copy(single_llf[i, :]) + w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: if n_clones == base_nb_mean.shape[1]: - w_edge[prev_assignment[j]] += adjacency_mat[i,j] + w_edge[prev_assignment[j]] += adjacency_mat[i, j] else: - w_edge[prev_assignment[j] - 1] += adjacency_mat[i,j] + w_edge[prev_assignment[j] - 1] += adjacency_mat[i, j] # - df_posterior.iloc[i,:n_clones] = np.exp( single_llf_baf[i,:] - scipy.special.logsumexp(single_llf_baf[i,:]) ) - df_posterior.iloc[i,n_clones:(2*n_clones)] = np.exp( single_llf_rdr[i,:] - scipy.special.logsumexp(single_llf_rdr[i,:]) ) - df_posterior.iloc[i,(2*n_clones):(3*n_clones)] = np.exp( single_llf[i,:] - scipy.special.logsumexp(single_llf[i,:]) ) - df_posterior.iloc[i,(3*n_clones):(4*n_clones)] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + df_posterior.iloc[i, :n_clones] = np.exp( + single_llf_baf[i, :] - scipy.special.logsumexp(single_llf_baf[i, :]) + ) + df_posterior.iloc[i, n_clones : (2 * n_clones)] = np.exp( + single_llf_rdr[i, :] - scipy.special.logsumexp(single_llf_rdr[i, :]) + ) + df_posterior.iloc[i, (2 * n_clones) : (3 * n_clones)] = np.exp( + single_llf[i, :] - scipy.special.logsumexp(single_llf[i, :]) + ) + df_posterior.iloc[i, (3 * n_clones) : (4 * n_clones)] = np.exp( + w_node + + spatial_weight * w_edge + - scipy.special.logsumexp(w_node + spatial_weight * w_edge) + ) return df_posterior diff --git a/src/calicost/hmrf_normalmixture.py b/src/calicost/hmrf_normalmixture.py index af68580..5c05f0e 100644 --- a/src/calicost/hmrf_normalmixture.py +++ b/src/calicost/hmrf_normalmixture.py @@ -15,4 +15,3 @@ import warnings from statsmodels.tools.sm_exceptions import ValueWarning - diff --git a/src/calicost/joint_allele_generateconfig.py b/src/calicost/joint_allele_generateconfig.py index 7a16294..9898135 100644 --- a/src/calicost/joint_allele_generateconfig.py 
+++ b/src/calicost/joint_allele_generateconfig.py @@ -19,118 +19,120 @@ def read_joint_configuration_file(filename): ##### [Default settings] ##### config = { - "input_filelist" : None, - "snp_dir" : None, - "output_dir" : None, + "input_filelist": None, + "snp_dir": None, + "output_dir": None, # supporting files and preprocessing arguments - "hgtable_file" : None, - "normalidx_file" : None, - "tumorprop_file" : None, - "supervision_clone_file" : None, - "alignment_files" : [], - "filtergenelist_file" : None, - "filterregion_file" : None, - "binsize" : 1, - "rdrbinsize" : 1, + "hgtable_file": None, + "normalidx_file": None, + "tumorprop_file": None, + "supervision_clone_file": None, + "alignment_files": [], + "filtergenelist_file": None, + "filterregion_file": None, + "binsize": 1, + "rdrbinsize": 1, # "secondbinning_min_umi" : 500, - "max_nbins" : 1200, - "avg_umi_perbinspot" : 1.5, - "bafonly" : True, + "max_nbins": 1200, + "avg_umi_perbinspot": 1.5, + "bafonly": True, # phase switch probability - "nu" : 1, - "logphase_shift" : 1, - "npart_phasing" : 2, + "nu": 1, + "logphase_shift": 1, + "npart_phasing": 2, # HMRF configurations - "n_clones" : None, - "n_clones_rdr" : 2, - "min_spots_per_clone" : 100, - "min_avgumi_per_clone" : 10, - "maxspots_pooling" : 7, - "tumorprop_threshold" : 0.5, - "max_iter_outer" : 20, - "nodepotential" : "max", # max or weighted_sum - "initialization_method" : "rectangle", # rectangle or datadrive - "num_hmrf_initialization_start" : 0, - "num_hmrf_initialization_end" : 10, - "spatial_weight" : 2.0, - "construct_adjacency_method" : "hexagon", - "construct_adjacency_w" : 1.0, + "n_clones": None, + "n_clones_rdr": 2, + "min_spots_per_clone": 100, + "min_avgumi_per_clone": 10, + "maxspots_pooling": 7, + "tumorprop_threshold": 0.5, + "max_iter_outer": 20, + "nodepotential": "max", # max or weighted_sum + "initialization_method": "rectangle", # rectangle or datadrive + "num_hmrf_initialization_start": 0, + "num_hmrf_initialization_end": 10, + "spatial_weight": 2.0, + "construct_adjacency_method": "hexagon", + "construct_adjacency_w": 1.0, # HMM configurations - "n_states" : None, - "params" : None, - "t" : None, - "t_phaseing" : 1-1e-4, - "fix_NB_dispersion" : False, - "shared_NB_dispersion" : True, - "fix_BB_dispersion" : False, - "shared_BB_dispersion" : True, - "max_iter" : 30, - "tol" : 1e-3, - "gmm_random_state" : 0, - "np_threshold" : 2.0, - "np_eventminlen" : 10 + "n_states": None, + "params": None, + "t": None, + "t_phaseing": 1 - 1e-4, + "fix_NB_dispersion": False, + "shared_NB_dispersion": True, + "fix_BB_dispersion": False, + "shared_BB_dispersion": True, + "max_iter": 30, + "tol": 1e-3, + "gmm_random_state": 0, + "np_threshold": 2.0, + "np_eventminlen": 10, } argument_type = { - "input_filelist" : "str", - "snp_dir" : "str", - "output_dir" : "str", + "input_filelist": "str", + "snp_dir": "str", + "output_dir": "str", # supporting files and preprocessing arguments - "hgtable_file" : "str", - "normalidx_file" : "str", - "tumorprop_file" : "str", - "supervision_clone_file" : "str", - "alignment_files" : "list_str", - "filtergenelist_file" : "str", - "filterregion_file" : "str", - "binsize" : "int", - "rdrbinsize" : "int", + "hgtable_file": "str", + "normalidx_file": "str", + "tumorprop_file": "str", + "supervision_clone_file": "str", + "alignment_files": "list_str", + "filtergenelist_file": "str", + "filterregion_file": "str", + "binsize": "int", + "rdrbinsize": "int", # "secondbinning_min_umi" : "int", - "max_nbins" : "int", - "avg_umi_perbinspot" : 
"float", - "bafonly" : "bool", + "max_nbins": "int", + "avg_umi_perbinspot": "float", + "bafonly": "bool", # phase switch probability - "nu" : "float", - "logphase_shift" : "float", - "npart_phasing" : "int", + "nu": "float", + "logphase_shift": "float", + "npart_phasing": "int", # HMRF configurations - "n_clones" : "int", - "n_clones_rdr" : "int", - "min_spots_per_clone" : "int", - "min_avgumi_per_clone" : "int", - "maxspots_pooling" : "int", - "tumorprop_threshold" : "float", - "max_iter_outer" : "int", - "nodepotential" : "str", - "initialization_method" : "str", - "num_hmrf_initialization_start" : "int", - "num_hmrf_initialization_end" : "int", - "spatial_weight" : "float", - "construct_adjacency_method" : "str", - "construct_adjacency_w" : "float", + "n_clones": "int", + "n_clones_rdr": "int", + "min_spots_per_clone": "int", + "min_avgumi_per_clone": "int", + "maxspots_pooling": "int", + "tumorprop_threshold": "float", + "max_iter_outer": "int", + "nodepotential": "str", + "initialization_method": "str", + "num_hmrf_initialization_start": "int", + "num_hmrf_initialization_end": "int", + "spatial_weight": "float", + "construct_adjacency_method": "str", + "construct_adjacency_w": "float", # HMM configurations - "n_states" : "int", - "params" : "str", - "t" : "eval", - "t_phaseing" : "eval", - "fix_NB_dispersion" : "bool", - "shared_NB_dispersion" : "bool", - "fix_BB_dispersion" : "bool", - "shared_BB_dispersion" : "bool", - "max_iter" : "int", - "tol" : "float", - "gmm_random_state" : "int", - "np_threshold" : "float", - "np_eventminlen" : "int" + "n_states": "int", + "params": "str", + "t": "eval", + "t_phaseing": "eval", + "fix_NB_dispersion": "bool", + "shared_NB_dispersion": "bool", + "fix_BB_dispersion": "bool", + "shared_BB_dispersion": "bool", + "max_iter": "int", + "tol": "float", + "gmm_random_state": "int", + "np_threshold": "float", + "np_eventminlen": "int", } ##### [ read configuration file to update settings ] ##### - with open(filename, 'r') as fp: + with open(filename, "r") as fp: for line in fp: if line.strip() == "" or line[0] == "#": continue strs = [x.strip() for x in line.strip().split(":") if x != ""] - assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" + assert ( + strs[0] in config.keys() + ), f"{strs[0]} is not a valid configuration parameter! 
Configuration parameters are: {list(config.keys())}" if len(strs) == 1: config[strs[0]] = [] elif strs[1].upper() == "NONE": @@ -144,7 +146,7 @@ def read_joint_configuration_file(filename): elif argument_type[strs[0]] == "eval": config[strs[0]] = eval(strs[1]) elif argument_type[strs[0]] == "bool": - config[strs[0]] = (strs[1].upper() == "TRUE") + config[strs[0]] = strs[1].upper() == "TRUE" elif argument_type[strs[0]] == "list_str": config[strs[0]] = strs[1].split(" ") # assertions @@ -155,12 +157,10 @@ def read_joint_configuration_file(filename): return config - def write_joint_config_file(outputfilename, config): - list_argument_io = ["input_filelist", - "snp_dir", - "output_dir"] - list_argument_sup = ["hgtable_file", + list_argument_io = ["input_filelist", "snp_dir", "output_dir"] + list_argument_sup = [ + "hgtable_file", "normalidx_file", "tumorprop_file", "supervision_clone_file", @@ -172,11 +172,11 @@ def write_joint_config_file(outputfilename, config): # "secondbinning_min_umi", "max_nbins", "avg_umi_perbinspot", - "bafonly"] - list_argument_phase = ["nu", - "logphase_shift", - "npart_phasing"] - list_argument_hmrf = ["n_clones", + "bafonly", + ] + list_argument_phase = ["nu", "logphase_shift", "npart_phasing"] + list_argument_hmrf = [ + "n_clones", "n_clones_rdr", "min_spots_per_clone", "min_avgumi_per_clone", @@ -185,12 +185,14 @@ def write_joint_config_file(outputfilename, config): "max_iter_outer", "nodepotential", "initialization_method", - "num_hmrf_initialization_start", + "num_hmrf_initialization_start", "num_hmrf_initialization_end", "spatial_weight", "construct_adjacency_method", - "construct_adjacency_w"] - list_argument_hmm = ["n_states", + "construct_adjacency_w", + ] + list_argument_hmm = [ + "n_states", "params", "t", "t_phaseing", @@ -202,8 +204,9 @@ def write_joint_config_file(outputfilename, config): "tol", "gmm_random_state", "np_threshold", - "np_eventminlen"] - with open(outputfilename, 'w') as fp: + "np_eventminlen", + ] + with open(outputfilename, "w") as fp: # for k in list_argument_io: fp.write(f"{k} : {config[k]}\n") @@ -240,12 +243,14 @@ def main(argv): config = read_joint_configuration_file(template_configuration_file) for r in range(hmrf_seed_s, hmrf_seed_t): config["num_hmrf_initialization_start"] = r - config["num_hmrf_initialization_end"] = r+1 + config["num_hmrf_initialization_end"] = r + 1 write_joint_config_file(f"{outputdir}/configfile{r}", config) - + if __name__ == "__main__": if len(sys.argv) == 1: - print("python joint_allele_generateconfig.py ") + print( + "python joint_allele_generateconfig.py " + ) if len(sys.argv) > 1: - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/src/calicost/oldcode.py b/src/calicost/oldcode.py index 217dc49..88ec5fa 100644 --- a/src/calicost/oldcode.py +++ b/src/calicost/oldcode.py @@ -10,8 +10,18 @@ # M step related ############################################################ -def update_emission_params_nb_sitewise(X_nb, log_gamma, base_nb_mean, alphas, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): + +def update_emission_params_nb_sitewise( + X_nb, + log_gamma, + base_nb_mean, + alphas, + start_log_mu=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + min_log_rdr=-2, + max_log_rdr=2, +): """ Attributes ---------- @@ -32,59 +42,133 @@ def update_emission_params_nb_sitewise(X_nb, log_gamma, base_nb_mean, alphas, \ new_log_mu = np.zeros((n_states, n_spots)) new_alphas = alphas for s in range(n_spots): - idx_nonzero = 
np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero = np.where(base_nb_mean[:, s] > 0)[0] for i in range(n_states): - model = sm.GLM(X_nb[idx_nonzero,s], np.ones(len(idx_nonzero)).reshape(-1,1), \ - family=sm.families.NegativeBinomial(alpha=alphas[i,s]), \ - exposure=base_nb_mean[idx_nonzero,s], var_weights=gamma[i,idx_nonzero]+gamma[i+n_states,idx_nonzero]) + model = sm.GLM( + X_nb[idx_nonzero, s], + np.ones(len(idx_nonzero)).reshape(-1, 1), + family=sm.families.NegativeBinomial(alpha=alphas[i, s]), + exposure=base_nb_mean[idx_nonzero, s], + var_weights=gamma[i, idx_nonzero] + + gamma[i + n_states, idx_nonzero], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] # print(s, i, res.params) if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array([start_log_mu[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if -model.loglike(res.params) < -model.loglike(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array([start_log_mu[i, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if -model.loglike(res.params) < -model.loglike(res2.params) + else res2.params[0] + ) else: new_log_mu = np.zeros((n_states, n_spots)) new_alphas = np.zeros((n_states, n_spots)) if not shared_NB_dispersion: for s in range(n_spots): - idx_nonzero = np.where(base_nb_mean[:,s] > 0)[0] + idx_nonzero = np.where(base_nb_mean[:, s] > 0)[0] for i in range(n_states): - model = Weighted_NegativeBinomial(X_nb[idx_nonzero,s], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=gamma[i,idx_nonzero]+gamma[i+n_states,idx_nonzero], exposure=base_nb_mean[idx_nonzero,s]) + model = Weighted_NegativeBinomial( + X_nb[idx_nonzero, s], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=gamma[i, idx_nonzero] + + gamma[i + n_states, idx_nonzero], + exposure=base_nb_mean[idx_nonzero, s], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] new_alphas[i, s] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_log_mu[i, s]], [alphas[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_alphas[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: for s in range(n_spots): - idx_nonzero = np.where(base_nb_mean[:,s] > 0)[0] - all_states_nb_mean = np.tile(base_nb_mean[idx_nonzero,s], n_states) - all_states_y = np.tile(X_nb[idx_nonzero,s], n_states) - all_states_weights = np.concatenate([gamma[i,idx_nonzero]+gamma[i+n_states,idx_nonzero] for i in range(n_states)]) - all_states_features = np.zeros((n_states*len(idx_nonzero), n_states)) + idx_nonzero = np.where(base_nb_mean[:, s] > 0)[0] + all_states_nb_mean = np.tile(base_nb_mean[idx_nonzero, s], n_states) + all_states_y = np.tile(X_nb[idx_nonzero, s], n_states) + all_states_weights = np.concatenate( + [ + gamma[i, idx_nonzero] + gamma[i + n_states, 
idx_nonzero] + for i in range(n_states) + ] + ) + all_states_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - all_states_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 - model = Weighted_NegativeBinomial(all_states_y, all_states_features, weights=all_states_weights, exposure=all_states_nb_mean) + all_states_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 + model = Weighted_NegativeBinomial( + all_states_y, + all_states_features, + weights=all_states_weights, + exposure=all_states_nb_mean, + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_log_mu[:,s] = res.params[:-1] - new_alphas[:,s] = res.params[-1] + new_log_mu[:, s] = res.params[:-1] + new_alphas[:, s] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append(start_log_mu[:,s], [alphas[0,s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[:,s] = res.params[:-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[:-1] - new_alphas[:,s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append(start_log_mu[:, s], [alphas[0, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[:, s] = ( + res.params[:-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[:-1] + ) + new_alphas[:, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas -def update_emission_params_bb_sitewise(X_bb, log_gamma, total_bb_RD, taus, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): +def update_emission_params_bb_sitewise( + X_bb, + log_gamma, + total_bb_RD, + taus, + start_p_binom=None, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + percent_threshold=0.99, + min_binom_prob=0.01, + max_binom_prob=0.99, +): """ Attributes ---------- @@ -103,64 +187,160 @@ def update_emission_params_bb_sitewise(X_bb, log_gamma, total_bb_RD, taus, \ # initialization new_p_binom = np.ones((n_states, n_spots)) * 0.5 new_taus = copy.copy(taus) - if fix_BB_dispersion: + if fix_BB_dispersion: for s in np.arange(X_bb.shape[1]): - idx_nonzero = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero = np.where(total_bb_RD[:, s] > 0)[0] for i in range(n_states): - model = Weighted_BetaBinom_fixdispersion(np.append(X_bb[idx_nonzero,s], total_bb_RD[idx_nonzero,s]-X_bb[idx_nonzero,s]), \ - np.ones(2*len(idx_nonzero)).reshape(-1,1), \ - taus[i,s], \ - weights=np.append(gamma[i,idx_nonzero], gamma[i+n_states,idx_nonzero]), \ - exposure=np.append(total_bb_RD[idx_nonzero,s], total_bb_RD[idx_nonzero,s]) ) + model = Weighted_BetaBinom_fixdispersion( + np.append( + X_bb[idx_nonzero, s], + total_bb_RD[idx_nonzero, s] - X_bb[idx_nonzero, s], + ), + np.ones(2 * len(idx_nonzero)).reshape(-1, 1), + taus[i, s], + weights=np.append( + gamma[i, idx_nonzero], gamma[i + n_states, idx_nonzero] + ), + exposure=np.append( + total_bb_RD[idx_nonzero, s], total_bb_RD[idx_nonzero, s] + ), + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array(start_p_binom[i, 
s]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array(start_p_binom[i, s]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) else: if not shared_BB_dispersion: for s in np.arange(X_bb.shape[1]): - idx_nonzero = np.where(total_bb_RD[:,s] > 0)[0] + idx_nonzero = np.where(total_bb_RD[:, s] > 0)[0] for i in range(n_states): - model = Weighted_BetaBinom(np.append(X_bb[idx_nonzero,s], total_bb_RD[idx_nonzero,s]-X_bb[idx_nonzero,s]), \ - np.ones(2*len(idx_nonzero)).reshape(-1,1), \ - weights=np.append(gamma[i,idx_nonzero], gamma[i+n_states,idx_nonzero]), \ - exposure=np.append(total_bb_RD[idx_nonzero,s], total_bb_RD[idx_nonzero,s]) ) + model = Weighted_BetaBinom( + np.append( + X_bb[idx_nonzero, s], + total_bb_RD[idx_nonzero, s] - X_bb[idx_nonzero, s], + ), + np.ones(2 * len(idx_nonzero)).reshape(-1, 1), + weights=np.append( + gamma[i, idx_nonzero], gamma[i + n_states, idx_nonzero] + ), + exposure=np.append( + total_bb_RD[idx_nonzero, s], total_bb_RD[idx_nonzero, s] + ), + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] new_taus[i, s] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_taus[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: for s in np.arange(X_bb.shape[1]): - idx_nonzero = np.where(total_bb_RD[:,s] > 0)[0] - all_states_exposure = np.tile( np.append(total_bb_RD[idx_nonzero,s], total_bb_RD[idx_nonzero,s]), n_states) - all_states_y = np.tile( np.append(X_bb[idx_nonzero,s], total_bb_RD[idx_nonzero,s]-X_bb[idx_nonzero,s]), n_states) - all_states_weights = np.concatenate([ np.append(gamma[i,idx_nonzero], gamma[i+n_states,idx_nonzero]) for i in range(n_states) ]) - all_states_features = np.zeros((2*n_states*len(idx_nonzero), n_states)) + idx_nonzero = np.where(total_bb_RD[:, s] > 0)[0] + all_states_exposure = np.tile( + np.append(total_bb_RD[idx_nonzero, s], total_bb_RD[idx_nonzero, s]), + n_states, + ) + all_states_y = np.tile( + np.append( + X_bb[idx_nonzero, s], + total_bb_RD[idx_nonzero, s] - X_bb[idx_nonzero, s], + ), + n_states, + ) + all_states_weights = np.concatenate( + [ + np.append( + gamma[i, idx_nonzero], gamma[i + n_states, idx_nonzero] + ) + for i in range(n_states) + ] + ) + all_states_features = np.zeros( + (2 * n_states * len(idx_nonzero), n_states) + ) for i in np.arange(n_states): - all_states_features[(i*2*len(idx_nonzero)):((i+1)*2*len(idx_nonzero)), i] = 1 - model = Weighted_BetaBinom(all_states_y, all_states_features, weights=all_states_weights, exposure=all_states_exposure) + all_states_features[ + (i * 2 * len(idx_nonzero)) : 
((i + 1) * 2 * len(idx_nonzero)), i + ] = 1 + model = Weighted_BetaBinom( + all_states_y, + all_states_features, + weights=all_states_weights, + exposure=all_states_exposure, + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - new_p_binom[:,s] = res.params[:-1] - new_p_binom[new_p_binom[:,s] < min_binom_prob, s] = min_binom_prob - new_p_binom[new_p_binom[:,s] > max_binom_prob, s] = max_binom_prob + new_p_binom[:, s] = res.params[:-1] + new_p_binom[new_p_binom[:, s] < min_binom_prob, s] = min_binom_prob + new_p_binom[new_p_binom[:, s] > max_binom_prob, s] = max_binom_prob if res.params[-1] > 0: new_taus[:, s] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append(start_p_binom[:,s], [taus[0, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[:,s] = res.params[:-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[:-1] - new_p_binom[new_p_binom[:,s] < min_binom_prob, s] = min_binom_prob - new_p_binom[new_p_binom[:,s] > max_binom_prob, s] = max_binom_prob + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append(start_p_binom[:, s], [taus[0, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[:, s] = ( + res.params[:-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[:-1] + ) + new_p_binom[new_p_binom[:, s] < min_binom_prob, s] = min_binom_prob + new_p_binom[new_p_binom[:, s] > max_binom_prob, s] = max_binom_prob if res2.params[-1] > 0: - new_taus[:,s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + new_taus[:, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) return new_p_binom, new_taus - -def hmrf_log_likelihood(nodepotential, single_X, single_base_nb_mean, single_total_bb_RD, res, pred, smooth_mat, adjacency_mat, assignment, spatial_weight): +def hmrf_log_likelihood( + nodepotential, + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + pred, + smooth_mat, + adjacency_mat, + assignment, + spatial_weight, +): N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_p_binom"].shape[1] @@ -168,40 +348,98 @@ def hmrf_log_likelihood(nodepotential, single_X, single_base_nb_mean, single_tot single_llf = np.zeros((N, n_clones)) # for i in trange(N): - idx = smooth_mat[i,:].nonzero()[1] # smooth_mat can be identity matrix + idx = smooth_mat[i, :].nonzero()[1] # smooth_mat can be identity matrix for c in range(n_clones): - tmp_log_emission_rdr, tmp_log_emission_baf = compute_emission_probability_nb_betabinom_phaseswitch( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \ - np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"][:,c:(c+1)], res["new_alphas"][:,c:(c+1)], \ - np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"][:,c:(c+1)], res["new_taus"][:,c:(c+1)]) + tmp_log_emission_rdr, tmp_log_emission_baf = ( + compute_emission_probability_nb_betabinom_phaseswitch( + np.sum(single_X[:, :, idx], axis=2, keepdims=True), + np.sum(single_base_nb_mean[:, idx], axis=1, keepdims=True), + res["new_log_mu"][:, c : (c + 1)], + res["new_alphas"][:, c : (c + 1)], + np.sum(single_total_bb_RD[:, idx], axis=1, keepdims=True), + res["new_p_binom"][:, c : (c + 1)], + res["new_taus"][:, c : (c + 1)], + ) + ) # if nodepotential == "weighted_sum": - if np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) > 0 and np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) > 0: - 
ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:,:, 0] + res["log_gamma"][:,:,c], axis=0) ) + np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:,:, 0] + res["log_gamma"][:,:,c], axis=0) ) + if ( + np.sum(np.sum(single_base_nb_mean[:, idx], axis=1) > 0) > 0 + and np.sum(np.sum(single_total_bb_RD[:, idx], axis=1) > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(np.sum(single_total_bb_RD[:, idx], axis=1) > 0) + / np.sum(np.sum(single_base_nb_mean[:, idx], axis=1) > 0) + ) + single_llf[i, c] = ratio_nonzeros * np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) else: - single_llf[i,c] = np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:,:,0] + res["log_gamma"][:,:,c], axis=0) ) + single_llf[i, c] = np.sum( + scipy.special.logsumexp( + tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) + np.sum( + scipy.special.logsumexp( + tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, :, c], + axis=0, + ) + ) else: - if np.sum(single_base_nb_mean[:,idx] > 0) > 0 and np.sum(single_total_bb_RD[:,idx] > 0) > 0: - ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) - single_llf[i,c] = ratio_nonzeros * np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) + if ( + np.sum(single_base_nb_mean[:, idx] > 0) > 0 + and np.sum(single_total_bb_RD[:, idx] > 0) > 0 + ): + ratio_nonzeros = ( + 1.0 + * np.sum(np.sum(single_total_bb_RD[:, idx], axis=1) > 0) + / np.sum(np.sum(single_base_nb_mean[:, idx], axis=1) > 0) + ) + single_llf[i, c] = ratio_nonzeros * np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0]) else: - single_llf[i,c] = np.sum(tmp_log_emission_rdr[pred[:,c], np.arange(n_obs), 0]) + np.sum(tmp_log_emission_baf[pred[:,c], np.arange(n_obs), 0]) + single_llf[i, c] = np.sum( + tmp_log_emission_rdr[pred[:, c], np.arange(n_obs), 0] + ) + np.sum(tmp_log_emission_baf[pred[:, c], np.arange(n_obs), 0]) # # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(assignment[adjacency_mat[i,:].nonzero()[1]] == assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum(assignment[adjacency_mat[i, :].nonzero()[1]] == assignment[i]) + ) return total_llf -def hmrf_reassignment_compositehmm(single_X, single_base_nb_mean, single_total_bb_RD, res, pred, adjacency_mat, prev_assignment, spatial_weight): +def hmrf_reassignment_compositehmm( + single_X, + single_base_nb_mean, + single_total_bb_RD, + res, + pred, + adjacency_mat, + prev_assignment, + spatial_weight, +): # basic dimension info N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = np.max(prev_assignment) + 1 n_individual_states = int(len(res["new_p_binom"]) / 2.0) n_composite_states = int(len(res["state_tuples"]) / 2.0) - + # initialize result vector single_llf = np.zeros((N, n_clones)) new_assignment = 
copy.copy(prev_assignment) @@ -209,153 +447,350 @@ def hmrf_reassignment_compositehmm(single_X, single_base_nb_mean, single_total_b # re-assign by HMRF for i in trange(N): # log emission probability of each composite state, matrix size (2*n_composite_states, n_obs) - tmp_log_emission = compute_emission_probability_nb_betabinom_composite(single_X[:,:,i:(i+1)], res["state_tuples"], \ - single_base_nb_mean[:,i:(i+1)], res["new_log_mu"], res["new_alphas"], single_total_bb_RD[:,i:(i+1)], \ - res["new_p_binom"], res["new_taus"], res["new_scalefactors"]) + tmp_log_emission = compute_emission_probability_nb_betabinom_composite( + single_X[:, :, i : (i + 1)], + res["state_tuples"], + single_base_nb_mean[:, i : (i + 1)], + res["new_log_mu"], + res["new_alphas"], + single_total_bb_RD[:, i : (i + 1)], + res["new_p_binom"], + res["new_taus"], + res["new_scalefactors"], + ) for c in range(n_clones): - single_llf[i,c] = np.sum(tmp_log_emission[pred[(c*n_obs):(c*n_obs+n_obs)], np.arange(n_obs)]) + single_llf[i, c] = np.sum( + tmp_log_emission[ + pred[(c * n_obs) : (c * n_obs + n_obs)], np.arange(n_obs) + ] + ) # node potential - w_node = single_llf[i,:] + w_node = single_llf[i, :] # edge potential w_edge = np.zeros(n_clones) - for j in adjacency_mat[i,:].nonzero()[1]: + for j in adjacency_mat[i, :].nonzero()[1]: # w_edge[new_assignment[j]] += 1 - w_edge[new_assignment[j]] += adjacency_mat[i,j] + w_edge[new_assignment[j]] += adjacency_mat[i, j] # combine both potential for the new assignment - new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge ) - + new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) + # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) for i in range(N): - total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) ) + total_llf += np.sum( + spatial_weight + * np.sum( + new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] + ) + ) return new_assignment, single_llf, total_llf - def allele_starch_combine_clones(): - res_combine = {"new_assignment":np.zeros(single_X.shape[2], dtype=int)} + res_combine = {"new_assignment": np.zeros(single_X.shape[2], dtype=int)} offset_clone = 0 for bafc in range(n_baf_clones): prefix = f"clone{bafc}" - allres = dict( np.load(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", allow_pickle=True) ) + allres = dict( + np.load( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) r = allres["num_iterations"] - 1 - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} - idx_spots = np.where(adata.obs.index.isin( allres["barcodes"] ))[0] + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], 
+ "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } + idx_spots = np.where(adata.obs.index.isin(allres["barcodes"]))[0] n_obs = single_X.shape[0] if len(np.unique(res["new_assignment"])) == 1: n_merged_clones = 1 c = res["new_assignment"][0] merged_res = copy.copy(res) merged_res["new_assignment"] = np.zeros(len(idx_spots), dtype=int) - log_gamma = res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)].reshape((2*config["n_states"], n_obs, 1)) - pred_cnv = res["pred_cnv"][ (c*n_obs):(c*n_obs+n_obs) ].reshape((-1,1)) + log_gamma = res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)].reshape( + (2 * config["n_states"], n_obs, 1) + ) + pred_cnv = res["pred_cnv"][(c * n_obs) : (c * n_obs + n_obs)].reshape( + (-1, 1) + ) else: if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(res["new_assignment"]==c)[0] for c in range(n_clones_rdr)]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(res["new_assignment"] == c)[0] + for c in range(n_clones_rdr) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(res["new_assignment"]==c)[0] for c in range(n_clones_rdr)], single_tumor_prop[idx_spots]) - merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res, params="smp", tumor_prop=tumor_prop) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(res["new_assignment"] == c)[0] + for c in range(n_clones_rdr) + ], + single_tumor_prop[idx_spots], + ) + ) + merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson( + X, base_nb_mean, total_bb_RD, res, params="smp", tumor_prop=tumor_prop + ) print(f"part {bafc} merging_groups: {merging_groups}") # if config["tumorprop_file"] is None: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], res, min_spots_thresholds=50) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], res, min_spots_thresholds=50 + ) else: - merging_groups, merged_res = merge_by_minspots(merged_res["new_assignment"], res, min_spots_thresholds=50, single_tumor_prop=single_tumor_prop[idx_spots]) + merging_groups, merged_res = merge_by_minspots( + merged_res["new_assignment"], + res, + min_spots_thresholds=50, + single_tumor_prop=single_tumor_prop[idx_spots], + ) # compute posterior using the newly merged pseudobulk n_merged_clones = len(merging_groups) tmp = copy.copy(merged_res["new_assignment"]) if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(merged_res["new_assignment"]==c)[0] for c in range(n_merged_clones)]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + 
single_total_bb_RD[:, idx_spots], + [ + np.where(merged_res["new_assignment"] == c)[0] + for c in range(n_merged_clones) + ], + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X[:,:,idx_spots], single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], [np.where(merged_res["new_assignment"]==c)[0] for c in range(n_merged_clones)], single_tumor_prop[idx_spots]) + X, base_nb_mean, total_bb_RD, tumor_prop = ( + merge_pseudobulk_by_index_mix( + single_X[:, :, idx_spots], + single_base_nb_mean[:, idx_spots], + single_total_bb_RD[:, idx_spots], + [ + np.where(merged_res["new_assignment"] == c)[0] + for c in range(n_merged_clones) + ], + single_tumor_prop[idx_spots], + ) + ) # - merged_res = pipeline_baum_welch(None, np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), np.tile(lengths, X.shape[2]), config["n_states"], \ - base_nb_mean.flatten("F").reshape(-1,1), total_bb_RD.flatten("F").reshape(-1,1), np.tile(log_sitewise_transmat, X.shape[2]), tumor_prop, params="smp", t=config["t"], random_state=config["gmm_random_state"], \ - fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ - is_diag=True, init_log_mu=res["new_log_mu"], init_p_binom=res["new_p_binom"], init_alphas=res["new_alphas"], init_taus=res["new_taus"], max_iter=config["max_iter"], tol=config["tol"]) + merged_res = pipeline_baum_welch( + None, + np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( + -1, 2, 1 + ), + np.tile(lengths, X.shape[2]), + config["n_states"], + base_nb_mean.flatten("F").reshape(-1, 1), + total_bb_RD.flatten("F").reshape(-1, 1), + np.tile(log_sitewise_transmat, X.shape[2]), + tumor_prop, + params="smp", + t=config["t"], + random_state=config["gmm_random_state"], + fix_NB_dispersion=config["fix_NB_dispersion"], + shared_NB_dispersion=config["shared_NB_dispersion"], + fix_BB_dispersion=config["fix_BB_dispersion"], + shared_BB_dispersion=config["shared_BB_dispersion"], + is_diag=True, + init_log_mu=res["new_log_mu"], + init_p_binom=res["new_p_binom"], + init_alphas=res["new_alphas"], + init_taus=res["new_taus"], + max_iter=config["max_iter"], + tol=config["tol"], + ) merged_res["new_assignment"] = copy.copy(tmp) - log_gamma = np.stack([ merged_res["log_gamma"][:,(c*n_obs):(c*n_obs+n_obs)] for c in range(n_merged_clones) ], axis=-1) - pred_cnv = np.vstack([ merged_res["pred_cnv"][(c*n_obs):(c*n_obs+n_obs)] for c in range(n_merged_clones) ]).T + log_gamma = np.stack( + [ + merged_res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)] + for c in range(n_merged_clones) + ], + axis=-1, + ) + pred_cnv = np.vstack( + [ + merged_res["pred_cnv"][(c * n_obs) : (c * n_obs + n_obs)] + for c in range(n_merged_clones) + ] + ).T # add to res_combine if len(res_combine) == 1: - res_combine.update({"new_log_mu":np.hstack([ merged_res["new_log_mu"] ] * n_merged_clones), "new_alphas":np.hstack([ merged_res["new_alphas"] ] * n_merged_clones), \ - "new_p_binom":np.hstack([ merged_res["new_p_binom"] ] * n_merged_clones), "new_taus":np.hstack([ merged_res["new_taus"] ] * n_merged_clones), \ - "log_gamma":log_gamma, "pred_cnv":pred_cnv}) + res_combine.update( + { + "new_log_mu": np.hstack( + [merged_res["new_log_mu"]] * n_merged_clones + ), + "new_alphas": np.hstack( + [merged_res["new_alphas"]] * n_merged_clones + ), + "new_p_binom": np.hstack( + [merged_res["new_p_binom"]] * 
n_merged_clones + ), + "new_taus": np.hstack([merged_res["new_taus"]] * n_merged_clones), + "log_gamma": log_gamma, + "pred_cnv": pred_cnv, + } + ) else: - res_combine.update({"new_log_mu":np.hstack([res_combine["new_log_mu"]] + [ merged_res["new_log_mu"] ] * n_merged_clones), "new_alphas":np.hstack([res_combine["new_alphas"]] + [ merged_res["new_alphas"] ] * n_merged_clones), \ - "new_p_binom":np.hstack([res_combine["new_p_binom"]] + [ merged_res["new_p_binom"] ] * n_merged_clones), "new_taus":np.hstack([res_combine["new_taus"]] + [ merged_res["new_taus"] ] * n_merged_clones), \ - "log_gamma":np.dstack([res_combine["log_gamma"], log_gamma ]), "pred_cnv":np.hstack([res_combine["pred_cnv"], pred_cnv])}) - res_combine["new_assignment"][idx_spots] = merged_res["new_assignment"] + offset_clone + res_combine.update( + { + "new_log_mu": np.hstack( + [res_combine["new_log_mu"]] + + [merged_res["new_log_mu"]] * n_merged_clones + ), + "new_alphas": np.hstack( + [res_combine["new_alphas"]] + + [merged_res["new_alphas"]] * n_merged_clones + ), + "new_p_binom": np.hstack( + [res_combine["new_p_binom"]] + + [merged_res["new_p_binom"]] * n_merged_clones + ), + "new_taus": np.hstack( + [res_combine["new_taus"]] + + [merged_res["new_taus"]] * n_merged_clones + ), + "log_gamma": np.dstack([res_combine["log_gamma"], log_gamma]), + "pred_cnv": np.hstack([res_combine["pred_cnv"], pred_cnv]), + } + ) + res_combine["new_assignment"][idx_spots] = ( + merged_res["new_assignment"] + offset_clone + ) offset_clone += n_merged_clones # compute HMRF log likelihood - total_llf = hmrf_log_likelihood(config["nodepotential"], single_X, single_base_nb_mean, single_total_bb_RD, res_combine, np.argmax(res_combine["log_gamma"],axis=0), smooth_mat, adjacency_mat, res_combine["new_assignment"], config["spatial_weight"]) + total_llf = hmrf_log_likelihood( + config["nodepotential"], + single_X, + single_base_nb_mean, + single_total_bb_RD, + res_combine, + np.argmax(res_combine["log_gamma"], axis=0), + smooth_mat, + adjacency_mat, + res_combine["new_assignment"], + config["spatial_weight"], + ) res_combine["total_llf"] = total_llf # save results - np.savez(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", **res_combine) - + np.savez( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", **res_combine + ) def simplify_parameters(res, params="smp", bafthreshold=0.05, rdrthreshold=0.1): n_states = res["new_p_binom"].shape[0] G = nx.Graph() - G.add_nodes_from( np.arange(n_states) ) - mAF = np.where(res["new_p_binom"].flatten() < 0.5, res["new_p_binom"].flatten(), 1-res["new_p_binom"].flatten()) + G.add_nodes_from(np.arange(n_states)) + mAF = np.where( + res["new_p_binom"].flatten() < 0.5, + res["new_p_binom"].flatten(), + 1 - res["new_p_binom"].flatten(), + ) if "m" in params and "p" in params: - tmp_edge_graph = (np.abs( res["new_log_mu"].flatten().reshape(-1,1) - res["new_log_mu"].flatten().reshape(1,-1) ) < rdrthreshold) & (np.abs( mAF.reshape(-1,1) - mAF.reshape(1,-1) ) < bafthreshold) + tmp_edge_graph = ( + np.abs( + res["new_log_mu"].flatten().reshape(-1, 1) + - res["new_log_mu"].flatten().reshape(1, -1) + ) + < rdrthreshold + ) & (np.abs(mAF.reshape(-1, 1) - mAF.reshape(1, -1)) < bafthreshold) elif "m" in params: - tmp_edge_graph = (np.abs( res["new_log_mu"].flatten().reshape(-1,1) - res["new_log_mu"].flatten().reshape(1,-1) ) < rdrthreshold) + tmp_edge_graph = ( + np.abs( + res["new_log_mu"].flatten().reshape(-1, 1) + - res["new_log_mu"].flatten().reshape(1, -1) + ) + < rdrthreshold + ) else: - 
tmp_edge_graph = (np.abs( mAF.reshape(-1,1) - mAF.reshape(1,-1) ) < bafthreshold) - G.add_edges_from([ (i,j) for i in range(tmp_edge_graph.shape[0]) for j in range(tmp_edge_graph.shape[1]) if tmp_edge_graph[i,j] ]) + tmp_edge_graph = np.abs(mAF.reshape(-1, 1) - mAF.reshape(1, -1)) < bafthreshold + G.add_edges_from( + [ + (i, j) + for i in range(tmp_edge_graph.shape[0]) + for j in range(tmp_edge_graph.shape[1]) + if tmp_edge_graph[i, j] + ] + ) # maximal cliques cliques = [] for x in nx.find_cliques(G): this_len = len(x) - cliques.append( (x, this_len) ) - cliques.sort(key = lambda x:(-x[1]) ) + cliques.append((x, this_len)) + cliques.sort(key=lambda x: (-x[1])) covered_states = set() merging_state_groups = [] for c in cliques: if len(set(c[0]) & covered_states) == 0: - merging_state_groups.append( list(c[0]) ) + merging_state_groups.append(list(c[0])) covered_states = covered_states | set(c[0]) for c in range(n_states): if not (c in covered_states): - merging_state_groups.append( [c] ) + merging_state_groups.append([c]) covered_states.add(c) - merging_state_groups.sort(key = lambda x:np.min(x)) + merging_state_groups.sort(key=lambda x: np.min(x)) # merged parameters - simplied_res = {"new_log_mu":np.array([ np.mean(res["new_log_mu"].flatten()[idx]) for idx in merging_state_groups]).reshape(-1,1), \ - "new_p_binom":np.array([ np.mean(res["new_p_binom"].flatten()[idx]) for idx in merging_state_groups]).reshape(-1,1), \ - "new_alphas":np.array([ np.mean(res["new_alphas"].flatten()[idx]) for idx in merging_state_groups]).reshape(-1,1), \ - "new_taus":np.array([ np.mean(res["new_taus"].flatten()[idx]) for idx in merging_state_groups]).reshape(-1,1)} + simplied_res = { + "new_log_mu": np.array( + [np.mean(res["new_log_mu"].flatten()[idx]) for idx in merging_state_groups] + ).reshape(-1, 1), + "new_p_binom": np.array( + [np.mean(res["new_p_binom"].flatten()[idx]) for idx in merging_state_groups] + ).reshape(-1, 1), + "new_alphas": np.array( + [np.mean(res["new_alphas"].flatten()[idx]) for idx in merging_state_groups] + ).reshape(-1, 1), + "new_taus": np.array( + [np.mean(res["new_taus"].flatten()[idx]) for idx in merging_state_groups] + ).reshape(-1, 1), + } return simplied_res def similarity_components_baf(baf_profiles, res, topk=10, threshold=0.05): n_clones = baf_profiles.shape[0] - adj_baf_profiles = np.where(baf_profiles > 0.5, 1-baf_profiles, baf_profiles) + adj_baf_profiles = np.where(baf_profiles > 0.5, 1 - baf_profiles, baf_profiles) G = nx.Graph() - G.add_nodes_from( np.arange(n_clones) ) + G.add_nodes_from(np.arange(n_clones)) for c1 in range(n_clones): - for c2 in range(c1+1, n_clones): - diff = np.sort(np.abs(baf_profiles[c1,:] - baf_profiles[c2,:]))[::-1][topk] - adj_diff = np.sort(np.abs(adj_baf_profiles[c1,:] - adj_baf_profiles[c2,:]))[::-1][topk] - if diff < 2*threshold and adj_diff < threshold: + for c2 in range(c1 + 1, n_clones): + diff = np.sort(np.abs(baf_profiles[c1, :] - baf_profiles[c2, :]))[::-1][ + topk + ] + adj_diff = np.sort( + np.abs(adj_baf_profiles[c1, :] - adj_baf_profiles[c2, :]) + )[::-1][topk] + if diff < 2 * threshold and adj_diff < threshold: G.add_edge(c1, c2) print(c1, c2, diff) merging_groups = [cc for cc in nx.connected_components(G)] - merging_groups.sort(key = lambda x:np.min(x)) + merging_groups.sort(key=lambda x: np.min(x)) # clone assignment after merging map_clone_id = {} - for i,x in enumerate(merging_groups): + for i, x in enumerate(merging_groups): for z in x: map_clone_id[z] = i new_assignment = np.array([map_clone_id[x] for x in 
res["new_assignment"]]) @@ -365,24 +800,36 @@ def similarity_components_baf(baf_profiles, res, topk=10, threshold=0.05): return merging_groups, merged_res -def similarity_components_rdrbaf(baf_profiles, rdr_profiles, res, topk=10, bafthreshold=0.05, rdrthreshold=0.1): -# def similarity_components_rdrbaf(baf_profiles, rdr_profiles, res, topk=10, bafthreshold=0.05, rdrthreshold=0.15): +def similarity_components_rdrbaf( + baf_profiles, rdr_profiles, res, topk=10, bafthreshold=0.05, rdrthreshold=0.1 +): + # def similarity_components_rdrbaf(baf_profiles, rdr_profiles, res, topk=10, bafthreshold=0.05, rdrthreshold=0.15): n_clones = baf_profiles.shape[0] - adj_baf_profiles = np.where(baf_profiles > 0.5, 1-baf_profiles, baf_profiles) + adj_baf_profiles = np.where(baf_profiles > 0.5, 1 - baf_profiles, baf_profiles) G = nx.Graph() - G.add_nodes_from( np.arange(n_clones) ) + G.add_nodes_from(np.arange(n_clones)) for c1 in range(n_clones): - for c2 in range(c1+1, n_clones): - baf_diff = np.sort(np.abs(baf_profiles[c1,:] - baf_profiles[c2,:]))[::-1][topk] - baf_adj_diff = np.sort(np.abs(adj_baf_profiles[c1,:] - adj_baf_profiles[c2,:]))[::-1][topk] - rdr_diff = np.sort(np.abs(rdr_profiles[c1,:] - rdr_profiles[c2,:]))[::-1][topk] - if baf_diff < 2*bafthreshold and baf_adj_diff < bafthreshold and rdr_diff < rdrthreshold: + for c2 in range(c1 + 1, n_clones): + baf_diff = np.sort(np.abs(baf_profiles[c1, :] - baf_profiles[c2, :]))[::-1][ + topk + ] + baf_adj_diff = np.sort( + np.abs(adj_baf_profiles[c1, :] - adj_baf_profiles[c2, :]) + )[::-1][topk] + rdr_diff = np.sort(np.abs(rdr_profiles[c1, :] - rdr_profiles[c2, :]))[::-1][ + topk + ] + if ( + baf_diff < 2 * bafthreshold + and baf_adj_diff < bafthreshold + and rdr_diff < rdrthreshold + ): G.add_edge(c1, c2) merging_groups = [cc for cc in nx.connected_components(G)] - merging_groups.sort(key = lambda x:np.min(x)) + merging_groups.sort(key=lambda x: np.min(x)) # clone assignment after merging map_clone_id = {} - for i,x in enumerate(merging_groups): + for i, x in enumerate(merging_groups): for z in x: map_clone_id[z] = i new_assignment = np.array([map_clone_id[x] for x in res["new_assignment"]]) @@ -392,81 +839,166 @@ def similarity_components_rdrbaf(baf_profiles, rdr_profiles, res, topk=10, bafth return merging_groups, merged_res -def initialization_rdr_bybaf(n_states, X, base_nb_mean, total_bb_RD, params, prior_p_binom, random_state=None, in_log_space=True): - tmp_log_mu, tmp_p_binom = initialization_by_gmm(n_states, X, base_nb_mean, total_bb_RD, params, random_state=random_state, in_log_space=in_log_space, min_binom_prob=0, max_binom_prob=1) +def initialization_rdr_bybaf( + n_states, + X, + base_nb_mean, + total_bb_RD, + params, + prior_p_binom, + random_state=None, + in_log_space=True, +): + tmp_log_mu, tmp_p_binom = initialization_by_gmm( + n_states, + X, + base_nb_mean, + total_bb_RD, + params, + random_state=random_state, + in_log_space=in_log_space, + min_binom_prob=0, + max_binom_prob=1, + ) prior_log_mu = np.zeros(prior_p_binom.shape) - for i,x in enumerate(prior_p_binom): - idx_nearest = np.argmin( scipy.spatial.distance.cdist(x.reshape(-1,1), tmp_p_binom) ) + for i, x in enumerate(prior_p_binom): + idx_nearest = np.argmin( + scipy.spatial.distance.cdist(x.reshape(-1, 1), tmp_p_binom) + ) prior_log_mu[i] = tmp_log_mu[idx_nearest] return prior_log_mu - def output_integer_CN(): ##### infer integer copy ##### - res_combine = dict(np.load(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True)) + res_combine = dict( + np.load( 
+ f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) n_final_clone = len(np.unique(res_combine["new_assignment"])) medfix = ["", "_diploid", "_triploid", "_tetraploid"] - for o,max_medploidy in enumerate([None, 2, 3, 4]): + for o, max_medploidy in enumerate([None, 2, 3, 4]): # A/B copy number per bin A_copy = np.zeros((n_final_clone, n_obs), dtype=int) B_copy = np.zeros((n_final_clone, n_obs), dtype=int) # A/B copy number per state - state_A_copy = np.zeros((n_final_clone, config['n_states']), dtype=int) - state_B_copy = np.zeros((n_final_clone, config['n_states']), dtype=int) + state_A_copy = np.zeros((n_final_clone, config["n_states"]), dtype=int) + state_B_copy = np.zeros((n_final_clone, config["n_states"]), dtype=int) df_genelevel_cnv = None if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res_combine["new_assignment"]==c)[0] for c in range(n_final_clone)]) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res_combine["new_assignment"] == c)[0] + for c in range(n_final_clone) + ], + ) else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, [np.where(res_combine["new_assignment"]==c)[0] for c in range(n_final_clone)], single_tumor_prop) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + [ + np.where(res_combine["new_assignment"] == c)[0] + for c in range(n_final_clone) + ], + single_tumor_prop, + ) for s in range(n_final_clone): # adjust log_mu such that sum_bin lambda * np.exp(log_mu) = 1 - lambd = base_nb_mean[:,s] / np.sum(base_nb_mean[:,s]) - this_pred_cnv = res_combine["pred_cnv"][:,s] - adjusted_log_mu = np.log( np.exp(res_combine["new_log_mu"][:,s]) / np.sum(np.exp(res_combine["new_log_mu"][this_pred_cnv,s]) * lambd) ) + lambd = base_nb_mean[:, s] / np.sum(base_nb_mean[:, s]) + this_pred_cnv = res_combine["pred_cnv"][:, s] + adjusted_log_mu = np.log( + np.exp(res_combine["new_log_mu"][:, s]) + / np.sum(np.exp(res_combine["new_log_mu"][this_pred_cnv, s]) * lambd) + ) if not max_medploidy is None: - best_integer_copies, _ = hill_climbing_integer_copynumber_oneclone(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv, max_medploidy=max_medploidy) + best_integer_copies, _ = hill_climbing_integer_copynumber_oneclone( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + max_medploidy=max_medploidy, + ) else: - best_integer_copies, _ = hill_climbing_integer_copynumber_oneclone(adjusted_log_mu, base_nb_mean[:,s], res_combine["new_p_binom"][:,s], this_pred_cnv) - print(f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}") - - A_copy[s,:] = best_integer_copies[res_combine["pred_cnv"][:,s], 0] - B_copy[s,:] = best_integer_copies[res_combine["pred_cnv"][:,s], 1] - state_A_copy[s,:] = best_integer_copies[:,0] - state_B_copy[s,:] = best_integer_copies[:,1] - tmpdf = get_genelevel_cnv_oneclone(best_integer_copies[res_combine["pred_cnv"][:,s], 0], best_integer_copies[res_combine["pred_cnv"][:,s], 1], x_gene_list) + best_integer_copies, _ = hill_climbing_integer_copynumber_oneclone( + adjusted_log_mu, + base_nb_mean[:, s], + res_combine["new_p_binom"][:, s], + this_pred_cnv, + ) + print( + f"max med ploidy = 
{max_medploidy}, clone {s}, integer copy inference loss = {_}" + ) + + A_copy[s, :] = best_integer_copies[res_combine["pred_cnv"][:, s], 0] + B_copy[s, :] = best_integer_copies[res_combine["pred_cnv"][:, s], 1] + state_A_copy[s, :] = best_integer_copies[:, 0] + state_B_copy[s, :] = best_integer_copies[:, 1] + tmpdf = get_genelevel_cnv_oneclone( + best_integer_copies[res_combine["pred_cnv"][:, s], 0], + best_integer_copies[res_combine["pred_cnv"][:, s], 1], + x_gene_list, + ) tmpdf.columns = [f"clone{s} A", f"clone{s} B"] if df_genelevel_cnv is None: df_genelevel_cnv = copy.copy(tmpdf) else: df_genelevel_cnv = df_genelevel_cnv.join(tmpdf) # output gene-level copy number - df_genelevel_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_genelevel.tsv", header=True, index=True, sep="\t") + df_genelevel_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_genelevel.tsv", header=True, index=True, sep="\t" + ) # output segment-level copy number - df_seglevel_cnv = pd.DataFrame({"CHR":[x[0] for x in sorted_chr_pos], "START":[x[1] for x in sorted_chr_pos], \ - "END":[ (sorted_chr_pos[i+1][1] if i+1 < len(sorted_chr_pos) and x[0]==sorted_chr_pos[i+1][0] else -1) for i,x in enumerate(sorted_chr_pos)] }) + df_seglevel_cnv = pd.DataFrame( + { + "CHR": [x[0] for x in sorted_chr_pos], + "START": [x[1] for x in sorted_chr_pos], + "END": [ + ( + sorted_chr_pos[i + 1][1] + if i + 1 < len(sorted_chr_pos) + and x[0] == sorted_chr_pos[i + 1][0] + else -1 + ) + for i, x in enumerate(sorted_chr_pos) + ], + } + ) for s in range(n_final_clone): - df_seglevel_cnv[f"clone{s} A"] = A_copy[s,:] - df_seglevel_cnv[f"clone{s} B"] = B_copy[s,:] - df_seglevel_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_seglevel.tsv", header=True, index=False, sep="\t") + df_seglevel_cnv[f"clone{s} A"] = A_copy[s, :] + df_seglevel_cnv[f"clone{s} B"] = B_copy[s, :] + df_seglevel_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_seglevel.tsv", header=True, index=False, sep="\t" + ) # output per-state copy number df_state_cnv = {} for s in range(n_final_clone): - df_state_cnv[f"clone{s} logmu"] = res_combine["new_log_mu"][:,s] - df_state_cnv[f"clone{s} p"] = res_combine["new_p_binom"][:,s] - df_state_cnv[f"clone{s} A"] = state_A_copy[s,:] - df_state_cnv[f"clone{s} B"] = state_B_copy[s,:] + df_state_cnv[f"clone{s} logmu"] = res_combine["new_log_mu"][:, s] + df_state_cnv[f"clone{s} p"] = res_combine["new_p_binom"][:, s] + df_state_cnv[f"clone{s} A"] = state_A_copy[s, :] + df_state_cnv[f"clone{s} B"] = state_B_copy[s, :] df_state_cnv = pd.DataFrame.from_dict(df_state_cnv) - df_state_cnv.to_csv(f"{outdir}/cnv{medfix[o]}_perstate.tsv", header=True, index=False, sep="\t") - + df_state_cnv.to_csv( + f"{outdir}/cnv{medfix[o]}_perstate.tsv", header=True, index=False, sep="\t" + ) + ##### output clone label ##### adata.obs["clone_label"] = res_combine["new_assignment"] if config["tumorprop_file"] is None: - adata.obs[["clone_label"]].to_csv(f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t") + adata.obs[["clone_label"]].to_csv( + f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t" + ) else: - adata.obs[["tumor_proportion", "clone_label"]].to_csv(f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t") + adata.obs[["tumor_proportion", "clone_label"]].to_csv( + f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t" + ) def set_bin_exp_to_zero(): @@ -475,12 +1007,18 @@ def set_bin_exp_to_zero(): N_STEP = 2 multi_step_smooth = copy.copy(smooth_mat) for _ in range(N_STEP): - multi_step_smooth = (multi_step_smooth + multi_step_smooth @ smooth_mat) + 
multi_step_smooth = multi_step_smooth + multi_step_smooth @ smooth_mat multi_step_smooth = (multi_step_smooth > 0).astype(int) - rdr = (copy_single_X_rdr @ multi_step_smooth) / (copy_single_base_nb_mean @ multi_step_smooth) - rdr[np.sum(copy_single_base_nb_mean,axis=1) == 0] = 0 - bidx_inconfident = np.where(~np.all(rdr <= MAX_RDR, axis=1))[0] + rdr = (copy_single_X_rdr @ multi_step_smooth) / ( + copy_single_base_nb_mean @ multi_step_smooth + ) + rdr[np.sum(copy_single_base_nb_mean, axis=1) == 0] = 0 + bidx_inconfident = np.where(~np.all(rdr <= MAX_RDR, axis=1))[0] rdr_normal[bidx_inconfident] = 0 rdr_normal = rdr_normal / np.sum(rdr_normal) - copy_single_X_rdr[bidx_inconfident, :] = 0 # avoid ill-defined distributions if normal has 0 count in that bin. - copy_single_base_nb_mean = rdr_normal.reshape(-1,1) @ np.sum(copy_single_X_rdr, axis=0).reshape(1,-1) + copy_single_X_rdr[bidx_inconfident, :] = ( + 0 # avoid ill-defined distributions if normal has 0 count in that bin. + ) + copy_single_base_nb_mean = rdr_normal.reshape(-1, 1) @ np.sum( + copy_single_X_rdr, axis=0 + ).reshape(1, -1) diff --git a/src/calicost/parse_input.py b/src/calicost/parse_input.py index 9bdd862..2585923 100644 --- a/src/calicost/parse_input.py +++ b/src/calicost/parse_input.py @@ -7,7 +7,12 @@ import scanpy as sc import anndata import logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) logger = logging.getLogger() import copy from pathlib import Path @@ -20,21 +25,40 @@ def genesnp_to_bininfo(df_gene_snp): - table_bininfo = df_gene_snp[~df_gene_snp.bin_id.isnull()].groupby('bin_id').agg({"CHR":'first', 'START':'first', 'END':'last', 'gene':set, 'snp_id':set}).reset_index() - table_bininfo['ARM'] = '.' - table_bininfo['INCLUDED_GENES'] = [ " ".join([x for x in y if not x is None]) for y in table_bininfo.gene.values ] - table_bininfo['INCLUDED_SNP_IDS'] = [ " ".join([x for x in y if not x is None]) for y in table_bininfo.snp_id.values ] - table_bininfo['NORMAL_COUNT'] = np.nan - table_bininfo['N_SNPS'] = [ len([x for x in y if not x is None]) for y in table_bininfo.snp_id.values ] + table_bininfo = ( + df_gene_snp[~df_gene_snp.bin_id.isnull()] + .groupby("bin_id") + .agg( + { + "CHR": "first", + "START": "first", + "END": "last", + "gene": set, + "snp_id": set, + } + ) + .reset_index() + ) + table_bininfo["ARM"] = "." + table_bininfo["INCLUDED_GENES"] = [ + " ".join([x for x in y if not x is None]) for y in table_bininfo.gene.values + ] + table_bininfo["INCLUDED_SNP_IDS"] = [ + " ".join([x for x in y if not x is None]) for y in table_bininfo.snp_id.values + ] + table_bininfo["NORMAL_COUNT"] = np.nan + table_bininfo["N_SNPS"] = [ + len([x for x in y if not x is None]) for y in table_bininfo.snp_id.values + ] # drop the set columns - table_bininfo.drop(columns=['gene', 'snp_id'], inplace=True) + table_bininfo.drop(columns=["gene", "snp_id"], inplace=True) return table_bininfo def parse_visium(config): """ Read multiple 10X Visium SRT samples and SNP data and generate tables with counts and meta info. - + Attributes: ---------- config : dictionary @@ -61,18 +85,41 @@ def parse_visium(config): KNN smoothing matrix. 
""" if "input_filelist" in config: - adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, across_slice_adjacency_mat = load_joint_data(config["input_filelist"], config["snp_dir"], config["alignment_files"], config["filtergenelist_file"], config["filterregion_file"], config["normalidx_file"], config['min_snpumi_perspot'], config['min_percent_expressed_spots']) + ( + adata, + cell_snp_Aallele, + cell_snp_Ballele, + unique_snp_ids, + across_slice_adjacency_mat, + ) = load_joint_data( + config["input_filelist"], + config["snp_dir"], + config["alignment_files"], + config["filtergenelist_file"], + config["filterregion_file"], + config["normalidx_file"], + config["min_snpumi_perspot"], + config["min_percent_expressed_spots"], + ) sample_list = [adata.obs["sample"][0]] for i in range(1, adata.shape[0]): if adata.obs["sample"][i] != sample_list[-1]: - sample_list.append( adata.obs["sample"][i] ) + sample_list.append(adata.obs["sample"][i]) # convert sample name to index sample_ids = np.zeros(adata.shape[0], dtype=int) - for s,sname in enumerate(sample_list): + for s, sname in enumerate(sample_list): index = np.where(adata.obs["sample"] == sname)[0] sample_ids[index] = s else: - adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids = load_data(config["spaceranger_dir"], config["snp_dir"], config["filtergenelist_file"], config["filterregion_file"], config["normalidx_file"], config['min_snpumi_perspot'], config['min_percent_expressed_spots']) + adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids = load_data( + config["spaceranger_dir"], + config["snp_dir"], + config["filtergenelist_file"], + config["filterregion_file"], + config["normalidx_file"], + config["min_snpumi_perspot"], + config["min_percent_expressed_spots"], + ) adata.obs["sample"] = "unique_sample" sample_list = [adata.obs["sample"][0]] sample_ids = np.zeros(adata.shape[0], dtype=int) @@ -81,38 +128,108 @@ def parse_visium(config): coords = adata.obsm["X_pos"] if not config["tumorprop_file"] is None: - df_tumorprop = pd.read_csv(config["tumorprop_file"], sep="\t", header=0, index_col=0) + df_tumorprop = pd.read_csv( + config["tumorprop_file"], sep="\t", header=0, index_col=0 + ) df_tumorprop = df_tumorprop[["Tumor"]] df_tumorprop.columns = ["tumor_proportion"] adata.obs = adata.obs.join(df_tumorprop) single_tumor_prop = adata.obs["tumor_proportion"] else: single_tumor_prop = None - + # read original data - df_gene_snp = combine_gene_snps(unique_snp_ids, config['hgtable_file'], adata) - df_gene_snp = create_haplotype_block_ranges(df_gene_snp, adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids) - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_blocks(df_gene_snp, \ - adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) + df_gene_snp = combine_gene_snps(unique_snp_ids, config["hgtable_file"], adata) + df_gene_snp = create_haplotype_block_ranges( + df_gene_snp, adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids + ) + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + ) = summarize_counts_for_blocks( + df_gene_snp, + adata, + cell_snp_Aallele, + cell_snp_Ballele, + unique_snp_ids, + nu=config["nu"], + logphase_shift=config["logphase_shift"], + geneticmap_file=config["geneticmap_file"], + ) # infer an initial phase using pseudobulk if not Path(f"{config['output_dir']}/initial_phase.npz").exists(): - 
initial_clone_for_phasing = perform_partition(coords, sample_ids, x_part=config["npart_phasing"], y_part=config["npart_phasing"], single_tumor_prop=single_tumor_prop, threshold=config["tumorprop_threshold"]) - phase_indicator, refined_lengths = initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_for_phasing, 5, log_sitewise_transmat, \ - "sp", config["t_phaseing"], config["gmm_random_state"], config["fix_NB_dispersion"], config["shared_NB_dispersion"], config["fix_BB_dispersion"], config["shared_BB_dispersion"], 30, 1e-3, threshold=config["tumorprop_threshold"]) - np.savez(f"{config['output_dir']}/initial_phase.npz", phase_indicator=phase_indicator, refined_lengths=refined_lengths) + initial_clone_for_phasing = perform_partition( + coords, + sample_ids, + x_part=config["npart_phasing"], + y_part=config["npart_phasing"], + single_tumor_prop=single_tumor_prop, + threshold=config["tumorprop_threshold"], + ) + phase_indicator, refined_lengths = initial_phase_given_partition( + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + initial_clone_for_phasing, + 5, + log_sitewise_transmat, + "sp", + config["t_phaseing"], + config["gmm_random_state"], + config["fix_NB_dispersion"], + config["shared_NB_dispersion"], + config["fix_BB_dispersion"], + config["shared_BB_dispersion"], + 30, + 1e-3, + threshold=config["tumorprop_threshold"], + ) + np.savez( + f"{config['output_dir']}/initial_phase.npz", + phase_indicator=phase_indicator, + refined_lengths=refined_lengths, + ) # map phase indicator to individual snps - df_gene_snp['phase'] = np.where(df_gene_snp.snp_id.isnull(), None, df_gene_snp.block_id.map({i:x for i,x in enumerate(phase_indicator)}) ) + df_gene_snp["phase"] = np.where( + df_gene_snp.snp_id.isnull(), + None, + df_gene_snp.block_id.map({i: x for i, x in enumerate(phase_indicator)}), + ) else: tmp = dict(np.load(f"{config['output_dir']}/initial_phase.npz")) - phase_indicator, refined_lengths = tmp["phase_indicator"], tmp["refined_lengths"] + phase_indicator, refined_lengths = ( + tmp["phase_indicator"], + tmp["refined_lengths"], + ) # binning - df_gene_snp = create_bin_ranges(df_gene_snp, single_total_bb_RD, refined_lengths, config['secondary_min_umi']) - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_bins(df_gene_snp, \ - adata, single_X, single_total_bb_RD, phase_indicator, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) + df_gene_snp = create_bin_ranges( + df_gene_snp, single_total_bb_RD, refined_lengths, config["secondary_min_umi"] + ) + ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + ) = summarize_counts_for_bins( + df_gene_snp, + adata, + single_X, + single_total_bb_RD, + phase_indicator, + nu=config["nu"], + logphase_shift=config["logphase_shift"], + geneticmap_file=config["geneticmap_file"], + ) # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps = perform_binning_new(lengths, single_X, \ # single_base_nb_mean, single_total_bb_RD, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps, phase_indicator, refined_lengths, config["binsize"], config["rdrbinsize"], config["nu"], config["logphase_shift"], secondary_min_umi=secondary_min_umi) - + # # remove bins where normal spots have imbalanced SNPs # if not config["tumorprop_file"] is 
None: # for prop_threshold in np.arange(0, 0.6, 0.05): @@ -122,24 +239,41 @@ def parse_visium(config): # index_normal = np.where(normal_candidate)[0] # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \ # single_X, single_base_nb_mean, single_total_bb_RD, config["nu"], config["logphase_shift"], index_normal, config['geneticmap_file']) - # assert np.sum(lengths) == single_X.shape[0] + # assert np.sum(lengths) == single_X.shape[0] # assert single_X.shape[0] == single_total_bb_RD.shape[0] # assert single_X.shape[0] == len(log_sitewise_transmat) # expression count dataframe - exp_counts = pd.DataFrame.sparse.from_spmatrix( scipy.sparse.csc_matrix(adata.layers["count"]), index=adata.obs.index, columns=adata.var.index) + exp_counts = pd.DataFrame.sparse.from_spmatrix( + scipy.sparse.csc_matrix(adata.layers["count"]), + index=adata.obs.index, + columns=adata.var.index, + ) # smooth and adjacency matrix for each sample - adjacency_mat, smooth_mat = multislice_adjacency(sample_ids, sample_list, coords, single_total_bb_RD, exp_counts, - across_slice_adjacency_mat, construct_adjacency_method=config['construct_adjacency_method'], - maxspots_pooling=config['maxspots_pooling'], construct_adjacency_w=config['construct_adjacency_w']) + adjacency_mat, smooth_mat = multislice_adjacency( + sample_ids, + sample_list, + coords, + single_total_bb_RD, + exp_counts, + across_slice_adjacency_mat, + construct_adjacency_method=config["construct_adjacency_method"], + maxspots_pooling=config["maxspots_pooling"], + construct_adjacency_w=config["construct_adjacency_w"], + ) n_pooled = np.median(np.sum(smooth_mat > 0, axis=0).A.flatten()) print(f"Set up number of spots to pool in HMRF: {n_pooled}") # If adjacency matrix is only constructed using gene expression similarity (e.g. 
scRNA-seq data) # Then, directly replace coords by the umap of gene expression, to avoid potential inconsistency in HMRF initialization - if config["construct_adjacency_method"] == "KNN" and config["construct_adjacency_w"] == 0: - sc.pp.normalize_total(adata, target_sum=np.median(np.sum(exp_counts.values,axis=1)) ) + if ( + config["construct_adjacency_method"] == "KNN" + and config["construct_adjacency_w"] == 0 + ): + sc.pp.normalize_total( + adata, target_sum=np.median(np.sum(exp_counts.values, axis=1)) + ) sc.pp.log1p(adata) sc.tl.pca(adata) sc.pp.neighbors(adata) @@ -148,35 +282,83 @@ def parse_visium(config): # create RDR-BAF table table_bininfo = genesnp_to_bininfo(df_gene_snp) - table_bininfo['LOG_PHASE_TRANSITION'] = log_sitewise_transmat + table_bininfo["LOG_PHASE_TRANSITION"] = log_sitewise_transmat table_rdrbaf = [] for i in range(single_X.shape[2]): - table_rdrbaf.append( pd.DataFrame({"BARCODES":adata.obs.index[i], "EXP":single_X[:,0,i], "TOT":single_total_bb_RD[:,i], "B":single_X[:,1,i]}) ) + table_rdrbaf.append( + pd.DataFrame( + { + "BARCODES": adata.obs.index[i], + "EXP": single_X[:, 0, i], + "TOT": single_total_bb_RD[:, i], + "B": single_X[:, 1, i], + } + ) + ) table_rdrbaf = pd.concat(table_rdrbaf, ignore_index=True) # create meta info table # note that table_meta.BARCODES is equal to the unique ones of table_rdrbaf.BARCODES in the original order - table_meta = pd.DataFrame({"BARCODES":adata.obs.index, "SAMPLE":adata.obs["sample"], "X":coords[:,0], "Y":coords[:,1]}) + table_meta = pd.DataFrame( + { + "BARCODES": adata.obs.index, + "SAMPLE": adata.obs["sample"], + "X": coords[:, 0], + "Y": coords[:, 1], + } + ) if not single_tumor_prop is None: table_meta["TUMOR_PROPORTION"] = single_tumor_prop - - return table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat, df_gene_snp + + return ( + table_bininfo, + table_rdrbaf, + table_meta, + exp_counts, + adjacency_mat, + smooth_mat, + df_gene_snp, + ) def load_tables_to_matrices(config): """ Load tables and adjacency from parse_visium_joint or parse_visium_single, and convert to HMM input matrices. 
""" - table_bininfo = pd.read_csv(f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", header=0, index_col=None, sep="\t") - table_rdrbaf = pd.read_csv(f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", header=0, index_col=None, sep="\t") - table_meta = pd.read_csv(f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", header=0, index_col=None, sep="\t") - adjacency_mat = scipy.sparse.load_npz( f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz" ) - smooth_mat = scipy.sparse.load_npz( f"{config['output_dir']}/parsed_inputs/smooth_mat.npz" ) + table_bininfo = pd.read_csv( + f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", + header=0, + index_col=None, + sep="\t", + ) + table_rdrbaf = pd.read_csv( + f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", + header=0, + index_col=None, + sep="\t", + ) + table_meta = pd.read_csv( + f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", + header=0, + index_col=None, + sep="\t", + ) + adjacency_mat = scipy.sparse.load_npz( + f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz" + ) + smooth_mat = scipy.sparse.load_npz( + f"{config['output_dir']}/parsed_inputs/smooth_mat.npz" + ) # - df_gene_snp = pd.read_csv(f"{config['output_dir']}/parsed_inputs/gene_snp_info.csv.gz", header=0, index_col=None, sep="\t") + df_gene_snp = pd.read_csv( + f"{config['output_dir']}/parsed_inputs/gene_snp_info.csv.gz", + header=0, + index_col=None, + sep="\t", + ) df_gene_snp = df_gene_snp.replace(np.nan, None) - + n_spots = table_meta.shape[0] n_bins = table_bininfo.shape[0] @@ -187,18 +369,26 @@ def load_tables_to_matrices(config): single_X[:, 1, :] = table_rdrbaf["B"].values.reshape((n_bins, n_spots), order="F") # construct single_base_nb_mean, lengths - single_base_nb_mean = table_bininfo["NORMAL_COUNT"].values.reshape(-1,1) / np.sum(table_bininfo["NORMAL_COUNT"].values) @ np.sum(single_X[:,0,:], axis=0).reshape(1,-1) + single_base_nb_mean = ( + table_bininfo["NORMAL_COUNT"].values.reshape(-1, 1) + / np.sum(table_bininfo["NORMAL_COUNT"].values) + @ np.sum(single_X[:, 0, :], axis=0).reshape(1, -1) + ) # construct single_total_bb_RD - single_total_bb_RD = table_rdrbaf["TOT"].values.reshape((n_bins, n_spots), order="F") + single_total_bb_RD = table_rdrbaf["TOT"].values.reshape( + (n_bins, n_spots), order="F" + ) # construct log_sitewise_transmat log_sitewise_transmat = table_bininfo["LOG_PHASE_TRANSITION"].values # construct bin info and lengths and x_gene_list df_bininfo = table_bininfo - lengths = np.array([ np.sum(table_bininfo.CHR == c) for c in df_bininfo.CHR.unique() ]) - + lengths = np.array( + [np.sum(table_bininfo.CHR == c) for c in df_bininfo.CHR.unique()] + ) + # construct barcodes barcodes = table_meta["BARCODES"] @@ -206,49 +396,109 @@ def load_tables_to_matrices(config): coords = table_meta[["X", "Y"]].values # construct single_tumor_prop - single_tumor_prop = table_meta["TUMOR_PROPORTION"].values if "TUMOR_PROPORTION" in table_meta.columns else None + single_tumor_prop = ( + table_meta["TUMOR_PROPORTION"].values + if "TUMOR_PROPORTION" in table_meta.columns + else None + ) # construct sample_list and sample_ids sample_list = [table_meta["SAMPLE"].values[0]] for i in range(1, table_meta.shape[0]): if table_meta["SAMPLE"].values[i] != sample_list[-1]: - sample_list.append( table_meta["SAMPLE"].values[i] ) + sample_list.append(table_meta["SAMPLE"].values[i]) sample_ids = np.zeros(table_meta.shape[0], dtype=int) - for s,sname in enumerate(sample_list): + for s, sname in enumerate(sample_list): index = 
np.where(table_meta["SAMPLE"].values == sname)[0] sample_ids[index] = s # expression UMI count matrix - exp_counts = pd.read_pickle( f"{config['output_dir']}/parsed_inputs/exp_counts.pkl" ) - - return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ - barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts + exp_counts = pd.read_pickle(f"{config['output_dir']}/parsed_inputs/exp_counts.pkl") + + return ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + df_bininfo, + df_gene_snp, + barcodes, + coords, + single_tumor_prop, + sample_list, + sample_ids, + adjacency_mat, + smooth_mat, + exp_counts, + ) def run_parse_n_load(config): - file_exists = np.array([ Path(f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz").exists(), \ - Path(f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz").exists(), \ - Path(f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz").exists(), \ - Path(f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz").exists(), \ - Path(f"{config['output_dir']}/parsed_inputs/smooth_mat.npz").exists(), \ - Path(f"{config['output_dir']}/parsed_inputs/exp_counts.pkl").exists() ]) + file_exists = np.array( + [ + Path(f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz").exists(), + Path(f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz").exists(), + Path(f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz").exists(), + Path(f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz").exists(), + Path(f"{config['output_dir']}/parsed_inputs/smooth_mat.npz").exists(), + Path(f"{config['output_dir']}/parsed_inputs/exp_counts.pkl").exists(), + ] + ) if not np.all(file_exists): # process to tables - table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat, df_gene_snp = parse_visium(config) + ( + table_bininfo, + table_rdrbaf, + table_meta, + exp_counts, + adjacency_mat, + smooth_mat, + df_gene_snp, + ) = parse_visium(config) # table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat = parse_hatchetblock(config, cellsnplite_dir, bb_file) # save file - p = subprocess.Popen(f"mkdir -p {config['output_dir']}/parsed_inputs", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - out,err = p.communicate() - - table_bininfo.to_csv( f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", header=True, index=False, sep="\t" ) - table_rdrbaf.to_csv( f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", header=True, index=False, sep="\t" ) - table_meta.to_csv( f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", header=True, index=False, sep="\t" ) - exp_counts.to_pickle( f"{config['output_dir']}/parsed_inputs/exp_counts.pkl" ) - scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz", adjacency_mat ) - scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/smooth_mat.npz", smooth_mat ) + p = subprocess.Popen( + f"mkdir -p {config['output_dir']}/parsed_inputs", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + ) + out, err = p.communicate() + + table_bininfo.to_csv( + f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", + header=True, + index=False, + sep="\t", + ) + table_rdrbaf.to_csv( + f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", + header=True, + index=False, + sep="\t", + ) + table_meta.to_csv( + f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", + 
header=True, + index=False, + sep="\t", + ) + exp_counts.to_pickle(f"{config['output_dir']}/parsed_inputs/exp_counts.pkl") + scipy.sparse.save_npz( + f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz", adjacency_mat + ) + scipy.sparse.save_npz( + f"{config['output_dir']}/parsed_inputs/smooth_mat.npz", smooth_mat + ) # - df_gene_snp.to_csv( f"{config['output_dir']}/parsed_inputs/gene_snp_info.csv.gz", header=True, index=False, sep="\t" ) + df_gene_snp.to_csv( + f"{config['output_dir']}/parsed_inputs/gene_snp_info.csv.gz", + header=True, + index=False, + sep="\t", + ) # load and parse data return load_tables_to_matrices(config) @@ -256,7 +506,13 @@ def run_parse_n_load(config): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str) + parser.add_argument( + "-c", + "--configfile", + help="configuration file of CalicoST", + required=True, + type=str, + ) args = parser.parse_args() try: diff --git a/src/calicost/phasing.py b/src/calicost/phasing.py index d582ec5..e4c9447 100644 --- a/src/calicost/phasing.py +++ b/src/calicost/phasing.py @@ -19,26 +19,60 @@ from statsmodels.tools.sm_exceptions import ValueWarning -def infer_initial_phase(single_X, lengths, single_base_nb_mean, single_total_bb_RD, n_states, log_sitewise_transmat, \ - params, t, random_state, fix_NB_dispersion, shared_NB_dispersion, fix_BB_dispersion, shared_BB_dispersion, max_iter, tol): +def infer_initial_phase( + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + n_states, + log_sitewise_transmat, + params, + t, + random_state, + fix_NB_dispersion, + shared_NB_dispersion, + fix_BB_dispersion, + shared_BB_dispersion, + max_iter, + tol, +): # pseudobulk HMM for phase_prob - res = pipeline_baum_welch(None, np.sum(single_X, axis=2, keepdims=True), lengths, n_states, \ - np.sum(single_base_nb_mean, axis=1, keepdims=True), np.sum(single_total_bb_RD, axis=1, keepdims=True), log_sitewise_transmat, \ - hmmclass=hmm_sitewise, params=params, t=t, random_state=random_state, only_minor=True, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, is_diag=True, \ - init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=max_iter, tol=tol) + res = pipeline_baum_welch( + None, + np.sum(single_X, axis=2, keepdims=True), + lengths, + n_states, + np.sum(single_base_nb_mean, axis=1, keepdims=True), + np.sum(single_total_bb_RD, axis=1, keepdims=True), + log_sitewise_transmat, + hmmclass=hmm_sitewise, + params=params, + t=t, + random_state=random_state, + only_minor=True, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=True, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + max_iter=max_iter, + tol=tol, + ) # phase_prob = np.exp(scipy.special.logsumexp(res["log_gamma"][:n_states, :], axis=0)) # return phase_prob pred = np.argmax(res["log_gamma"], axis=0) pred_cnv = pred % n_states - phase_indicator = (pred < n_states) + phase_indicator = pred < n_states refined_lengths = [] cumlen = 0 for le in lengths: s = 0 - for i, k in enumerate(pred_cnv[cumlen:(cumlen+le)]): - if i > 0 and pred_cnv[i] != pred_cnv[i-1]: + for i, k in enumerate(pred_cnv[cumlen : (cumlen + le)]): + if i > 0 and pred_cnv[i] != pred_cnv[i - 1]: 
refined_lengths.append(i - s) s = i refined_lengths.append(le - s) @@ -47,48 +81,119 @@ def infer_initial_phase(single_X, lengths, single_base_nb_mean, single_total_bb_ return phase_indicator, refined_lengths -def initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \ - params, t, random_state, fix_NB_dispersion, shared_NB_dispersion, fix_BB_dispersion, shared_BB_dispersion, max_iter, tol, threshold, min_snpumi=2e3): +def initial_phase_given_partition( + single_X, + lengths, + single_base_nb_mean, + single_total_bb_RD, + single_tumor_prop, + initial_clone_index, + n_states, + log_sitewise_transmat, + params, + t, + random_state, + fix_NB_dispersion, + shared_NB_dispersion, + fix_BB_dispersion, + shared_BB_dispersion, + max_iter, + tol, + threshold, + min_snpumi=2e3, +): EPS_BAF = 0.05 if single_tumor_prop is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=threshold) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + initial_clone_index, + single_tumor_prop, + threshold=threshold, + ) # pseudobulk HMM for phase_prob baf_profiles = np.zeros((X.shape[2], X.shape[0])) pred_cnv = np.zeros((X.shape[2], X.shape[0])) for i in range(X.shape[2]): - if np.sum(total_bb_RD[:,i]) < min_snpumi: - baf_profiles[i,:] = 0.5 + if np.sum(total_bb_RD[:, i]) < min_snpumi: + baf_profiles[i, :] = 0.5 else: - res = pipeline_baum_welch(None, X[:,:,i:(i+1)], lengths, n_states, base_nb_mean[:,i:(i+1)], total_bb_RD[:,i:(i+1)], log_sitewise_transmat, \ - hmmclass=hmm_sitewise, params=params, t=t, random_state=random_state, only_minor=True, \ - fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \ - fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, is_diag=True, \ - init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=max_iter, tol=tol) + res = pipeline_baum_welch( + None, + X[:, :, i : (i + 1)], + lengths, + n_states, + base_nb_mean[:, i : (i + 1)], + total_bb_RD[:, i : (i + 1)], + log_sitewise_transmat, + hmmclass=hmm_sitewise, + params=params, + t=t, + random_state=random_state, + only_minor=True, + fix_NB_dispersion=fix_NB_dispersion, + shared_NB_dispersion=shared_NB_dispersion, + fix_BB_dispersion=fix_BB_dispersion, + shared_BB_dispersion=shared_BB_dispersion, + is_diag=True, + init_log_mu=None, + init_p_binom=None, + init_alphas=None, + init_taus=None, + max_iter=max_iter, + tol=tol, + ) # pred = np.argmax(res["log_gamma"], axis=0) - this_baf_profiles = np.where(pred < n_states, res["new_p_binom"][pred%n_states, 0], 1-res["new_p_binom"][pred%n_states, 0]) + this_baf_profiles = np.where( + pred < n_states, + res["new_p_binom"][pred % n_states, 0], + 1 - res["new_p_binom"][pred % n_states, 0], + ) this_baf_profiles[np.abs(this_baf_profiles - 0.5) < EPS_BAF] = 0.5 - baf_profiles[i,:] = this_baf_profiles - pred_cnv[i,:] = (pred % n_states) + baf_profiles[i, :] = this_baf_profiles + pred_cnv[i, :] = pred % n_states if 
single_tumor_prop is None: - n_total_spots = np.sum([ len(x) for x in initial_clone_index ]) - population_baf = np.array([ 1.0*len(x)/n_total_spots for x in initial_clone_index]) @ baf_profiles + n_total_spots = np.sum([len(x) for x in initial_clone_index]) + population_baf = ( + np.array([1.0 * len(x) / n_total_spots for x in initial_clone_index]) + @ baf_profiles + ) else: - n_total_spots = np.sum([ len(x) * tumor_prop[i] for i,x in enumerate(initial_clone_index) ]) - population_baf = np.array([ 1.0*len(x)*tumor_prop[i]/n_total_spots for i,x in enumerate(initial_clone_index) ]) @ baf_profiles - adj_baf_profiles = np.where(baf_profiles < 0.5, baf_profiles, 1-baf_profiles) - phase_indicator = (population_baf < 0.5) + n_total_spots = np.sum( + [len(x) * tumor_prop[i] for i, x in enumerate(initial_clone_index)] + ) + population_baf = ( + np.array( + [ + 1.0 * len(x) * tumor_prop[i] / n_total_spots + for i, x in enumerate(initial_clone_index) + ] + ) + @ baf_profiles + ) + adj_baf_profiles = np.where(baf_profiles < 0.5, baf_profiles, 1 - baf_profiles) + phase_indicator = population_baf < 0.5 refined_lengths = [] cumlen = 0 for le in lengths: s = 0 for i in range(le): - if i > s + 10 and np.any(np.abs(adj_baf_profiles[:,i+cumlen] - adj_baf_profiles[:,i+cumlen-1]) > 0.1): + if i > s + 10 and np.any( + np.abs( + adj_baf_profiles[:, i + cumlen] + - adj_baf_profiles[:, i + cumlen - 1] + ) + > 0.1 + ): refined_lengths.append(i - s) s = i refined_lengths.append(le - s) @@ -99,13 +204,21 @@ def initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single def perform_partition(coords, sample_ids, x_part, y_part, single_tumor_prop, threshold): initial_clone_index = [] - for s in range(np.max(sample_ids)+1): + for s in range(np.max(sample_ids) + 1): index = np.where(sample_ids == s)[0] assert len(index) > 0 if single_tumor_prop is None: - tmp_clone_index = fixed_rectangle_initialization(coords[index,:], x_part, y_part) + tmp_clone_index = fixed_rectangle_initialization( + coords[index, :], x_part, y_part + ) else: - tmp_clone_index = fixed_rectangle_initialization_mix(coords[index,:], x_part, y_part, single_tumor_prop[index], threshold=threshold) + tmp_clone_index = fixed_rectangle_initialization_mix( + coords[index, :], + x_part, + y_part, + single_tumor_prop[index], + threshold=threshold, + ) for x in tmp_clone_index: - initial_clone_index.append( index[x] ) + initial_clone_index.append(index[x]) return initial_clone_index diff --git a/src/calicost/phylogeny_startle.py b/src/calicost/phylogeny_startle.py index 9265224..2b916e9 100644 --- a/src/calicost/phylogeny_startle.py +++ b/src/calicost/phylogeny_startle.py @@ -28,32 +28,36 @@ def get_LoH_for_phylogeny(df_seglevel_cnv, min_segments): ---------- df_loh : pd.DataFrame, (n_clones, n_segments) """ + def get_shared_intervals(acn_profile): - ''' + """ Takes in allele-specific copy numbers, output a segmentation of genome such that all clones are in the same CN state within each segment. anc_profile : array, (n_obs, 2*n_clones) Allele-specific integer copy numbers for each genomic bin (obs) across all clones. 
- ''' + """ intervals = [] seg_acn = [] s = 0 while s < acn_profile.shape[0]: - t = np.where( ~np.all(acn_profile[s:,] == acn_profile[s,:], axis=1) )[0] + t = np.where(~np.all(acn_profile[s:,] == acn_profile[s, :], axis=1))[0] if len(t) == 0: - intervals.append( (s, acn_profile.shape[0]) ) - seg_acn.append( acn_profile[s,:] ) + intervals.append((s, acn_profile.shape[0])) + seg_acn.append(acn_profile[s, :]) s = acn_profile.shape[0] else: t = t[0] - intervals.append( (s,s+t) ) - seg_acn.append( acn_profile[s,:] ) - s = s+t + intervals.append((s, s + t)) + seg_acn.append(acn_profile[s, :]) + s = s + t return intervals, seg_acn - - clone_ids = [x.split(" ")[0] for x in df_seglevel_cnv.columns[ np.arange(3, df_seglevel_cnv.shape[1], 2) ] ] - - acn_profile = df_seglevel_cnv.iloc[:,3:].values + + clone_ids = [ + x.split(" ")[0] + for x in df_seglevel_cnv.columns[np.arange(3, df_seglevel_cnv.shape[1], 2)] + ] + + acn_profile = df_seglevel_cnv.iloc[:, 3:].values intervals, seg_acn = get_shared_intervals(acn_profile) df_loh = [] for i, acn in enumerate(seg_acn): @@ -63,18 +67,24 @@ def get_shared_intervals(acn_profile): continue idx_zero = np.where(acn == 0)[0] idx_clones = (idx_zero / 2).astype(int) - is_A = (idx_zero % 2 == 0) + is_A = idx_zero % 2 == 0 # vector of mutation states - mut = np.zeros( int(len(acn) / 2), dtype=int ) + mut = np.zeros(int(len(acn) / 2), dtype=int) mut[idx_clones] = np.where(is_A, 1, 2) - df_loh.append( pd.DataFrame(mut.reshape(1, -1), index=[f"bin_{intervals[i][0]}_{intervals[i][1]}"], columns=clone_ids) ) + df_loh.append( + pd.DataFrame( + mut.reshape(1, -1), + index=[f"bin_{intervals[i][0]}_{intervals[i][1]}"], + columns=clone_ids, + ) + ) df_loh = pd.concat(df_loh).T return df_loh def get_binary_matrix(df_character_matrix): - + ncells = len(df_character_matrix) binary_col_dict = {} for column in df_character_matrix.columns: @@ -85,38 +95,40 @@ def get_binary_matrix(df_character_matrix): state_col[df_character_matrix[column] == s] = 1 state_col[df_character_matrix[column] == -1] = -1 - binary_col_dict[f'{column}_{s}'] = state_col + binary_col_dict[f"{column}_{s}"] = state_col - df_binary = pd.DataFrame(binary_col_dict, index = df_character_matrix.index, dtype=int) + df_binary = pd.DataFrame( + binary_col_dict, index=df_character_matrix.index, dtype=int + ) return df_binary def generate_perfect_phylogeny(df_binary): solT_mut = nx.DiGraph() - solT_mut.add_node('root') + solT_mut.add_node("root") solT_cell = nx.DiGraph() - solT_cell.add_node('root') + solT_cell.add_node("root") - df_binary = df_binary[df_binary.sum().sort_values(ascending=False).index] + df_binary = df_binary[df_binary.sum().sort_values(ascending=False).index] for cell_id, row in df_binary.iterrows(): - if cell_id == 'root': + if cell_id == "root": continue - curr_node = 'root' + curr_node = "root" for column in df_binary.columns[row.values == 1]: if column in solT_mut[curr_node]: curr_node = column else: if column in solT_mut.nodes: - raise NameError(f'{column} is being repeated') + raise NameError(f"{column} is being repeated") solT_mut.add_edge(curr_node, column) solT_cell.add_edge(curr_node, column) curr_node = column - solT_cell.add_edge(curr_node, cell_id) + solT_cell.add_edge(curr_node, cell_id) return solT_mut, solT_cell @@ -138,17 +150,21 @@ def tree_to_newick(T, root=None): pathlen += 1 subgs.append(tree_to_newick(T, root=child) + f":{pathlen}") else: - subgs.append( f"{child}:{pathlen}" ) - return "(" + ','.join(map(str, subgs)) + ")" + subgs.append(f"{child}:{pathlen}") + return "(" + 
",".join(map(str, subgs)) + ")" -def output_startle_input_files(calicostdir, outdir, midfix="", startle_bin="startle", min_segments=3): +def output_startle_input_files( + calicostdir, outdir, midfix="", startle_bin="startle", min_segments=3 +): # get LoH data frame # rows are clones, columns are bins, entries are 0 (no LoH) or 1 (A allele LoH) of 2 (B allele LoH) - df_seglevel_cnv = pd.read_csv(f"{calicostdir}/cnv{midfix}_seglevel.tsv", header=0, sep="\t") + df_seglevel_cnv = pd.read_csv( + f"{calicostdir}/cnv{midfix}_seglevel.tsv", header=0, sep="\t" + ) df_loh = get_LoH_for_phylogeny(df_seglevel_cnv, min_segments) df_loh.to_csv(f"{outdir}/loh_matrix.tsv", header=True, index=True, sep="\t") - + # binarize df_binary = get_binary_matrix(df_loh) @@ -163,36 +179,40 @@ def output_startle_input_files(calicostdir, outdir, midfix="", startle_bin="star for mut_idx, mut in enumerate(mutation_list): if df_binary.loc[cell][mut] == 1: one_cell_mut_list.append((cell_idx, mut_idx)) - with open(f'{outdir}/loh_one_indices.txt', 'w') as out: + with open(f"{outdir}/loh_one_indices.txt", "w") as out: for cell_idx, mut_idx in one_cell_mut_list: - out.write(f'{cell_idx} {mut_idx}\n') + out.write(f"{cell_idx} {mut_idx}\n") # missimg imdices - character_list = list(set(['_'.join(x.split('_')[:-1]) for x in df_binary.columns])) + character_list = list(set(["_".join(x.split("_")[:-1]) for x in df_binary.columns])) missing_cell_character_list = [] for character_idx, character in enumerate(character_list): for cell_idx, cell in enumerate(cell_list): if df_loh.loc[cell][character] == -1: missing_cell_character_list.append((cell_idx, character_idx)) - with open(f'{outdir}/loh_missing_indices.txt', 'w') as out: + with open(f"{outdir}/loh_missing_indices.txt", "w") as out: for cell_idx, character_idx in missing_cell_character_list: - out.write(f'{cell_idx} {character_idx}\n') + out.write(f"{cell_idx} {character_idx}\n") # character mutation mapping - with open(f'{outdir}/loh_character_mutation_mapping.txt', 'w') as out: + with open(f"{outdir}/loh_character_mutation_mapping.txt", "w") as out: for _, character in enumerate(character_list): - character_mutation_list = [mutation_to_index[x] for x in mutation_list if x.startswith(f'{character}_')] - out.write(' '.join(map(str, character_mutation_list)) + '\n') + character_mutation_list = [ + mutation_to_index[x] + for x in mutation_list + if x.startswith(f"{character}_") + ] + out.write(" ".join(map(str, character_mutation_list)) + "\n") # count of character states of mutations max_allowed_homoplasy = {} for mutation in mutation_list: max_allowed_homoplasy[mutation] = 2 - with open(f'{outdir}/loh_counts.txt', 'w') as out: + with open(f"{outdir}/loh_counts.txt", "w") as out: for mutation in mutation_list: - out.write(f'{max_allowed_homoplasy[mutation]}\n') - + out.write(f"{max_allowed_homoplasy[mutation]}\n") + # weights - with open(f'{outdir}/loh_weights.txt', 'w') as out: + with open(f"{outdir}/loh_weights.txt", "w") as out: for mutation in mutation_list: out.write(f"1\n") @@ -200,35 +220,71 @@ def output_startle_input_files(calicostdir, outdir, midfix="", startle_bin="star m_mutations = df_binary.shape[1] n_clones = df_binary.shape[0] command = f"{startle_bin} -m {m_mutations} -n {n_clones} {outdir}/loh_one_indices.txt {outdir}/loh_missing_indices.txt {outdir}/loh_counts.txt {outdir}/loh_character_mutation_mapping.txt {outdir}/loh_weights.txt {outdir}/loh_cpp_output.txt" - print( command ) - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, 
shell=True) - out,err = p.communicate() + print(command) + p = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True + ) + out, err = p.communicate() # parse output - df_cpp_output = pd.read_csv(f'{outdir}/loh_cpp_output.txt', header=None, sep=' ') - df_cpp_output = df_cpp_output.rename(columns={0:'cell_idx', 1:'mut_idx', 2:'state_idx', 3:'entry'}) - df_cpp_output['name'] = df_cpp_output.apply(lambda x: f"{mutation_list[x['mut_idx']]}_{x['state_idx']}", axis =1) - - sol_columns = list(df_cpp_output['name'].unique()) + df_cpp_output = pd.read_csv(f"{outdir}/loh_cpp_output.txt", header=None, sep=" ") + df_cpp_output = df_cpp_output.rename( + columns={0: "cell_idx", 1: "mut_idx", 2: "state_idx", 3: "entry"} + ) + df_cpp_output["name"] = df_cpp_output.apply( + lambda x: f"{mutation_list[x['mut_idx']]}_{x['state_idx']}", axis=1 + ) + + sol_columns = list(df_cpp_output["name"].unique()) nsol_columns = len(sol_columns) sol_entries = np.zeros((n_clones, nsol_columns), dtype=int) for mut_idx, mut in enumerate(sol_columns): - for cell_idx in df_cpp_output[(df_cpp_output['entry'] == 1) & (df_cpp_output['name'] == mut)]['cell_idx']: + for cell_idx in df_cpp_output[ + (df_cpp_output["entry"] == 1) & (df_cpp_output["name"] == mut) + ]["cell_idx"]: sol_entries[cell_idx][mut_idx] = 1 df_sol_binary = pd.DataFrame(sol_entries, columns=sol_columns, index=cell_list) solT_mut, solT_cell = generate_perfect_phylogeny(df_sol_binary) - with open(f'{outdir}/loh_tree.newick', 'w') as out: + with open(f"{outdir}/loh_tree.newick", "w") as out: out.write(f"{tree_to_newick(solT_cell)};") if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-c", "--calicost_dir", help="Directory of a specific random initialization of CalicoST", type=str) - parser.add_argument("-s", "--startle_bin", help="The startle executable path", default="startle", type=str) - parser.add_argument("-p", "--ploidy", help="Ploidy of allele-specific integer copy numbers.", default="", type=str) - parser.add_argument("--min_segments", help="Minimum number of genome segment to keep an LOH event in phylogenetic tree reconstruction.", default=3, type=int) + parser.add_argument( + "-c", + "--calicost_dir", + help="Directory of a specific random initialization of CalicoST", + type=str, + ) + parser.add_argument( + "-s", + "--startle_bin", + help="The startle executable path", + default="startle", + type=str, + ) + parser.add_argument( + "-p", + "--ploidy", + help="Ploidy of allele-specific integer copy numbers.", + default="", + type=str, + ) + parser.add_argument( + "--min_segments", + help="Minimum number of genome segment to keep an LOH event in phylogenetic tree reconstruction.", + default=3, + type=int, + ) parser.add_argument("-o", "--outputdir", help="output directory", type=str) args = parser.parse_args() - output_startle_input_files(args.calicost_dir, args.outputdir, midfix=args.ploidy, startle_bin=args.startle_bin, min_segments=args.min_segments) \ No newline at end of file + output_startle_input_files( + args.calicost_dir, + args.outputdir, + midfix=args.ploidy, + startle_bin=args.startle_bin, + min_segments=args.min_segments, + ) diff --git a/src/calicost/phylogeography.py b/src/calicost/phylogeography.py index e859350..8b188f2 100644 --- a/src/calicost/phylogeography.py +++ b/src/calicost/phylogeography.py @@ -8,40 +8,66 @@ import networkx as nx -def clone_centers(coords, clone_label, single_tumor_prop=None, sample_list=None, sample_ids=None, tumorprop_threshold=0.6): +def 
clone_centers( + coords, + clone_label, + single_tumor_prop=None, + sample_list=None, + sample_ids=None, + tumorprop_threshold=0.6, +): df_centers = [] for l in np.unique(clone_label): # get spot indices of this clone - index = np.where(clone_label == l)[0] if single_tumor_prop is None else np.where((clone_label == l) & (single_tumor_prop > tumorprop_threshold))[0] + index = ( + np.where(clone_label == l)[0] + if single_tumor_prop is None + else np.where( + (clone_label == l) & (single_tumor_prop > tumorprop_threshold) + )[0] + ) # if the index contains multiple slices, get the most abundance slice if not sample_ids is None: most_abundance_slice = pd.Series(sample_ids[index]).mode().values[0] - index = index[ sample_ids[index] == most_abundance_slice ] + index = index[sample_ids[index] == most_abundance_slice] # get clone cencer if single_tumor_prop is None: center = np.mean(coords[index], axis=0) else: - center = single_tumor_prop[index].dot(coords[index]) / np.sum(single_tumor_prop[index]) - df_centers.append( pd.DataFrame({'clone':l, 'x':center[0], 'y':center[1]}, index=[0]) ) + center = single_tumor_prop[index].dot(coords[index]) / np.sum( + single_tumor_prop[index] + ) + df_centers.append( + pd.DataFrame({"clone": l, "x": center[0], "y": center[1]}, index=[0]) + ) df_centers = pd.concat(df_centers, ignore_index=True) return df_centers -def project_phylogeneny_space(newick_file, coords, clone_label, single_tumor_prop=None, sample_list=None, sample_ids=None): +def project_phylogeneny_space( + newick_file, + coords, + clone_label, + single_tumor_prop=None, + sample_list=None, + sample_ids=None, +): # load tree - with open(newick_file, 'r') as fp: + with open(newick_file, "r") as fp: t = Tree(fp.readline()) - - # get the + + # get the list_leaf_nodes = [] list_internal_nodes = [] - rootnode = np.sort( [leaf.name.replace('clone','') for leaf in t.iter_leaves() ] ) - rootnode = "ancestor" + "_".join( rootnode ) + rootnode = np.sort([leaf.name.replace("clone", "") for leaf in t.iter_leaves()]) + rootnode = "ancestor" + "_".join(rootnode) for node in t.traverse(): - leafnames = np.sort( [leaf.name.replace('clone','') for leaf in node.iter_leaves() ] ) + leafnames = np.sort( + [leaf.name.replace("clone", "") for leaf in node.iter_leaves()] + ) if node.name == "": - node.name = "ancestor" + "_".join( leafnames ) - + node.name = "ancestor" + "_".join(leafnames) + if node.is_leaf(): list_leaf_nodes.append(node.name) else: @@ -50,27 +76,27 @@ def project_phylogeneny_space(newick_file, coords, clone_label, single_tumor_pro print(f"root node is {rootnode}") print(f"a list of leaf nodes: {list_leaf_nodes}") print(f"a list of internal nodes: {list_internal_nodes}") - + # set up multivariate Gaussian distribution to estimate internal node location N_nodes = len(list_leaf_nodes) + len(list_internal_nodes) # pairwise distance G = nx.Graph() - G.add_nodes_from( list_leaf_nodes + list_internal_nodes ) + G.add_nodes_from(list_leaf_nodes + list_internal_nodes) for nodename in list_leaf_nodes: - node = t&f"{nodename}" + node = t & f"{nodename}" while not node.is_root(): p = node.up G.add_edge(node.name, p.name, weight=node.dist) node = p - + G.edges(data=True) - nx_pdc = dict( nx.all_pairs_dijkstra(G) ) + nx_pdc = dict(nx.all_pairs_dijkstra(G)) # covariance matrix based on pairwise distance N_nodes = len(list_leaf_nodes) + len(list_internal_nodes) Sigma_square = np.zeros((N_nodes, N_nodes)) - base_var = max( np.max(np.abs(coords[:,0])), np.max(np.abs(coords[:,1])) ) - + base_var = max(np.max(np.abs(coords[:, 
0])), np.max(np.abs(coords[:, 1]))) + for n1, name1 in enumerate(list_leaf_nodes + list_internal_nodes): for n2, name2 in enumerate(list_leaf_nodes + list_internal_nodes): if n1 == n2: @@ -84,26 +110,42 @@ def project_phylogeneny_space(newick_file, coords, clone_label, single_tumor_pro Sigma_square[n1, n2] = base_var + nx_pdc[rootnode][0][lca_node.name] # mean position - mu_1 = np.zeros(( len(list_leaf_nodes),2 )) - mu_2 = np.zeros(( len(list_internal_nodes),2 )) + mu_1 = np.zeros((len(list_leaf_nodes), 2)) + mu_2 = np.zeros((len(list_internal_nodes), 2)) # partition covariance matrix - Sigma_11 = Sigma_square[:len(list_leaf_nodes), :len(list_leaf_nodes)] - Sigma_12 = Sigma_square[:len(list_leaf_nodes), :][:, len(list_leaf_nodes):] - Sigma_22 = Sigma_square[len(list_leaf_nodes):, len(list_leaf_nodes):] + Sigma_11 = Sigma_square[: len(list_leaf_nodes), : len(list_leaf_nodes)] + Sigma_12 = Sigma_square[: len(list_leaf_nodes), :][:, len(list_leaf_nodes) :] + Sigma_22 = Sigma_square[len(list_leaf_nodes) :, len(list_leaf_nodes) :] # get leaf node locations - df_centers = clone_centers(coords, clone_label, single_tumor_prop=single_tumor_prop, - sample_list=sample_list, sample_ids=sample_ids) - obs_1 = df_centers.set_index('clone').loc[list_leaf_nodes].values + df_centers = clone_centers( + coords, + clone_label, + single_tumor_prop=single_tumor_prop, + sample_list=sample_list, + sample_ids=sample_ids, + ) + obs_1 = df_centers.set_index("clone").loc[list_leaf_nodes].values # conditional expectation internal node position | leaf node position = mu_1 expected_internal = mu_2 + Sigma_12.T @ (np.linalg.inv(Sigma_11) @ (obs_1 - mu_1)) - df_centers = pd.concat([ df_centers, pd.DataFrame({'clone':list_internal_nodes, 'x':expected_internal[:,0], 'y':expected_internal[:,1]}) ]) + df_centers = pd.concat( + [ + df_centers, + pd.DataFrame( + { + "clone": list_internal_nodes, + "x": expected_internal[:, 0], + "y": expected_internal[:, 1], + } + ), + ] + ) # add to tree features for node in t.traverse(): i = np.where(df_centers.clone.values == node.name)[0][0] - node.add_features( x=df_centers.x.values[i], y=df_centers.y.values[i] ) + node.add_features(x=df_centers.x.values[i], y=df_centers.y.values[i]) - return t \ No newline at end of file + return t diff --git a/src/calicost/simple_sctransform.py b/src/calicost/simple_sctransform.py index 1a011b1..ca7666c 100644 --- a/src/calicost/simple_sctransform.py +++ b/src/calicost/simple_sctransform.py @@ -7,42 +7,70 @@ # copied from sctransformPy -def theta_ml(y,mu): +def theta_ml(y, mu): n = y.size weights = np.ones(n) limit = 10 _EPS = np.finfo(float).eps - eps = (_EPS)**0.25 + eps = (_EPS) ** 0.25 + # inner function - def score(n,th,mu,y,w): - return sum(w*(psi(th + y) - psi(th) + np.log(th) + 1 - np.log(th + mu) - (y + th)/(mu + th))) + def score(n, th, mu, y, w): + return sum( + w + * ( + psi(th + y) + - psi(th) + + np.log(th) + + 1 + - np.log(th + mu) + - (y + th) / (mu + th) + ) + ) + # inner function - def info(n,th,mu,y,w): - return sum(w*( - polygamma(1,th + y) + polygamma(1,th) - 1/th + 2/(mu + th) - (y + th)/(mu + th)**2)) + def info(n, th, mu, y, w): + return sum( + w + * ( + -polygamma(1, th + y) + + polygamma(1, th) + - 1 / th + + 2 / (mu + th) + - (y + th) / (mu + th) ** 2 + ) + ) + # initialize gradient descent - t0 = n/sum(weights*(y/mu - 1)**2) + t0 = n / sum(weights * (y / mu - 1) ** 2) it = 0 de = 1 # gradient descent - while(it + 1 < limit and abs(de) > eps): - it+=1 + while it + 1 < limit and abs(de) > eps: + it += 1 t0 = abs(t0) i = info(n, 
t0, mu, y, weights) - de = score(n, t0, mu, y, weights)/i - t0 += de - t0 = max(t0,0) + de = score(n, t0, mu, y, weights) / i + t0 += de + t0 = max(t0, 0) # note that t0 is the dispersion parameter: var = mu + mu^2 / t0 return t0 def sample_gene_indices(log_geometric_mean, n_subsample, n_partitions=10): - bounds = np.linspace(np.min(log_geometric_mean), np.max(log_geometric_mean), n_partitions+1) + bounds = np.linspace( + np.min(log_geometric_mean), np.max(log_geometric_mean), n_partitions + 1 + ) bounds[-1] += 1e-4 idx_subsample = [] for p in range(1, n_partitions): - tmpidx = np.where(np.logical_and(log_geometric_mean >= bounds[p-1], log_geometric_mean < bounds[p]))[0] + tmpidx = np.where( + np.logical_and( + log_geometric_mean >= bounds[p - 1], log_geometric_mean < bounds[p] + ) + )[0] np.random.shuffle(tmpidx) - idx_subsample.append(tmpidx[:int(n_subsample/n_partitions)]) + idx_subsample.append(tmpidx[: int(n_subsample / n_partitions)]) idx_subsample = np.sort(np.concatenate(idx_subsample)) if len(idx_subsample) < n_subsample: mask = np.array([True] * len(log_geometric_mean)) @@ -55,120 +83,128 @@ def sample_gene_indices(log_geometric_mean, n_subsample, n_partitions=10): def estimate_logmu_dispersion(counts, bw=None): - ''' + """ counts of size number spots * number genes. - ''' + """ N = counts.shape[0] G = counts.shape[1] eps = 1 - geometric_mean = np.exp(np.log(counts+eps).mean(axis=0).flatten()) - eps - log_geometric_mean = np.log( geometric_mean ) + geometric_mean = np.exp(np.log(counts + eps).mean(axis=0).flatten()) - eps + log_geometric_mean = np.log(geometric_mean) spot_umi = counts.sum(axis=1) # fitting logmu and theta (dispersion) logmu = np.zeros(G) theta = np.zeros(G) for i in range(G): - y = counts[:,i] - logmu[i] = np.log( np.sum(y) / np.sum(spot_umi) ) + y = counts[:, i] + logmu[i] = np.log(np.sum(y) / np.sum(spot_umi)) mu = spot_umi * np.exp(logmu[i]) theta[i] = theta_ml(y, mu) # ratio between geometric mean and dispersion parameter theta log_ratio = np.log(1 + geometric_mean / theta) # smoothing parameter for kernel ridge regression if bw is None: - z = FFTKDE(kernel='gaussian', bw='ISJ').fit(log_geometric_mean) - z.evaluate(); + z = FFTKDE(kernel="gaussian", bw="ISJ").fit(log_geometric_mean) + z.evaluate() bw_adjust = 3 - bw = z.bw*bw_adjust + bw = z.bw * bw_adjust # kernel ridge regression for log_ratio (the log ratio between geometric mean expression and dispersion) - kr = statsmodels.nonparametric.kernel_regression.KernelReg(log_ratio, log_geometric_mean[:,None], ['c'], reg_type='ll', bw=[bw]) - pred_log_ratio = kr.fit(data_predict = log_geometric_mean[:,None])[0] + kr = statsmodels.nonparametric.kernel_regression.KernelReg( + log_ratio, log_geometric_mean[:, None], ["c"], reg_type="ll", bw=[bw] + ) + pred_log_ratio = kr.fit(data_predict=log_geometric_mean[:, None])[0] pred_theta = geometric_mean / (np.exp(pred_log_ratio) - 1) return logmu, pred_theta def pearson_residual(counts, logmu, pred_theta): - ''' + """ counts of size number spots * number genes. 
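+
+    A minimal usage sketch, assuming counts is the spots-by-genes array described above:
+
+        logmu, pred_theta = estimate_logmu_dispersion(counts)
+        X = pearson_residual(counts, logmu, pred_theta)
+
+    Each entry is the negative binomial Pearson residual
+        X[i, j] = (counts[i, j] - mu_ij) / sqrt(mu_ij + mu_ij**2 / pred_theta[j])
+    with mu_ij = spot_umi[i] * exp(logmu[j]), clipped to +/- sqrt(n_spots / 30).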
- ''' + """ N = counts.shape[0] G = counts.shape[1] spot_umi = counts.sum(axis=1) # predicted mean and variance under NB model - mud = np.exp(logmu.reshape(1,-1)) * spot_umi.reshape(-1,1) - vard = mud + mud**2 / pred_theta.reshape(1,-1) + mud = np.exp(logmu.reshape(1, -1)) * spot_umi.reshape(-1, 1) + vard = mud + mud**2 / pred_theta.reshape(1, -1) X = (counts * 1.0 - mud) / vard**0.5 # clipping - clip = np.sqrt(counts.shape[0]/30) + clip = np.sqrt(counts.shape[0] / 30) X[X > clip] = clip X[X < -clip] = -clip return X def deviance_residual(counts, logmu, pred_theta): - ''' + """ Equation is taken from Analytic Pearson Residual paper by Lause et al. counts of size number spots * number genes. - ''' + """ N = counts.shape[0] G = counts.shape[1] spot_umi = counts.sum(axis=1) # predicted mean - mud = np.exp(logmu.reshape(1,-1)) * spot_umi.reshape(-1,1) - sign = (counts > mud) + mud = np.exp(logmu.reshape(1, -1)) * spot_umi.reshape(-1, 1) + sign = counts > mud part1 = counts * np.log(counts / mud) - part1[counts==0] = 0 - part2 = (counts + pred_theta) * np.log( (counts + pred_theta) / (mud + pred_theta) ) + part1[counts == 0] = 0 + part2 = (counts + pred_theta) * np.log((counts + pred_theta) / (mud + pred_theta)) X = sign * np.sqrt(2 * (part1 - part2)) return X def estimate_logmu_dispersion2(counts, n_subsample=None, bw=None): - ''' + """ counts of size number spots * number genes. - ''' + """ N = counts.shape[0] G = counts.shape[1] eps = 1 - geometric_mean = np.exp(np.log(counts+eps).mean(axis=0).flatten()) - eps - log_geometric_mean = np.log( geometric_mean ) + geometric_mean = np.exp(np.log(counts + eps).mean(axis=0).flatten()) - eps + log_geometric_mean = np.log(geometric_mean) spot_umi = counts.sum(axis=1) - logmu = np.log( np.sum(counts, axis=0) / np.sum(spot_umi) ) + logmu = np.log(np.sum(counts, axis=0) / np.sum(spot_umi)) # fitting theta (dispersion) genes_subsample = np.array([i for i in range(G) if geometric_mean[i] > 0]) if not (n_subsample is None): np.random.seed(0) genes_subsample = sample_gene_indices(log_geometric_mean, n_subsample) theta = np.zeros(len(genes_subsample)) - for idx,i in enumerate(genes_subsample): - y = counts[:,i] + for idx, i in enumerate(genes_subsample): + y = counts[:, i] mu = spot_umi * np.exp(logmu[i]) theta[idx] = theta_ml(y, mu) # ratio between geometric mean and dispersion parameter theta log_ratio = np.log(1 + geometric_mean[genes_subsample] / theta) # smoothing parameter for kernel ridge regression if bw is None: - z = FFTKDE(kernel='gaussian', bw='ISJ').fit(log_geometric_mean[genes_subsample]) - z.evaluate(); + z = FFTKDE(kernel="gaussian", bw="ISJ").fit(log_geometric_mean[genes_subsample]) + z.evaluate() bw_adjust = 3 - bw = z.bw*bw_adjust + bw = z.bw * bw_adjust # kernel ridge regression for log_ratio (the log ratio between geometric mean expression and dispersion) - kr = statsmodels.nonparametric.kernel_regression.KernelReg(log_ratio, log_geometric_mean[genes_subsample][:,None], ['c'], reg_type='ll', bw=[bw]) - pred_log_ratio = kr.fit(data_predict = log_geometric_mean[:,None])[0] + kr = statsmodels.nonparametric.kernel_regression.KernelReg( + log_ratio, + log_geometric_mean[genes_subsample][:, None], + ["c"], + reg_type="ll", + bw=[bw], + ) + pred_log_ratio = kr.fit(data_predict=log_geometric_mean[:, None])[0] pred_theta = geometric_mean / (np.exp(pred_log_ratio) - 1) return logmu, pred_theta def pearson_residual2(counts, logmu, pred_theta): - ''' + """ counts of size number spots * number genes. 
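+
+    Same residual computation as pearson_residual above, except that the clipping
+    threshold is the looser sqrt(n_spots) instead of sqrt(n_spots / 30). A sketch of
+    the intended pairing (an assumption, based on the matching names): obtain logmu
+    and pred_theta from estimate_logmu_dispersion2, which fits the dispersion on a
+    subsample of genes and then kernel-regresses it over all genes.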
- ''' + """ N = counts.shape[0] G = counts.shape[1] spot_umi = counts.sum(axis=1) # predicted mean and variance under NB model - mud = np.exp(logmu.reshape(1,-1)) * spot_umi.reshape(-1,1) - vard = mud + mud**2 / pred_theta.reshape(1,-1) + mud = np.exp(logmu.reshape(1, -1)) * spot_umi.reshape(-1, 1) + vard = mud + mud**2 / pred_theta.reshape(1, -1) X = (counts * 1.0 - mud) / vard**0.5 # clipping clip = np.sqrt(counts.shape[0]) diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index f248036..82138a2 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -11,7 +11,12 @@ import scanpy as sc import anndata import logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) logger = logging.getLogger() from calicost.utils_phase_switch import * @@ -19,28 +24,48 @@ import subprocess -def load_data(spaceranger_dir, snp_dir, filtergenelist_file, filterregion_file, normalidx_file, min_snpumis=50, min_percent_expressed_spots=0.005): +def load_data( + spaceranger_dir, + snp_dir, + filtergenelist_file, + filterregion_file, + normalidx_file, + min_snpumis=50, + min_percent_expressed_spots=0.005, +): ##### read raw UMI count matrix ##### if Path(f"{spaceranger_dir}/filtered_feature_bc_matrix.h5").exists(): adata = sc.read_10x_h5(f"{spaceranger_dir}/filtered_feature_bc_matrix.h5") elif Path(f"{spaceranger_dir}/filtered_feature_bc_matrix.h5ad").exists(): adata = sc.read_h5ad(f"{spaceranger_dir}/filtered_feature_bc_matrix.h5ad") else: - logging.error(f"{spaceranger_dir} directory doesn't have a filtered_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5ad file!") + logging.error( + f"{spaceranger_dir} directory doesn't have a filtered_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5ad file!" 
+ ) adata.layers["count"] = adata.X.A.astype(int) cell_snp_Aallele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Aallele.npz") cell_snp_Ballele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Ballele.npz") unique_snp_ids = np.load(f"{snp_dir}/unique_snp_ids.npy", allow_pickle=True) - snp_barcodes = pd.read_csv(f"{snp_dir}/barcodes.txt", header=None, names=["barcodes"]) + snp_barcodes = pd.read_csv( + f"{snp_dir}/barcodes.txt", header=None, names=["barcodes"] + ) # add position if Path(f"{spaceranger_dir}/spatial/tissue_positions.csv").exists(): - df_pos = pd.read_csv(f"{spaceranger_dir}/spatial/tissue_positions.csv", sep=",", header=0, \ - names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"]) + df_pos = pd.read_csv( + f"{spaceranger_dir}/spatial/tissue_positions.csv", + sep=",", + header=0, + names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"], + ) elif Path(f"{spaceranger_dir}/spatial/tissue_positions_list.csv").exists(): - df_pos = pd.read_csv(f"{spaceranger_dir}/spatial/tissue_positions_list.csv", sep=",", header=None, \ - names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"]) + df_pos = pd.read_csv( + f"{spaceranger_dir}/spatial/tissue_positions_list.csv", + sep=",", + header=None, + names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"], + ) else: raise Exception("No spatial coordinate file!") df_pos = df_pos[df_pos.in_tissue == True] @@ -50,7 +75,9 @@ def load_data(spaceranger_dir, snp_dir, filtergenelist_file, filterregion_file, adata = adata[adata.obs.index.isin(shared_barcodes), :] df_pos = df_pos[df_pos.barcode.isin(shared_barcodes)] # sort and match - df_pos.barcode = pd.Categorical(df_pos.barcode, categories=list(adata.obs.index), ordered=True) + df_pos.barcode = pd.Categorical( + df_pos.barcode, categories=list(adata.obs.index), ordered=True + ) df_pos.sort_values(by="barcode", inplace=True) adata.obsm["X_pos"] = np.vstack([df_pos.x, df_pos.y]).T @@ -60,114 +87,192 @@ def load_data(spaceranger_dir, snp_dir, filtergenelist_file, filterregion_file, cell_snp_Ballele = cell_snp_Ballele[snp_barcodes.barcodes.isin(shared_barcodes), :] snp_barcodes = snp_barcodes[snp_barcodes.barcodes.isin(shared_barcodes)] adata = adata[adata.obs.index.isin(shared_barcodes), :] - adata = adata[ pd.Categorical(adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True).argsort(), : ] + adata = adata[ + pd.Categorical( + adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True + ).argsort(), + :, + ] # filter out spots with too small number of UMIs - indicator = (np.sum(adata.layers["count"], axis=1) > min_snpumis) + indicator = np.sum(adata.layers["count"], axis=1) > min_snpumis adata = adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] # filter out spots with too small number of SNP-covering UMIs - indicator = ( np.sum(cell_snp_Aallele, axis=1).A.flatten() + np.sum(cell_snp_Ballele, axis=1).A.flatten() >= min_snpumis ) + indicator = ( + np.sum(cell_snp_Aallele, axis=1).A.flatten() + + np.sum(cell_snp_Ballele, axis=1).A.flatten() + >= min_snpumis + ) adata = adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] # filter out genes that are expressed in <0.5% cells - indicator = (np.sum(adata.X > 0, axis=0) >= min_percent_expressed_spots * adata.shape[0]).A.flatten() + indicator = ( + np.sum(adata.X > 0, axis=0) >= min_percent_expressed_spots * adata.shape[0] + ).A.flatten() genenames = 
set(list(adata.var.index[indicator])) adata = adata[:, indicator] print(adata) - print("median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print( + "median UMI after filtering out genes < 0.5% of cells = {}".format( + np.median(np.sum(adata.layers["count"], axis=1)) + ) + ) # remove genes in filtergenelist_file # ig_gene_list = pd.read_csv("/n/fs/ragr-data/users/congma/references/cellranger_refdata-gex-GRCh38-2020-A/genes/ig_gene_list.txt", header=None) if not filtergenelist_file is None: filter_gene_list = pd.read_csv(filtergenelist_file, header=None) - filter_gene_list = set(list( filter_gene_list.iloc[:,0] )) - indicator_filter = np.array([ (not x in filter_gene_list) for x in adata.var.index ]) + filter_gene_list = set(list(filter_gene_list.iloc[:, 0])) + indicator_filter = np.array( + [(not x in filter_gene_list) for x in adata.var.index] + ) adata = adata[:, indicator_filter] - print("median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print( + "median UMI after filtering out genes in filtergenelist_file = {}".format( + np.median(np.sum(adata.layers["count"], axis=1)) + ) + ) if not filterregion_file is None: - regions = pd.read_csv(filterregion_file, header=None, sep="\t", names=["Chrname", "Start", "End"]) + regions = pd.read_csv( + filterregion_file, header=None, sep="\t", names=["Chrname", "Start", "End"] + ) if "chr" in regions.Chrname.iloc[0]: regions["CHR"] = [int(x[3:]) for x in regions.Chrname.values] else: - regions.rename(columns={'Chrname':'CHR'}, inplace=True) + regions.rename(columns={"Chrname": "CHR"}, inplace=True) regions.sort_values(by=["CHR", "Start"], inplace=True) indicator_filter = np.array([True] * cell_snp_Aallele.shape[1]) j = 0 for i in range(cell_snp_Aallele.shape[1]): this_chr = int(unique_snp_ids[i].split("_")[0]) this_pos = int(unique_snp_ids[i].split("_")[1]) - while j < regions.shape[0] and ( (regions.CHR.values[j] < this_chr) or ((regions.CHR.values[j] == this_chr) and (regions.End.values[j] <= this_pos)) ): + while j < regions.shape[0] and ( + (regions.CHR.values[j] < this_chr) + or ( + (regions.CHR.values[j] == this_chr) + and (regions.End.values[j] <= this_pos) + ) + ): j += 1 - if j < regions.shape[0] and (regions.CHR.values[j] == this_chr) and (regions.Start.values[j] <= this_pos) and (regions.End.values[j] > this_pos): + if ( + j < regions.shape[0] + and (regions.CHR.values[j] == this_chr) + and (regions.Start.values[j] <= this_pos) + and (regions.End.values[j] > this_pos) + ): indicator_filter[i] = False cell_snp_Aallele = cell_snp_Aallele[:, indicator_filter] cell_snp_Ballele = cell_snp_Ballele[:, indicator_filter] unique_snp_ids = unique_snp_ids[indicator_filter] clf = LocalOutlierFactor(n_neighbors=200) - label = clf.fit_predict( np.sum(adata.layers["count"], axis=0).reshape(-1,1) ) - adata.layers["count"][:, np.where(label==-1)[0]] = 0 - print("filter out {} outlier genes.".format( np.sum(label==-1) )) + label = clf.fit_predict(np.sum(adata.layers["count"], axis=0).reshape(-1, 1)) + adata.layers["count"][:, np.where(label == -1)[0]] = 0 + print("filter out {} outlier genes.".format(np.sum(label == -1))) if not normalidx_file is None: - normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:,0].values + normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:, 0].values adata.obs["tumor_annotation"] = "tumor" adata.obs["tumor_annotation"][adata.obs.index.isin(normal_barcodes)] = 
"normal" - print( adata.obs["tumor_annotation"].value_counts() ) - + print(adata.obs["tumor_annotation"].value_counts()) + return adata, cell_snp_Aallele.A, cell_snp_Ballele.A, unique_snp_ids -def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_file, filterregion_file, normalidx_file, min_snpumis=50, min_percent_expressed_spots=0.005): +def load_joint_data( + input_filelist, + snp_dir, + alignment_files, + filtergenelist_file, + filterregion_file, + normalidx_file, + min_snpumis=50, + min_percent_expressed_spots=0.005, +): ##### read meta sample info ##### df_meta = pd.read_csv(input_filelist, sep="\t", header=None) - df_meta.rename(columns=dict(zip( df_meta.columns[:3], ["bam", "sample_id", "spaceranger_dir"] )), inplace=True) + df_meta.rename( + columns=dict(zip(df_meta.columns[:3], ["bam", "sample_id", "spaceranger_dir"])), + inplace=True, + ) logger.info(f"Input spaceranger file list {input_filelist} contains:") logger.info(df_meta) - df_barcode = pd.read_csv(f"{snp_dir}/barcodes.txt", header=None, names=["combined_barcode"]) - df_barcode["sample_id"] = [x.split("_")[-1] for x in df_barcode.combined_barcode.values] - df_barcode["barcode"] = [x.split("_")[0] for x in df_barcode.combined_barcode.values] + df_barcode = pd.read_csv( + f"{snp_dir}/barcodes.txt", header=None, names=["combined_barcode"] + ) + df_barcode["sample_id"] = [ + x.split("_")[-1] for x in df_barcode.combined_barcode.values + ] + df_barcode["barcode"] = [ + x.split("_")[0] for x in df_barcode.combined_barcode.values + ] ##### read SNP count ##### cell_snp_Aallele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Aallele.npz") cell_snp_Ballele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Ballele.npz") unique_snp_ids = np.load(f"{snp_dir}/unique_snp_ids.npy", allow_pickle=True) - snp_barcodes = pd.read_csv(f"{snp_dir}/barcodes.txt", header=None, names=["barcodes"]) + snp_barcodes = pd.read_csv( + f"{snp_dir}/barcodes.txt", header=None, names=["barcodes"] + ) assert (len(alignment_files) == 0) or (len(alignment_files) + 1 == df_meta.shape[0]) ##### read anndata and coordinate ##### # add position adata = None - for i,sname in enumerate(df_meta.sample_id.values): + for i, sname in enumerate(df_meta.sample_id.values): # locate the corresponding rows in df_meta index = np.where(df_barcode["sample_id"] == sname)[0] df_this_barcode = copy.copy(df_barcode.iloc[index, :]) df_this_barcode.index = df_this_barcode.barcode # read adata count info - if Path(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5").exists(): - adatatmp = sc.read_10x_h5(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5") - elif Path(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5ad").exists(): - adatatmp = sc.read_h5ad(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5ad") + if Path( + f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5" + ).exists(): + adatatmp = sc.read_10x_h5( + f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5" + ) + elif Path( + f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5ad" + ).exists(): + adatatmp = sc.read_h5ad( + f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5ad" + ) else: - logging.error(f"{df_meta['spaceranger_dir'].iloc[i]} directory doesn't have a filtered_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5ad file!") + logging.error( + f"{df_meta['spaceranger_dir'].iloc[i]} directory doesn't have a filtered_feature_bc_matrix.h5 or 
filtered_feature_bc_matrix.h5ad file!" + ) adatatmp.layers["count"] = adatatmp.X.A # reorder anndata spots to have the same order as df_this_barcode - idx_argsort = pd.Categorical(adatatmp.obs.index, categories=list(df_this_barcode.barcode), ordered=True).argsort() + idx_argsort = pd.Categorical( + adatatmp.obs.index, categories=list(df_this_barcode.barcode), ordered=True + ).argsort() adatatmp = adatatmp[idx_argsort, :] # read position info - if Path(f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions.csv").exists(): - df_this_pos = pd.read_csv(f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions.csv", sep=",", header=0, \ - names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"]) - elif Path(f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions_list.csv").exists(): - df_this_pos = pd.read_csv(f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions_list.csv", sep=",", header=None, \ - names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"]) + if Path( + f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions.csv" + ).exists(): + df_this_pos = pd.read_csv( + f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions.csv", + sep=",", + header=0, + names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"], + ) + elif Path( + f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions_list.csv" + ).exists(): + df_this_pos = pd.read_csv( + f"{df_meta['spaceranger_dir'].iloc[i]}/spatial/tissue_positions_list.csv", + sep=",", + header=None, + names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"], + ) else: raise Exception("No spatial coordinate file!") df_this_pos = df_this_pos[df_this_pos.in_tissue == True] @@ -177,7 +282,9 @@ def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_fil df_this_pos = df_this_pos[df_this_pos.barcode.isin(shared_barcodes)] # # df_this_pos.barcode = pd.Categorical(df_this_pos.barcode, categories=list(df_this_barcode.barcode), ordered=True) - df_this_pos.barcode = pd.Categorical(df_this_pos.barcode, categories=list(adatatmp.obs.index), ordered=True) + df_this_pos.barcode = pd.Categorical( + df_this_pos.barcode, categories=list(adatatmp.obs.index), ordered=True + ) df_this_pos.sort_values(by="barcode", inplace=True) adatatmp.obsm["X_pos"] = np.vstack([df_this_pos.x, df_this_pos.y]).T adatatmp.obs["sample"] = sname @@ -197,7 +304,12 @@ def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_fil cell_snp_Ballele = cell_snp_Ballele[snp_barcodes.barcodes.isin(shared_barcodes), :] snp_barcodes = snp_barcodes[snp_barcodes.barcodes.isin(shared_barcodes)] adata = adata[adata.obs.index.isin(shared_barcodes), :] - adata = adata[ pd.Categorical(adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True).argsort(), : ] + adata = adata[ + pd.Categorical( + adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True + ).argsort(), + :, + ] ##### load pairwise alignments ##### # TBD: directly convert to big "adjacency" matrix @@ -208,87 +320,132 @@ def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_fil col_ind = [] dat = [] offset = 0 - for i,f in enumerate(alignment_files): + for i, f in enumerate(alignment_files): pi = np.load(f) # normalize p such that max( rowsum(pi), colsum(pi) ) = 1, max alignment weight = 1 - pi = pi / np.max( np.append(np.sum(pi,axis=0), np.sum(pi,axis=1)) ) + pi = pi / np.max(np.append(np.sum(pi, axis=0), np.sum(pi, axis=1))) sname1 = df_meta.sample_id.values[i] - 
sname2 = df_meta.sample_id.values[i+1] - assert pi.shape[0] == np.sum(df_barcode["sample_id"] == sname1) # double check whether this is correct - assert pi.shape[1] == np.sum(df_barcode["sample_id"] == sname2) # or the dimension should be flipped + sname2 = df_meta.sample_id.values[i + 1] + assert pi.shape[0] == np.sum( + df_barcode["sample_id"] == sname1 + ) # double check whether this is correct + assert pi.shape[1] == np.sum( + df_barcode["sample_id"] == sname2 + ) # or the dimension should be flipped # for each spot s in sname1, select {t: spot t in sname2 and pi[s,t] >= np.max(pi[s,:])} as the corresponding spot in the other slice for row in range(pi.shape[0]): - cutoff = np.max(pi[row,:]) if np.max(pi[row,:]) > EPS else 1+EPS + cutoff = np.max(pi[row, :]) if np.max(pi[row, :]) > EPS else 1 + EPS list_cols = np.where(pi[row, :] >= cutoff - EPS)[0] row_ind += [offset + row] * len(list_cols) - col_ind += list( offset + pi.shape[0] + list_cols ) + col_ind += list(offset + pi.shape[0] + list_cols) dat += list(pi[row, list_cols]) offset += pi.shape[0] - across_slice_adjacency_mat = scipy.sparse.csr_matrix((dat, (row_ind, col_ind) ), shape=(adata.shape[0], adata.shape[0])) + across_slice_adjacency_mat = scipy.sparse.csr_matrix( + (dat, (row_ind, col_ind)), shape=(adata.shape[0], adata.shape[0]) + ) across_slice_adjacency_mat += across_slice_adjacency_mat.T - + # filter out spots with too small number of UMIs - indicator = (np.sum(adata.layers["count"], axis=1) >= min_snpumis) + indicator = np.sum(adata.layers["count"], axis=1) >= min_snpumis adata = adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] if not (across_slice_adjacency_mat is None): - across_slice_adjacency_mat = across_slice_adjacency_mat[indicator,:][:,indicator] + across_slice_adjacency_mat = across_slice_adjacency_mat[indicator, :][ + :, indicator + ] # filter out spots with too small number of SNP-covering UMIs - indicator = ( np.sum(cell_snp_Aallele, axis=1).A.flatten() + np.sum(cell_snp_Ballele, axis=1).A.flatten() >= min_snpumis ) + indicator = ( + np.sum(cell_snp_Aallele, axis=1).A.flatten() + + np.sum(cell_snp_Ballele, axis=1).A.flatten() + >= min_snpumis + ) adata = adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] if not (across_slice_adjacency_mat is None): - across_slice_adjacency_mat = across_slice_adjacency_mat[indicator,:][:,indicator] + across_slice_adjacency_mat = across_slice_adjacency_mat[indicator, :][ + :, indicator + ] # filter out genes that are expressed in 0, axis=0) >= min_percent_expressed_spots * adata.shape[0]).A.flatten() + indicator = ( + np.sum(adata.X > 0, axis=0) >= min_percent_expressed_spots * adata.shape[0] + ).A.flatten() genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] print(adata) - print("median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print( + "median UMI after filtering out genes < 0.5% of cells = {}".format( + np.median(np.sum(adata.layers["count"], axis=1)) + ) + ) if not filtergenelist_file is None: filter_gene_list = pd.read_csv(filtergenelist_file, header=None) - filter_gene_list = set(list( filter_gene_list.iloc[:,0] )) - indicator_filter = np.array([ (not x in filter_gene_list) for x in adata.var.index ]) + filter_gene_list = set(list(filter_gene_list.iloc[:, 0])) + indicator_filter = np.array( + [(not x in filter_gene_list) for x in 
adata.var.index] + ) adata = adata[:, indicator_filter] - print("median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print( + "median UMI after filtering out genes in filtergenelist_file = {}".format( + np.median(np.sum(adata.layers["count"], axis=1)) + ) + ) if not filterregion_file is None: - regions = pd.read_csv(filterregion_file, header=None, sep="\t", names=["Chrname", "Start", "End"]) + regions = pd.read_csv( + filterregion_file, header=None, sep="\t", names=["Chrname", "Start", "End"] + ) if "chr" in regions.Chrname.iloc[0]: regions["CHR"] = [int(x[3:]) for x in regions.Chrname.values] else: - regions.rename(columns={'Chrname':'CHR'}, inplace=True) + regions.rename(columns={"Chrname": "CHR"}, inplace=True) regions.sort_values(by=["CHR", "Start"], inplace=True) indicator_filter = np.array([True] * cell_snp_Aallele.shape[1]) j = 0 for i in range(cell_snp_Aallele.shape[1]): this_chr = int(unique_snp_ids[i].split("_")[0]) this_pos = int(unique_snp_ids[i].split("_")[1]) - while j < regions.shape[0] and ( (regions.CHR.values[j] < this_chr) or ((regions.CHR.values[j] == this_chr) and (regions.End.values[j] <= this_pos)) ): + while j < regions.shape[0] and ( + (regions.CHR.values[j] < this_chr) + or ( + (regions.CHR.values[j] == this_chr) + and (regions.End.values[j] <= this_pos) + ) + ): j += 1 - if j < regions.shape[0] and (regions.CHR.values[j] == this_chr) and (regions.Start.values[j] <= this_pos) and (regions.End.values[j] > this_pos): + if ( + j < regions.shape[0] + and (regions.CHR.values[j] == this_chr) + and (regions.Start.values[j] <= this_pos) + and (regions.End.values[j] > this_pos) + ): indicator_filter[i] = False cell_snp_Aallele = cell_snp_Aallele[:, indicator_filter] cell_snp_Ballele = cell_snp_Ballele[:, indicator_filter] unique_snp_ids = unique_snp_ids[indicator_filter] - + clf = LocalOutlierFactor(n_neighbors=200) - label = clf.fit_predict( np.sum(adata.layers["count"], axis=0).reshape(-1,1) ) - adata.layers["count"][:, np.where(label==-1)[0]] = 0 - print("filter out {} outlier genes.".format( np.sum(label==-1) )) + label = clf.fit_predict(np.sum(adata.layers["count"], axis=0).reshape(-1, 1)) + adata.layers["count"][:, np.where(label == -1)[0]] = 0 + print("filter out {} outlier genes.".format(np.sum(label == -1))) if not normalidx_file is None: - normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:,0].values + normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:, 0].values adata.obs["tumor_annotation"] = "tumor" adata.obs["tumor_annotation"][adata.obs.index.isin(normal_barcodes)] = "normal" - print( adata.obs["tumor_annotation"].value_counts() ) + print(adata.obs["tumor_annotation"].value_counts()) - return adata, cell_snp_Aallele.A, cell_snp_Ballele.A, unique_snp_ids, across_slice_adjacency_mat + return ( + adata, + cell_snp_Aallele.A, + cell_snp_Ballele.A, + unique_snp_ids, + across_slice_adjacency_mat, + ) def load_slidedna_data(snp_dir, bead_file, filterregion_bedfile): @@ -296,14 +453,19 @@ def load_slidedna_data(snp_dir, bead_file, filterregion_bedfile): cell_snp_Ballele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Ballele.npz") unique_snp_ids = np.load(f"{snp_dir}/unique_snp_ids.npy", allow_pickle=True) barcodes = pd.read_csv(f"{snp_dir}/barcodes.txt", header=None, index_col=None) - barcodes = barcodes.iloc[:,0].values + barcodes = barcodes.iloc[:, 0].values # add spatial position df_pos = pd.read_csv(bead_file, header=0, sep=",", index_col=None) coords = 
np.vstack([df_pos.xcoord, df_pos.ycoord]).T # remove SNPs within filterregion_bedfile if not filterregion_bedfile is None: - df_filter = pd.read_csv(filterregion_bedfile, header=None, sep="\t", names=["chrname", "start", "end"]) - df_filter = df_filter[df_filter.chrname.isin( [f"chr{i}" for i in range(1,23)] )] + df_filter = pd.read_csv( + filterregion_bedfile, + header=None, + sep="\t", + names=["chrname", "start", "end"], + ) + df_filter = df_filter[df_filter.chrname.isin([f"chr{i}" for i in range(1, 23)])] df_filter["CHR"] = [int(x[3:]) for x in df_filter.chrname] df_filter.sort_values(by=["CHR", "start"]) # check whether unique_snp_ids are within the regions in df_filter @@ -315,9 +477,15 @@ def load_slidedna_data(snp_dir, bead_file, filterregion_bedfile): is_within_filterregion = [] j = 0 for i in range(len(unique_snp_ids)): - while (filter_chrs[j] < snp_chrs[i]) or ((filter_chrs[j] == snp_chrs[i]) and (filter_end[j] < snp_pos[i])): + while (filter_chrs[j] < snp_chrs[i]) or ( + (filter_chrs[j] == snp_chrs[i]) and (filter_end[j] < snp_pos[i]) + ): j += 1 - if filter_chrs[j] == snp_chrs[i] and filter_start[j] <= snp_pos[i] and filter_end[j] >= snp_pos[i]: + if ( + filter_chrs[j] == snp_chrs[i] + and filter_start[j] <= snp_pos[i] + and filter_end[j] >= snp_pos[i] + ): is_within_filterregion.append(True) else: is_within_filterregion.append(False) @@ -329,45 +497,88 @@ def load_slidedna_data(snp_dir, bead_file, filterregion_bedfile): return coords, cell_snp_Aallele, cell_snp_Ballele, barcodes, unique_snp_ids -def taking_shared_barcodes(snp_barcodes, cell_snp_Aallele, cell_snp_Ballele, adata, df_pos): +def taking_shared_barcodes( + snp_barcodes, cell_snp_Aallele, cell_snp_Ballele, adata, df_pos +): # shared barcodes between adata and SNPs - shared_barcodes = set(list(snp_barcodes.barcodes)) & set(list(adata.obs.index)) & set(list(df_pos.barcode)) + shared_barcodes = ( + set(list(snp_barcodes.barcodes)) + & set(list(adata.obs.index)) + & set(list(df_pos.barcode)) + ) cell_snp_Aallele = cell_snp_Aallele[snp_barcodes.barcodes.isin(shared_barcodes), :] cell_snp_Ballele = cell_snp_Ballele[snp_barcodes.barcodes.isin(shared_barcodes), :] snp_barcodes = snp_barcodes[snp_barcodes.barcodes.isin(shared_barcodes)] adata = adata[adata.obs.index.isin(shared_barcodes), :] - adata = adata[ pd.Categorical(adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True).argsort(), : ] + adata = adata[ + pd.Categorical( + adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True + ).argsort(), + :, + ] df_pos = df_pos[df_pos.barcode.isin(shared_barcodes)] - df_pos = df_pos.iloc[ pd.Categorical(df_pos.barcode, categories=list(snp_barcodes.barcodes), ordered=True).argsort(), : ] + df_pos = df_pos.iloc[ + pd.Categorical( + df_pos.barcode, categories=list(snp_barcodes.barcodes), ordered=True + ).argsort(), + :, + ] return snp_barcodes, cell_snp_Aallele, cell_snp_Ballele, adata, df_pos -def filter_genes_barcodes_hatchetblock(adata, cell_snp_Aallele, cell_snp_Ballele, snp_barcodes, unique_snp_ids, config, min_umi=100, min_spot_percent=0.005, ordered_chr=[str(c) for c in range(1,23)]): +def filter_genes_barcodes_hatchetblock( + adata, + cell_snp_Aallele, + cell_snp_Ballele, + snp_barcodes, + unique_snp_ids, + config, + min_umi=100, + min_spot_percent=0.005, + ordered_chr=[str(c) for c in range(1, 23)], +): # filter out spots with too small number of UMIs - indicator = (np.sum(adata.layers["count"], axis=1) > min_umi) + indicator = np.sum(adata.layers["count"], axis=1) > min_umi adata = 
adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] # filter out genes that are expressed in <0.5% cells - indicator = (np.sum(adata.X > 0, axis=0) >= min_spot_percent * adata.shape[0]).A.flatten() + indicator = ( + np.sum(adata.X > 0, axis=0) >= min_spot_percent * adata.shape[0] + ).A.flatten() genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] print(adata) - print("median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print( + "median UMI after filtering out genes < 0.5% of cells = {}".format( + np.median(np.sum(adata.layers["count"], axis=1)) + ) + ) if not config["filtergenelist_file"] is None: filter_gene_list = pd.read_csv(config["filtergenelist_file"], header=None) - filter_gene_list = set(list( filter_gene_list.iloc[:,0] )) - indicator_filter = np.array([ (not x in filter_gene_list) for x in adata.var.index ]) + filter_gene_list = set(list(filter_gene_list.iloc[:, 0])) + indicator_filter = np.array( + [(not x in filter_gene_list) for x in adata.var.index] + ) adata = adata[:, indicator_filter] - print("median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print( + "median UMI after filtering out genes in filtergenelist_file = {}".format( + np.median(np.sum(adata.layers["count"], axis=1)) + ) + ) if not config["filterregion_file"] is None: - regions = pd.read_csv(config["filterregion_file"], header=None, sep="\t", names=["Chrname", "Start", "End"]) - ordered_chr_map = {ordered_chr[i]:i for i in range(len(ordered_chr))} + regions = pd.read_csv( + config["filterregion_file"], + header=None, + sep="\t", + names=["Chrname", "Start", "End"], + ) + ordered_chr_map = {ordered_chr[i]: i for i in range(len(ordered_chr))} # retain only chromosomes in ordered_chr - if ~np.any( regions.Chrname.isin(ordered_chr) ): + if ~np.any(regions.Chrname.isin(ordered_chr)): regions["Chrname"] = regions.Chrname.map(lambda x: x.replace("chr", "")) regions = regions[regions.Chrname.isin(ordered_chr)] regions["int_chrom"] = regions.Chrname.map(ordered_chr_map) @@ -377,9 +588,20 @@ def filter_genes_barcodes_hatchetblock(adata, cell_snp_Aallele, cell_snp_Ballele for i in range(cell_snp_Aallele.shape[1]): this_chr = int(unique_snp_ids[i].split("_")[0]) this_pos = int(unique_snp_ids[i].split("_")[1]) - while j < regions.shape[0] and ( (regions.int_chrom.values[j] < this_chr) or ((regions.int_chrom.values[j] == this_chr) and (regions.End.values[j] <= this_pos)) ): + while j < regions.shape[0] and ( + (regions.int_chrom.values[j] < this_chr) + or ( + (regions.int_chrom.values[j] == this_chr) + and (regions.End.values[j] <= this_pos) + ) + ): j += 1 - if j < regions.shape[0] and (regions.int_chrom.values[j] == this_chr) and (regions.Start.values[j] <= this_pos) and (regions.End.values[j] > this_pos): + if ( + j < regions.shape[0] + and (regions.int_chrom.values[j] == this_chr) + and (regions.Start.values[j] <= this_pos) + and (regions.End.values[j] > this_pos) + ): indicator_filter[i] = False cell_snp_Aallele = cell_snp_Aallele[:, indicator_filter] cell_snp_Ballele = cell_snp_Ballele[:, indicator_filter] @@ -393,7 +615,7 @@ def read_bias_correction_info(bc_file): df_info = pd.read_csv(bc_file, header=None, sep="\t") except: df_info = pd.read_csv(bc_file, header=0, sep="\t") - return df_info.iloc[:,-1].values + return df_info.iloc[:, -1].values def binning_readcount_using_SNP(df_bins, 
sorted_chr_pos_first): @@ -413,37 +635,62 @@ def binning_readcount_using_SNP(df_bins, sorted_chr_pos_first): # move the cursort on sorted_chr_pos_first such that the chr matches that in df_bins while this_chr != sorted_chr_pos_first[idx][0]: idx += 1 - while idx + 1 < len(sorted_chr_pos_first) and this_chr == sorted_chr_pos_first[idx+1][0] and mid > sorted_chr_pos_first[idx+1][1]: + while ( + idx + 1 < len(sorted_chr_pos_first) + and this_chr == sorted_chr_pos_first[idx + 1][0] + and mid > sorted_chr_pos_first[idx + 1][1] + ): idx += 1 multiplier[i, idx] = 1 return multiplier - -def load_slidedna_readcount(countfile, bead_file, binfile, normalfile, bias_correction_filelist, retained_barcodes, retain_chr_list=np.arange(1,23)): + +def load_slidedna_readcount( + countfile, + bead_file, + binfile, + normalfile, + bias_correction_filelist, + retained_barcodes, + retain_chr_list=np.arange(1, 23), +): # load counts and the corresponding barcodes per spot in counts tmpcounts = np.loadtxt(countfile) - counts = scipy.sparse.csr_matrix(( tmpcounts[:,2], (tmpcounts[:,0].astype(int)-1, tmpcounts[:,1].astype(int)-1) )) + counts = scipy.sparse.csr_matrix( + ( + tmpcounts[:, 2], + (tmpcounts[:, 0].astype(int) - 1, tmpcounts[:, 1].astype(int) - 1), + ) + ) tmpdf = pd.read_csv(bead_file, header=0, sep=",", index_col=0) - tmpdf = tmpdf.join( pd.DataFrame(counts.A, index=tmpdf.index)) + tmpdf = tmpdf.join(pd.DataFrame(counts.A, index=tmpdf.index)) # keep only the spots in retained_barcodes tmpdf = tmpdf[tmpdf.index.isin(retained_barcodes)] # reorder by retained_barcodes - tmpdf.index = pd.Categorical(tmpdf.index, categories=retained_barcodes, ordered=True) + tmpdf.index = pd.Categorical( + tmpdf.index, categories=retained_barcodes, ordered=True + ) tmpdf.sort_index(inplace=True) counts = tmpdf.values[:, 2:] # load normal counts - normal_cov = pd.read_csv(normalfile, header=None, sep="\t").values[:,-1].astype(float) + normal_cov = ( + pd.read_csv(normalfile, header=None, sep="\t").values[:, -1].astype(float) + ) # load bin info df_bins = pd.read_csv(binfile, comment="#", header=None, index_col=None, sep="\t") old_names = df_bins.columns[:3] df_bins.rename(columns=dict(zip(old_names, ["CHR", "START", "END"])), inplace=True) - + # select bins according to retain_chr_list - retain_chr_list_append = list(retain_chr_list) + [str(x) for x in retain_chr_list] + [f"chr{x}" for x in retain_chr_list] + retain_chr_list_append = ( + list(retain_chr_list) + + [str(x) for x in retain_chr_list] + + [f"chr{x}" for x in retain_chr_list] + ) bidx = np.where(df_bins.CHR.isin(retain_chr_list_append))[0] - df_bins = df_bins.iloc[bidx,:] + df_bins = df_bins.iloc[bidx, :] counts = counts[:, bidx] normal_cov = normal_cov[bidx] @@ -458,40 +705,75 @@ def load_slidedna_readcount(countfile, bead_file, binfile, normalfile, bias_corr bias_features = [] for f in bias_correction_filelist: this_feature = read_bias_correction_info(f) - bias_features.append( this_feature[bidx] ) + bias_features.append(this_feature[bidx]) bias_features = np.array(bias_features).T # kernel ridge regression to predict the read count per bin # the prediction serves as a baseline of the expected read count, and plays a role in base_nb_mean krr = KernelRidge(alpha=0.2) - krr.fit( bias_features, np.sum(counts, axis=0) / np.sum(counts) ) - pred = krr.predict( bias_features ) + krr.fit(bias_features, np.sum(counts, axis=0) / np.sum(counts)) + pred = krr.predict(bias_features) # single_base_nb_mean from bias correction + expected normal - single_base_nb_mean = (pred * 
normal_cov).reshape(-1,1) / np.sum(pred * normal_cov) * np.sum(counts, axis=1).reshape(1,-1) + single_base_nb_mean = ( + (pred * normal_cov).reshape(-1, 1) + / np.sum(pred * normal_cov) + * np.sum(counts, axis=1).reshape(1, -1) + ) # single_base_nb_mean = pred.reshape(-1,1) / np.sum(pred) * np.sum(counts, axis=1).reshape(1,-1) # remove too low baseline - threshold = np.median( np.sum(single_base_nb_mean, axis=1) / df_bins.iloc[:,3].values.astype(float) ) * 0.5 - idx_filter = np.where( np.sum(single_base_nb_mean, axis=1) / df_bins.iloc[:,3].values.astype(float) < threshold )[0] + threshold = ( + np.median( + np.sum(single_base_nb_mean, axis=1) + / df_bins.iloc[:, 3].values.astype(float) + ) + * 0.5 + ) + idx_filter = np.where( + np.sum(single_base_nb_mean, axis=1) / df_bins.iloc[:, 3].values.astype(float) + < threshold + )[0] single_base_nb_mean[idx_filter, :] = 0 counts[:, idx_filter] = 0 return counts, single_base_nb_mean, df_bins, normal_cov - -def get_slidednaseq_rdr(countfile, bead_file, binfile, normalfile, bias_correction_filelist, retained_barcodes, sorted_chr_pos_first, single_X, single_base_nb_mean, retain_chr_list=np.arange(1,23)): - counts, single_base_nb_mean, df_bins, _ = load_slidedna_readcount(countfile, bead_file, binfile, normalfile, bias_correction_filelist, retained_barcodes) + +def get_slidednaseq_rdr( + countfile, + bead_file, + binfile, + normalfile, + bias_correction_filelist, + retained_barcodes, + sorted_chr_pos_first, + single_X, + single_base_nb_mean, + retain_chr_list=np.arange(1, 23), +): + counts, single_base_nb_mean, df_bins, _ = load_slidedna_readcount( + countfile, + bead_file, + binfile, + normalfile, + bias_correction_filelist, + retained_barcodes, + ) # remove bins with low-coverage single_base_nb_mean - + multiplier = binning_readcount_using_SNP(df_bins, sorted_chr_pos_first) - single_X[:,0,:] = multiplier.T @ counts.T + single_X[:, 0, :] = multiplier.T @ counts.T single_base_nb_mean = multiplier.T @ single_base_nb_mean return single_X, single_base_nb_mean -def filter_slidedna_spot_by_adjacency(coords, cell_snp_Aallele, cell_snp_Ballele, barcodes): +def filter_slidedna_spot_by_adjacency( + coords, cell_snp_Aallele, cell_snp_Ballele, barcodes +): # distance to center - dist = np.sqrt(np.sum(np.square(coords - np.median(coords, axis=0, keepdims=True)), axis=1)) + dist = np.sqrt( + np.sum(np.square(coords - np.median(coords, axis=0, keepdims=True)), axis=1) + ) idx_keep = np.where(dist < 2500)[0] # remove spots coords = coords[idx_keep, :] @@ -504,15 +786,38 @@ def filter_slidedna_spot_by_adjacency(coords, cell_snp_Aallele, cell_snp_Ballele def combine_gene_snps(unique_snp_ids, hgtable_file, adata): # read gene info and keep only chr1-chr22 and genes appearing in adata df_hgtable = pd.read_csv(hgtable_file, header=0, index_col=0, sep="\t") - df_hgtable = df_hgtable[df_hgtable.chrom.isin( [f"chr{i}" for i in range(1, 23)] )] + df_hgtable = df_hgtable[df_hgtable.chrom.isin([f"chr{i}" for i in range(1, 23)])] df_hgtable = df_hgtable[df_hgtable.name2.isin(adata.var.index)] # a data frame including both gene and SNP info: CHR, START, END, snp_id, gene, is_interval - df_gene_snp = pd.DataFrame({"CHR":[int(x[3:]) for x in df_hgtable.chrom.values], "START":df_hgtable.cdsStart.values, "END":df_hgtable.cdsEnd.values, \ - "snp_id":None, "gene":df_hgtable.name2.values, "is_interval":True}) + df_gene_snp = pd.DataFrame( + { + "CHR": [int(x[3:]) for x in df_hgtable.chrom.values], + "START": df_hgtable.cdsStart.values, + "END": df_hgtable.cdsEnd.values, + "snp_id": None, 
+ "gene": df_hgtable.name2.values, + "is_interval": True, + } + ) # add SNP info snp_chr = np.array([int(x.split("_")[0]) for x in unique_snp_ids]) snp_pos = np.array([int(x.split("_")[1]) for x in unique_snp_ids]) - df_gene_snp = pd.concat([df_gene_snp, pd.DataFrame({"CHR":snp_chr, "START":snp_pos, "END":snp_pos+1, "snp_id":unique_snp_ids, "gene":None, "is_interval":False}) ], ignore_index=True) + df_gene_snp = pd.concat( + [ + df_gene_snp, + pd.DataFrame( + { + "CHR": snp_chr, + "START": snp_pos, + "END": snp_pos + 1, + "snp_id": unique_snp_ids, + "gene": None, + "is_interval": False, + } + ), + ], + ignore_index=True, + ) df_gene_snp.sort_values(by=["CHR", "START"], inplace=True) # check the what gene each SNP belongs to @@ -526,18 +831,29 @@ def combine_gene_snps(unique_snp_ids, hgtable_file, adata): continue this_pos = vec_start[i] j = i - 1 - while j >= 0 and j >= i-50 and vec_chr[i] == vec_chr[j]: - if vec_is_interval[j] and vec_start[j] <= this_pos and vec_end[j] > this_pos: + while j >= 0 and j >= i - 50 and vec_chr[i] == vec_chr[j]: + if ( + vec_is_interval[j] + and vec_start[j] <= this_pos + and vec_end[j] > this_pos + ): df_gene_snp.iloc[i, 4] = df_gene_snp.iloc[j]["gene"] break j -= 1 - + # remove SNPs that have no corresponding genes df_gene_snp = df_gene_snp[~df_gene_snp.gene.isnull()] return df_gene_snp -def create_haplotype_block_ranges(df_gene_snp, adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, initial_min_umi=15): +def create_haplotype_block_ranges( + df_gene_snp, + adata, + cell_snp_Aallele, + cell_snp_Ballele, + unique_snp_ids, + initial_min_umi=15, +): """ Initially block SNPs along genome. @@ -547,73 +863,124 @@ def create_haplotype_block_ranges(df_gene_snp, adata, cell_snp_Aallele, cell_snp Gene and SNP info combined into a single data frame sorted by genomic positions. "is_interval" suggest whether the entry is a gene or a SNP. "gene" column either contain gene name if the entry is a gene, or the gene a SNP belongs to if the entry is a SNP. 
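+
+    Returns the same data frame with an added "block_id" column. Summarizing the
+    logic below: gene regions are first merged whenever they overlap on the same
+    chromosome, and consecutive regions are then concatenated until the running
+    block has at least initial_min_umi SNP-covering UMIs summed over all spots
+    (a trailing block that falls short is merged into the previous block on the
+    same chromosome when possible).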
""" # first level: partition of genome: by gene regions (if two genes overlap, they are grouped to one region) - tmp_block_genome_intervals = list(zip( df_gene_snp[df_gene_snp.is_interval].CHR.values, df_gene_snp[df_gene_snp.is_interval].START.values, df_gene_snp[df_gene_snp.is_interval].END.values )) + tmp_block_genome_intervals = list( + zip( + df_gene_snp[df_gene_snp.is_interval].CHR.values, + df_gene_snp[df_gene_snp.is_interval].START.values, + df_gene_snp[df_gene_snp.is_interval].END.values, + ) + ) block_genome_intervals = [tmp_block_genome_intervals[0]] for x in tmp_block_genome_intervals[1:]: # check whether overlap with previous block - if x[0] == block_genome_intervals[-1][0] and max(x[1], block_genome_intervals[-1][1]) < min(x[2], block_genome_intervals[-1][2]): - block_genome_intervals[-1] = (x[0], min(x[1], block_genome_intervals[-1][1]), max(x[2], block_genome_intervals[-1][2])) + if x[0] == block_genome_intervals[-1][0] and max( + x[1], block_genome_intervals[-1][1] + ) < min(x[2], block_genome_intervals[-1][2]): + block_genome_intervals[-1] = ( + x[0], + min(x[1], block_genome_intervals[-1][1]), + max(x[2], block_genome_intervals[-1][2]), + ) else: block_genome_intervals.append(x) # get block_ranges in the index of df_gene_snp block_ranges = [] for x in block_genome_intervals: - indexes = np.where((df_gene_snp.CHR.values == x[0]) & \ - (np.maximum(df_gene_snp.START.values, x[1]) < np.minimum(df_gene_snp.END.values, x[2])) )[0] - block_ranges.append( (indexes[0], indexes[-1]+1) ) - assert np.all( np.array(np.array([x[1] for x in block_ranges[:-1]])) == np.array(np.array([x[0] for x in block_ranges[1:]])) ) + indexes = np.where( + (df_gene_snp.CHR.values == x[0]) + & ( + np.maximum(df_gene_snp.START.values, x[1]) + < np.minimum(df_gene_snp.END.values, x[2]) + ) + )[0] + block_ranges.append((indexes[0], indexes[-1] + 1)) + assert np.all( + np.array(np.array([x[1] for x in block_ranges[:-1]])) + == np.array(np.array([x[0] for x in block_ranges[1:]])) + ) # record the initial block id in df_gene_snps df_gene_snp["initial_block_id"] = 0 - for i,x in enumerate(block_ranges): - df_gene_snp.iloc[x[0]:x[1], -1] = i + for i, x in enumerate(block_ranges): + df_gene_snp.iloc[x[0] : x[1], -1] = i # second level: group the first level blocks into haplotype blocks such that the minimum SNP-covering UMI counts >= initial_min_umi - map_snp_index = {x:i for i,x in enumerate(unique_snp_ids)} - initial_block_chr = df_gene_snp.CHR.values[ np.array([x[0] for x in block_ranges]) ] + map_snp_index = {x: i for i, x in enumerate(unique_snp_ids)} + initial_block_chr = df_gene_snp.CHR.values[np.array([x[0] for x in block_ranges])] block_ranges_new = [] s = 0 while s < len(block_ranges): t = s while t <= len(block_ranges): t += 1 - reach_end = (t == len(block_ranges)) - change_chr = (initial_block_chr[s] != initial_block_chr[t-1]) + reach_end = t == len(block_ranges) + change_chr = initial_block_chr[s] != initial_block_chr[t - 1] # count SNP-covering UMI - involved_snps_ids = df_gene_snp[ (df_gene_snp.initial_block_id>=s) & (df_gene_snp.initial_block_id= s) & (df_gene_snp.initial_block_id < t) + ].snp_id involved_snps_ids = involved_snps_ids[~involved_snps_ids.isnull()].values involved_snp_idx = np.array([map_snp_index[x] for x in involved_snps_ids]) - this_snp_umis = 0 if len(involved_snp_idx) == 0 else np.sum(cell_snp_Aallele[:, involved_snp_idx]) + np.sum(cell_snp_Ballele[:, involved_snp_idx]) + this_snp_umis = ( + 0 + if len(involved_snp_idx) == 0 + else np.sum(cell_snp_Aallele[:, involved_snp_idx]) 
+ + np.sum(cell_snp_Ballele[:, involved_snp_idx]) + ) if reach_end: break if change_chr: t -= 1 # re-count SNP-covering UMIs - involved_snps_ids = df_gene_snp.snp_id.iloc[block_ranges[s][0]:block_ranges[t-1][1]] - involved_snps_ids = involved_snps_ids[~involved_snps_ids.isnull()].values - involved_snp_idx = np.array([map_snp_index[x] for x in involved_snps_ids]) - this_snp_umis = 0 if len(involved_snp_idx) == 0 else np.sum( cell_snp_Aallele[:, involved_snp_idx]) + np.sum(cell_snp_Ballele[:, involved_snp_idx]) + involved_snps_ids = df_gene_snp.snp_id.iloc[ + block_ranges[s][0] : block_ranges[t - 1][1] + ] + involved_snps_ids = involved_snps_ids[ + ~involved_snps_ids.isnull() + ].values + involved_snp_idx = np.array( + [map_snp_index[x] for x in involved_snps_ids] + ) + this_snp_umis = ( + 0 + if len(involved_snp_idx) == 0 + else np.sum(cell_snp_Aallele[:, involved_snp_idx]) + + np.sum(cell_snp_Ballele[:, involved_snp_idx]) + ) break if this_snp_umis >= initial_min_umi: break # - if this_snp_umis < initial_min_umi and s > 0 and initial_block_chr[s-1] == initial_block_chr[s]: + if ( + this_snp_umis < initial_min_umi + and s > 0 + and initial_block_chr[s - 1] == initial_block_chr[s] + ): indexes = np.where(df_gene_snp.initial_block_id.isin(np.arange(s, t)))[0] - block_ranges_new[-1] = (block_ranges_new[-1][0], indexes[-1]+1) + block_ranges_new[-1] = (block_ranges_new[-1][0], indexes[-1] + 1) else: indexes = np.where(df_gene_snp.initial_block_id.isin(np.arange(s, t)))[0] - block_ranges_new.append( (indexes[0], indexes[-1]+1) ) + block_ranges_new.append((indexes[0], indexes[-1] + 1)) s = t - + # record the block id in df_gene_snps df_gene_snp["block_id"] = 0 - for i,x in enumerate(block_ranges_new): - df_gene_snp.iloc[x[0]:x[1], -1] = i + for i, x in enumerate(block_ranges_new): + df_gene_snp.iloc[x[0] : x[1], -1] = i df_gene_snp = df_gene_snp.drop(columns=["initial_block_id"]) return df_gene_snp -def summarize_counts_for_blocks(df_gene_snp, adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, nu, logphase_shift, geneticmap_file): +def summarize_counts_for_blocks( + df_gene_snp, + adata, + cell_snp_Aallele, + cell_snp_Ballele, + unique_snp_ids, + nu, + logphase_shift, + geneticmap_file, +): """ Attributes: ---------- @@ -630,7 +997,7 @@ def summarize_counts_for_blocks(df_gene_snp, adata, cell_snp_Aallele, cell_snp_B single_base_nb_mean : array, (n_blocks, n_spots) Baseline transcript counts in normal diploid per block per cell. - + single_total_bb_RD : array, (n_blocks, n_spots) Total allele count per block per cell. 
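As a rough, standalone illustration of the second-level grouping reformatted above -- greedily extending a block until its pooled SNP-covering UMI count reaches initial_min_umi, never extending across a chromosome boundary, and folding an undersized trailing block into its predecessor -- the sketch below uses simplified stand-in arrays (block_umi, block_chr) instead of the per-block counts derived from cell_snp_Aallele/cell_snp_Ballele; it is not the exact CalicoST implementation.

import numpy as np

def greedy_umi_blocks(block_umi, block_chr, initial_min_umi):
    # Merge consecutive first-level blocks until each merged block carries at
    # least `initial_min_umi` SNP-covering UMIs, without crossing chromosomes.
    ranges = []
    s, n = 0, len(block_umi)
    while s < n:
        t = s + 1
        while t < n and block_chr[t] == block_chr[s] and np.sum(block_umi[s:t]) < initial_min_umi:
            t += 1
        if ranges and block_chr[s] == block_chr[ranges[-1][0]] and np.sum(block_umi[s:t]) < initial_min_umi:
            # undersized leftover on the same chromosome: absorb it into the previous block
            ranges[-1] = (ranges[-1][0], t)
        else:
            ranges.append((s, t))
        s = t
    return ranges

# e.g. greedy_umi_blocks([120, 30, 40, 500], ["chr1"] * 4, 150) -> [(0, 2), (2, 4)]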
@@ -642,42 +1009,76 @@ def summarize_counts_for_blocks(df_gene_snp, adata, cell_snp_Aallele, cell_snp_B single_base_nb_mean = np.zeros((len(blocks), adata.shape[0])) single_total_bb_RD = np.zeros((len(blocks), adata.shape[0]), dtype=int) # summarize counts of involved genes and SNPs within each block - map_snp_index = {x:i for i,x in enumerate(unique_snp_ids)} - df_block_contents = df_gene_snp.groupby('block_id').agg({"snp_id":list, "gene":list}) + map_snp_index = {x: i for i, x in enumerate(unique_snp_ids)} + df_block_contents = df_gene_snp.groupby("block_id").agg( + {"snp_id": list, "gene": list} + ) for b in range(df_block_contents.shape[0]): # BAF (SNPs) - involved_snps_ids = [x for x in df_block_contents.snp_id.values[b] if not x is None] + involved_snps_ids = [ + x for x in df_block_contents.snp_id.values[b] if not x is None + ] involved_snp_idx = np.array([map_snp_index[x] for x in involved_snps_ids]) if len(involved_snp_idx) > 0: - single_X[b, 1, :] = np.sum( cell_snp_Aallele[:, involved_snp_idx], axis=1 ) - single_total_bb_RD[b, :] = np.sum( cell_snp_Aallele[:, involved_snp_idx], axis=1 ) + np.sum( cell_snp_Ballele[:, involved_snp_idx], axis=1 ) + single_X[b, 1, :] = np.sum(cell_snp_Aallele[:, involved_snp_idx], axis=1) + single_total_bb_RD[b, :] = np.sum( + cell_snp_Aallele[:, involved_snp_idx], axis=1 + ) + np.sum(cell_snp_Ballele[:, involved_snp_idx], axis=1) # RDR (genes) - involved_genes = list(set([x for x in df_block_contents.gene.values[b] if not x is None])) + involved_genes = list( + set([x for x in df_block_contents.gene.values[b] if not x is None]) + ) if len(involved_genes) > 0: - single_X[b, 0, :] = np.sum( adata.layers['count'][:, adata.var.index.isin(involved_genes)], axis=1 ) + single_X[b, 0, :] = np.sum( + adata.layers["count"][:, adata.var.index.isin(involved_genes)], axis=1 + ) # lengths lengths = np.zeros(len(df_gene_snp.CHR.unique()), dtype=int) - for i,c in enumerate( df_gene_snp.CHR.unique() ): - lengths[i] = len( df_gene_snp[df_gene_snp.CHR == c].block_id.unique() ) + for i, c in enumerate(df_gene_snp.CHR.unique()): + lengths[i] = len(df_gene_snp[df_gene_snp.CHR == c].block_id.unique()) # phase switch probability from genetic distance - sorted_chr_pos_first = df_gene_snp.groupby('block_id').agg({'CHR': 'first', 'START': 'first'}) - sorted_chr_pos_first = list(zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values)) - sorted_chr_pos_last = df_gene_snp.groupby('block_id').agg({'CHR': 'last', 'END': 'last'}) - sorted_chr_pos_last = list(zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values)) + sorted_chr_pos_first = df_gene_snp.groupby("block_id").agg( + {"CHR": "first", "START": "first"} + ) + sorted_chr_pos_first = list( + zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values) + ) + sorted_chr_pos_last = df_gene_snp.groupby("block_id").agg( + {"CHR": "last", "END": "last"} + ) + sorted_chr_pos_last = list( + zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values) + ) # - tmp_sorted_chr_pos = [val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair] - position_cM = get_position_cM_table( tmp_sorted_chr_pos, geneticmap_file ) - phase_switch_prob = compute_phase_switch_probability_position(position_cM, tmp_sorted_chr_pos, nu) - log_sitewise_transmat = np.minimum(np.log(0.5), np.log(phase_switch_prob) - logphase_shift) + tmp_sorted_chr_pos = [ + val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair + ] + position_cM = 
get_position_cM_table(tmp_sorted_chr_pos, geneticmap_file) + phase_switch_prob = compute_phase_switch_probability_position( + position_cM, tmp_sorted_chr_pos, nu + ) + log_sitewise_transmat = np.minimum( + np.log(0.5), np.log(phase_switch_prob) - logphase_shift + ) # log_sitewise_transmat = log_sitewise_transmat[np.arange(0, len(log_sitewise_transmat), 2)] - log_sitewise_transmat = log_sitewise_transmat[np.arange(1, len(log_sitewise_transmat), 2)] - - return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat - - -def choose_umithreshold_given_nbins(single_total_bb_RD, refined_lengths, expected_nbins): + log_sitewise_transmat = log_sitewise_transmat[ + np.arange(1, len(log_sitewise_transmat), 2) + ] + + return ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + ) + + +def choose_umithreshold_given_nbins( + single_total_bb_RD, refined_lengths, expected_nbins +): def count_num_bins(per_snp_umi, refined_lengths, secondary_min_umi): cumlen = 0 s = 0 @@ -693,6 +1094,7 @@ def count_num_bins(per_snp_umi, refined_lengths, secondary_min_umi): s = t cumlen += le return bin_counter + per_snp_umi = np.sum(single_total_bb_RD, axis=1) # candicate range lo = np.sort(per_snp_umi)[-expected_nbins] @@ -711,7 +1113,25 @@ def count_num_bins(per_snp_umi, refined_lengths, secondary_min_umi): return mid -def perform_binning_new(lengths, single_X, single_base_nb_mean, single_total_bb_RD, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps, phase_indicator, refined_lengths, binsize, rdrbinsize, nu, logphase_shift, geneticmap_file, secondary_min_umi=1000, max_binlength=5e6): +def perform_binning_new( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + sorted_chr_pos, + sorted_chr_pos_last, + x_gene_list, + n_snps, + phase_indicator, + refined_lengths, + binsize, + rdrbinsize, + nu, + logphase_shift, + geneticmap_file, + secondary_min_umi=1000, + max_binlength=5e6, +): per_snp_umi = np.sum(single_total_bb_RD, axis=1) # secondary_min_umi = np.percentile(per_snp_umi, secondary_percentile) # bin both RDR and BAF @@ -731,12 +1151,15 @@ def perform_binning_new(lengths, single_X, single_base_nb_mean, single_total_bb_ t = s + 1 while t < cumlen + le and np.sum(per_snp_umi[s:t]) < secondary_min_umi: t += 1 - if sorted_chr_pos_last[t-1][1] - sorted_chr_pos[s][1] >= max_binlength: - t = max(t-1, s+1) + if ( + sorted_chr_pos_last[t - 1][1] - sorted_chr_pos[s][1] + >= max_binlength + ): + t = max(t - 1, s + 1) break # expand binsize by minimum number of genes - this_genes = sum([ x_gene_list[i].split(" ") for i in range(s,t) ], []) - this_genes = [z for z in this_genes if z!=""] + this_genes = sum([x_gene_list[i].split(" ") for i in range(s, t)], []) + this_genes = [z for z in this_genes if z != ""] idx_A = np.where(phase_indicator[s:t])[0] idx_B = np.where(~phase_indicator[s:t])[0] # if np.sum(per_snp_umi[s:t]) >= secondary_min_umi or sorted_chr_pos[s][0] != bin_sorted_chr_pos_last[-1][0]: @@ -749,36 +1172,65 @@ def perform_binning_new(lengths, single_X, single_base_nb_mean, single_total_bb_ # bin_x_gene_list.append( " ".join(this_genes) ) # bin_n_snps.append( np.sum(n_snps[s:t]) ) # else: - # bin_single_X_rdr[-1] += np.sum(single_X[s:t, 0, :], axis=0) + # bin_single_X_rdr[-1] += np.sum(single_X[s:t, 0, :], axis=0) # bin_single_X_baf[-1] += np.sum(single_X[s:t, 1, :][idx_A,:], axis=0) + np.sum(single_total_bb_RD[s:t, :][idx_B,:] - single_X[s:t, 1, :][idx_B,:], axis=0) # bin_single_base_nb_mean[-1] += 
np.sum(single_base_nb_mean[s:t, :], axis=0) # bin_single_total_bb_RD[-1] += np.sum(single_total_bb_RD[s:t, :], axis=0) # bin_sorted_chr_pos_last[-1] = sorted_chr_pos_last[t-1] # if len(this_genes) > 0: # bin_x_gene_list[-1] += " " + " ".join(this_genes) - # bin_n_snps[-1] += np.sum(n_snps[s:t]) - if len(bin_sorted_chr_pos_last) > 0 and sorted_chr_pos[s][0] == bin_sorted_chr_pos_last[-1][0] and \ - np.sum(per_snp_umi[s:t]) < 0.5*secondary_min_umi and sorted_chr_pos_last[t-1][1] - sorted_chr_pos[s][1] < 0.5*max_binlength: - bin_single_X_rdr[-1] += np.sum(single_X[s:t, 0, :], axis=0) - bin_single_X_baf[-1] += np.sum(single_X[s:t, 1, :][idx_A,:], axis=0) + np.sum(single_total_bb_RD[s:t, :][idx_B,:] - single_X[s:t, 1, :][idx_B,:], axis=0) - bin_single_base_nb_mean[-1] += np.sum(single_base_nb_mean[s:t, :], axis=0) + # bin_n_snps[-1] += np.sum(n_snps[s:t]) + if ( + len(bin_sorted_chr_pos_last) > 0 + and sorted_chr_pos[s][0] == bin_sorted_chr_pos_last[-1][0] + and np.sum(per_snp_umi[s:t]) < 0.5 * secondary_min_umi + and sorted_chr_pos_last[t - 1][1] - sorted_chr_pos[s][1] + < 0.5 * max_binlength + ): + bin_single_X_rdr[-1] += np.sum(single_X[s:t, 0, :], axis=0) + bin_single_X_baf[-1] += np.sum( + single_X[s:t, 1, :][idx_A, :], axis=0 + ) + np.sum( + single_total_bb_RD[s:t, :][idx_B, :] + - single_X[s:t, 1, :][idx_B, :], + axis=0, + ) + bin_single_base_nb_mean[-1] += np.sum( + single_base_nb_mean[s:t, :], axis=0 + ) bin_single_total_bb_RD[-1] += np.sum(single_total_bb_RD[s:t, :], axis=0) - bin_sorted_chr_pos_last[-1] = sorted_chr_pos_last[t-1] + bin_sorted_chr_pos_last[-1] = sorted_chr_pos_last[t - 1] if len(this_genes) > 0: - bin_x_gene_list[-1] += " " + " ".join(this_genes) + bin_x_gene_list[-1] += " " + " ".join(this_genes) bin_n_snps[-1] += np.sum(n_snps[s:t]) else: - bin_single_X_rdr.append( np.sum(single_X[s:t, 0, :], axis=0) ) - bin_single_X_baf.append( np.sum(single_X[s:t, 1, :][idx_A,:], axis=0) + np.sum(single_total_bb_RD[s:t, :][idx_B,:] - single_X[s:t, 1, :][idx_B,:], axis=0) ) - bin_single_base_nb_mean.append( np.sum(single_base_nb_mean[s:t, :], axis=0) ) - bin_single_total_bb_RD.append( np.sum(single_total_bb_RD[s:t, :], axis=0) ) - bin_sorted_chr_pos_first.append( sorted_chr_pos[s] ) - bin_sorted_chr_pos_last.append( sorted_chr_pos_last[t-1] ) - bin_x_gene_list.append( " ".join(this_genes) ) - bin_n_snps.append( np.sum(n_snps[s:t]) ) + bin_single_X_rdr.append(np.sum(single_X[s:t, 0, :], axis=0)) + bin_single_X_baf.append( + np.sum(single_X[s:t, 1, :][idx_A, :], axis=0) + + np.sum( + single_total_bb_RD[s:t, :][idx_B, :] + - single_X[s:t, 1, :][idx_B, :], + axis=0, + ) + ) + bin_single_base_nb_mean.append( + np.sum(single_base_nb_mean[s:t, :], axis=0) + ) + bin_single_total_bb_RD.append( + np.sum(single_total_bb_RD[s:t, :], axis=0) + ) + bin_sorted_chr_pos_first.append(sorted_chr_pos[s]) + bin_sorted_chr_pos_last.append(sorted_chr_pos_last[t - 1]) + bin_x_gene_list.append(" ".join(this_genes)) + bin_n_snps.append(np.sum(n_snps[s:t])) s = t cumlen += le - single_X = np.stack([ np.vstack([bin_single_X_rdr[i], bin_single_X_baf[i]]) for i in range(len(bin_single_X_rdr)) ]) + single_X = np.stack( + [ + np.vstack([bin_single_X_rdr[i], bin_single_X_baf[i]]) + for i in range(len(bin_single_X_rdr)) + ] + ) single_base_nb_mean = np.vstack(bin_single_base_nb_mean) single_total_bb_RD = np.vstack(bin_single_total_bb_RD) sorted_chr_pos_first = bin_sorted_chr_pos_first @@ -787,44 +1239,66 @@ def perform_binning_new(lengths, single_X, single_base_nb_mean, single_total_bb_ n_snps = bin_n_snps # phase 
switch probability from genetic distance - tmp_sorted_chr_pos = [val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair] + tmp_sorted_chr_pos = [ + val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair + ] sorted_chr = np.array([x[0] for x in tmp_sorted_chr_pos]) - position_cM = get_position_cM_table( tmp_sorted_chr_pos, geneticmap_file ) - phase_switch_prob = compute_phase_switch_probability_position(position_cM, tmp_sorted_chr_pos, nu) + position_cM = get_position_cM_table(tmp_sorted_chr_pos, geneticmap_file) + phase_switch_prob = compute_phase_switch_probability_position( + position_cM, tmp_sorted_chr_pos, nu + ) log_sitewise_transmat = np.log(phase_switch_prob) - logphase_shift # log_sitewise_transmat = log_sitewise_transmat[np.arange(0, len(log_sitewise_transmat), 2)] - log_sitewise_transmat = log_sitewise_transmat[np.arange(1, len(log_sitewise_transmat), 2)] + log_sitewise_transmat = log_sitewise_transmat[ + np.arange(1, len(log_sitewise_transmat), 2) + ] sorted_chr = np.array([x[0] for x in sorted_chr_pos_first]) unique_chrs = [sorted_chr[0]] for x in sorted_chr[1:]: if x != unique_chrs[-1]: - unique_chrs.append( x ) - lengths = np.array([ np.sum(sorted_chr == chrname) for chrname in unique_chrs ]) - + unique_chrs.append(x) + lengths = np.array([np.sum(sorted_chr == chrname) for chrname in unique_chrs]) + # bin RDR s = 0 while s < single_X.shape[0]: - t = s+1 - this_genes = sum([ x_gene_list[i].split(" ") for i in range(s,t) ], []) - this_genes = [z for z in this_genes if z!=""] + t = s + 1 + this_genes = sum([x_gene_list[i].split(" ") for i in range(s, t)], []) + this_genes = [z for z in this_genes if z != ""] while t < single_X.shape[0] and len(this_genes) < rdrbinsize: t += 1 - this_genes += x_gene_list[t-1].split(" ") - this_genes = [z for z in this_genes if z!=""] + this_genes += x_gene_list[t - 1].split(" ") + this_genes = [z for z in this_genes if z != ""] single_X[s, 0, :] = np.sum(single_X[s:t, 0, :], axis=0) - single_X[(s+1):t, 0, :] = 0 + single_X[(s + 1) : t, 0, :] = 0 single_base_nb_mean[s, :] = np.sum(single_base_nb_mean[s:t, :], axis=0) - single_base_nb_mean[(s+1):t, :] = 0 + single_base_nb_mean[(s + 1) : t, :] = 0 x_gene_list[s] = " ".join(this_genes) - for k in range(s+1,t): + for k in range(s + 1, t): x_gene_list[k] = "" s = t - return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, sorted_chr_pos_first, sorted_chr_pos_last, x_gene_list, n_snps - - -def create_bin_ranges(df_gene_snp, single_total_bb_RD, refined_lengths, secondary_min_umi, max_binlength=5e6): + return ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + sorted_chr_pos_first, + sorted_chr_pos_last, + x_gene_list, + n_snps, + ) + + +def create_bin_ranges( + df_gene_snp, + single_total_bb_RD, + refined_lengths, + secondary_min_umi, + max_binlength=5e6, +): """ Aggregate haplotype blocks to bins @@ -844,7 +1318,10 @@ def create_bin_ranges(df_gene_snp, single_total_bb_RD, refined_lengths, secondar df_gene_snp : data frame, (CHR, START, END, snp_id, gene, is_interval, block_id, bin_id) The newly added bin_id column indicates which bin each gene or SNP belongs to. 
""" - def greedy_binning_nobreak(block_lengths, block_umi, secondary_min_umi, max_binlength): + + def greedy_binning_nobreak( + block_lengths, block_umi, secondary_min_umi, max_binlength + ): """ Returns ------- @@ -859,54 +1336,80 @@ def greedy_binning_nobreak(block_lengths, block_umi, secondary_min_umi, max_binl while t < len(block_lengths) and np.sum(block_umi[s:t]) < secondary_min_umi: t += 1 if np.sum(block_lengths[s:t]) >= max_binlength: - t = max(t-1, s+1) + t = max(t - 1, s + 1) break # check whether it is a very small bin in the end - if s > 0 and t == len(block_lengths) and np.sum(block_umi[s:t]) < 0.5*secondary_min_umi and np.sum(block_lengths[s:t]) < 0.5*max_binlength: + if ( + s > 0 + and t == len(block_lengths) + and np.sum(block_umi[s:t]) < 0.5 * secondary_min_umi + and np.sum(block_lengths[s:t]) < 0.5 * max_binlength + ): bin_ranges[-1][1] = t else: - bin_ranges.append( [s,t] ) + bin_ranges.append([s, t]) s = t bin_ids = np.zeros(len(block_lengths), dtype=int) - for i,x in enumerate(bin_ranges): - bin_ids[x[0]:x[1]] = i + for i, x in enumerate(bin_ranges): + bin_ids[x[0] : x[1]] = i return bin_ids - + # block lengths and block umis - sorted_chr_pos_both = df_gene_snp.groupby('block_id').agg({'CHR': 'first', 'START': 'first', 'END': 'last'}) + sorted_chr_pos_both = df_gene_snp.groupby("block_id").agg( + {"CHR": "first", "START": "first", "END": "last"} + ) block_lengths = sorted_chr_pos_both.END.values - sorted_chr_pos_both.START.values block_umi = np.sum(single_total_bb_RD, axis=1) n_blocks = len(block_lengths) - + # get a list of breakpoints where bin much break - breakpoints = np.concatenate([ np.cumsum(refined_lengths), np.where(block_lengths > max_binlength)[0], np.where(block_lengths > max_binlength)[0]+1 ]) - breakpoints =np.sort(np.unique(breakpoints)) + breakpoints = np.concatenate( + [ + np.cumsum(refined_lengths), + np.where(block_lengths > max_binlength)[0], + np.where(block_lengths > max_binlength)[0] + 1, + ] + ) + breakpoints = np.sort(np.unique(breakpoints)) # append 0 in the front of breakpoints so that each pair of adjacent breakpoints can be an input to greedy_binning_nobreak if breakpoints[0] != 0: - breakpoints = np.append( [0], breakpoints ) + breakpoints = np.append([0], breakpoints) assert np.all(breakpoints[:-1] < breakpoints[1:]) # loop over breakpoints and bin each block bin_ids = np.zeros(n_blocks, dtype=int) offset = 0 - for i in range(len(breakpoints)-1): + for i in range(len(breakpoints) - 1): b1 = breakpoints[i] - b2 = breakpoints[i+1] + b2 = breakpoints[i + 1] if b2 - b1 == 1: bin_ids[b1:b2] = offset offset += 1 else: - this_bin_ids = greedy_binning_nobreak(block_lengths[b1:b2], block_umi[b1:b2], secondary_min_umi, max_binlength) + this_bin_ids = greedy_binning_nobreak( + block_lengths[b1:b2], block_umi[b1:b2], secondary_min_umi, max_binlength + ) bin_ids[b1:b2] = offset + this_bin_ids offset += np.max(this_bin_ids) + 1 - + # append bin_ids to df_gene_snp - df_gene_snp["bin_id"] = df_gene_snp.block_id.map({i:x for i,x in enumerate(bin_ids)}) - + df_gene_snp["bin_id"] = df_gene_snp.block_id.map( + {i: x for i, x in enumerate(bin_ids)} + ) + return df_gene_snp -def summarize_counts_for_bins(df_gene_snp, adata, single_X, single_total_bb_RD, phase_indicator, nu, logphase_shift, geneticmap_file): +def summarize_counts_for_bins( + df_gene_snp, + adata, + single_X, + single_total_bb_RD, + phase_indicator, + nu, + logphase_shift, + geneticmap_file, +): """ Attributes: ---------- @@ -923,7 +1426,7 @@ def summarize_counts_for_bins(df_gene_snp, adata, 
single_X, single_total_bb_RD, single_base_nb_mean : array, (n_blocks, n_spots) Baseline transcript counts in normal diploid per block per cell. - + single_total_bb_RD : array, (n_blocks, n_spots) Total allele count per block per cell. @@ -935,62 +1438,127 @@ def summarize_counts_for_bins(df_gene_snp, adata, single_X, single_total_bb_RD, bin_single_base_nb_mean = np.zeros((len(bins), adata.shape[0])) bin_single_total_bb_RD = np.zeros((len(bins), adata.shape[0]), dtype=int) # summarize counts of involved genes and SNPs within each block - df_bin_contents = df_gene_snp[~df_gene_snp.bin_id.isnull()].groupby('bin_id').agg({"block_id":set, "gene":set}) + df_bin_contents = ( + df_gene_snp[~df_gene_snp.bin_id.isnull()] + .groupby("bin_id") + .agg({"block_id": set, "gene": set}) + ) for b in range(df_bin_contents.shape[0]): # BAF (SNPs) - involved_blocks = [x for x in df_bin_contents.block_id.values[b] if not x is None] - this_phased = np.where(phase_indicator[involved_blocks].reshape(-1,1), single_X[involved_blocks, 1, :], single_total_bb_RD[involved_blocks, :] - single_X[involved_blocks, 1, :]) + involved_blocks = [ + x for x in df_bin_contents.block_id.values[b] if not x is None + ] + this_phased = np.where( + phase_indicator[involved_blocks].reshape(-1, 1), + single_X[involved_blocks, 1, :], + single_total_bb_RD[involved_blocks, :] - single_X[involved_blocks, 1, :], + ) bin_single_X[b, 1, :] = np.sum(this_phased, axis=0) - bin_single_total_bb_RD[b, :] = np.sum( single_total_bb_RD[involved_blocks, :], axis=0 ) + bin_single_total_bb_RD[b, :] = np.sum( + single_total_bb_RD[involved_blocks, :], axis=0 + ) # RDR (genes) involved_genes = [x for x in df_bin_contents.gene.values[b] if not x is None] - bin_single_X[b, 0, :] = np.sum( adata.layers['count'][:, adata.var.index.isin(involved_genes)], axis=1 ) + bin_single_X[b, 0, :] = np.sum( + adata.layers["count"][:, adata.var.index.isin(involved_genes)], axis=1 + ) # lengths lengths = np.zeros(len(df_gene_snp.CHR.unique()), dtype=int) - for i,c in enumerate( df_gene_snp.CHR.unique() ): - lengths[i] = len( df_gene_snp[ (df_gene_snp.CHR == c) & (~df_gene_snp.bin_id.isnull()) ].bin_id.unique() ) + for i, c in enumerate(df_gene_snp.CHR.unique()): + lengths[i] = len( + df_gene_snp[ + (df_gene_snp.CHR == c) & (~df_gene_snp.bin_id.isnull()) + ].bin_id.unique() + ) # phase switch probability from genetic distance - sorted_chr_pos_first = df_gene_snp.groupby('bin_id').agg({'CHR': 'first', 'START': 'first'}) - sorted_chr_pos_first = list(zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values)) - sorted_chr_pos_last = df_gene_snp.groupby('bin_id').agg({'CHR': 'last', 'END': 'last'}) - sorted_chr_pos_last = list(zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values)) + sorted_chr_pos_first = df_gene_snp.groupby("bin_id").agg( + {"CHR": "first", "START": "first"} + ) + sorted_chr_pos_first = list( + zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values) + ) + sorted_chr_pos_last = df_gene_snp.groupby("bin_id").agg( + {"CHR": "last", "END": "last"} + ) + sorted_chr_pos_last = list( + zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values) + ) # - tmp_sorted_chr_pos = [val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair] - position_cM = get_position_cM_table( tmp_sorted_chr_pos, geneticmap_file ) - phase_switch_prob = compute_phase_switch_probability_position(position_cM, tmp_sorted_chr_pos, nu) - log_sitewise_transmat = np.minimum(np.log(0.5), np.log(phase_switch_prob) - 
logphase_shift) + tmp_sorted_chr_pos = [ + val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair + ] + position_cM = get_position_cM_table(tmp_sorted_chr_pos, geneticmap_file) + phase_switch_prob = compute_phase_switch_probability_position( + position_cM, tmp_sorted_chr_pos, nu + ) + log_sitewise_transmat = np.minimum( + np.log(0.5), np.log(phase_switch_prob) - logphase_shift + ) # log_sitewise_transmat = log_sitewise_transmat[np.arange(0, len(log_sitewise_transmat), 2)] - log_sitewise_transmat = log_sitewise_transmat[np.arange(1, len(log_sitewise_transmat), 2)] - - return lengths, bin_single_X, bin_single_base_nb_mean, bin_single_total_bb_RD, log_sitewise_transmat - - -def bin_selection_basedon_normal(df_gene_snp, single_X, single_base_nb_mean, single_total_bb_RD, nu, logphase_shift, index_normal, geneticmap_file, confidence_interval=[0.05, 0.95], min_betabinom_tau=30): + log_sitewise_transmat = log_sitewise_transmat[ + np.arange(1, len(log_sitewise_transmat), 2) + ] + + return ( + lengths, + bin_single_X, + bin_single_base_nb_mean, + bin_single_total_bb_RD, + log_sitewise_transmat, + ) + + +def bin_selection_basedon_normal( + df_gene_snp, + single_X, + single_base_nb_mean, + single_total_bb_RD, + nu, + logphase_shift, + index_normal, + geneticmap_file, + confidence_interval=[0.05, 0.95], + min_betabinom_tau=30, +): """ Filter out bins that potential contain somatic mutations based on BAF of normal spots. """ # pool B allele counts for each bin across all normal spots tmpX = np.sum(single_X[:, 1, index_normal], axis=1) tmptotal_bb_RD = np.sum(single_total_bb_RD[:, index_normal], axis=1) - model = Weighted_BetaBinom(tmpX, np.ones(len(tmpX)), weights=np.ones(len(tmpX)), exposure=tmptotal_bb_RD) + model = Weighted_BetaBinom( + tmpX, np.ones(len(tmpX)), weights=np.ones(len(tmpX)), exposure=tmptotal_bb_RD + ) tmpres = model.fit(disp=0) tmpres.params[0] = 0.5 tmpres.params[-1] = max(tmpres.params[-1], min_betabinom_tau) # remove bins if normal B allele frequencies fall out of 5%-95% probability range - removal_indicator1 = (tmpX < scipy.stats.betabinom.ppf(confidence_interval[0], tmptotal_bb_RD, tmpres.params[0] * tmpres.params[1], (1-tmpres.params[0]) * tmpres.params[1])) - removal_indicator2 = (tmpX > scipy.stats.betabinom.ppf(confidence_interval[1], tmptotal_bb_RD, tmpres.params[0] * tmpres.params[1], (1-tmpres.params[0]) * tmpres.params[1])) - print( np.sum(removal_indicator1 | removal_indicator2) ) + removal_indicator1 = tmpX < scipy.stats.betabinom.ppf( + confidence_interval[0], + tmptotal_bb_RD, + tmpres.params[0] * tmpres.params[1], + (1 - tmpres.params[0]) * tmpres.params[1], + ) + removal_indicator2 = tmpX > scipy.stats.betabinom.ppf( + confidence_interval[1], + tmptotal_bb_RD, + tmpres.params[0] * tmpres.params[1], + (1 - tmpres.params[0]) * tmpres.params[1], + ) + print(np.sum(removal_indicator1 | removal_indicator2)) index_removal = np.where(removal_indicator1 | removal_indicator2)[0] index_remaining = np.where(~(removal_indicator1 | removal_indicator2))[0] # # change df_gene_snp col = np.where(df_gene_snp.columns == "bin_id")[0][0] - df_gene_snp.iloc[ np.where(df_gene_snp.bin_id.isin(index_removal))[0], col] = None + df_gene_snp.iloc[np.where(df_gene_snp.bin_id.isin(index_removal))[0], col] = None # remap bin_id to existing list - df_gene_snp['bin_id'] = df_gene_snp['bin_id'].map({x:i for i,x in enumerate(index_remaining)}) - df_gene_snp.bin_id = df_gene_snp.bin_id.astype('Int64') + df_gene_snp["bin_id"] = df_gene_snp["bin_id"].map( + {x: i for i, x in 
enumerate(index_remaining)} + ) + df_gene_snp.bin_id = df_gene_snp.bin_id.astype("Int64") # change the related data matrices single_X = single_X[index_remaining, :, :] @@ -999,26 +1567,61 @@ def bin_selection_basedon_normal(df_gene_snp, single_X, single_base_nb_mean, sin # lengths lengths = np.zeros(len(df_gene_snp.CHR.unique()), dtype=int) - for i,c in enumerate( df_gene_snp.CHR.unique() ): - lengths[i] = len( df_gene_snp[ (df_gene_snp.CHR == c) & (~df_gene_snp.bin_id.isnull()) ].bin_id.unique() ) + for i, c in enumerate(df_gene_snp.CHR.unique()): + lengths[i] = len( + df_gene_snp[ + (df_gene_snp.CHR == c) & (~df_gene_snp.bin_id.isnull()) + ].bin_id.unique() + ) ## phase switch probability from genetic distance - sorted_chr_pos_first = df_gene_snp.groupby('bin_id').agg({'CHR': 'first', 'START': 'first'}) - sorted_chr_pos_first = list(zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values)) - sorted_chr_pos_last = df_gene_snp.groupby('bin_id').agg({'CHR': 'last', 'END': 'last'}) - sorted_chr_pos_last = list(zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values)) + sorted_chr_pos_first = df_gene_snp.groupby("bin_id").agg( + {"CHR": "first", "START": "first"} + ) + sorted_chr_pos_first = list( + zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values) + ) + sorted_chr_pos_last = df_gene_snp.groupby("bin_id").agg( + {"CHR": "last", "END": "last"} + ) + sorted_chr_pos_last = list( + zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values) + ) # - tmp_sorted_chr_pos = [val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair] - position_cM = get_position_cM_table( tmp_sorted_chr_pos, geneticmap_file ) - phase_switch_prob = compute_phase_switch_probability_position(position_cM, tmp_sorted_chr_pos, nu) - log_sitewise_transmat = np.minimum(np.log(0.5), np.log(phase_switch_prob) - logphase_shift) + tmp_sorted_chr_pos = [ + val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair + ] + position_cM = get_position_cM_table(tmp_sorted_chr_pos, geneticmap_file) + phase_switch_prob = compute_phase_switch_probability_position( + position_cM, tmp_sorted_chr_pos, nu + ) + log_sitewise_transmat = np.minimum( + np.log(0.5), np.log(phase_switch_prob) - logphase_shift + ) # log_sitewise_transmat = log_sitewise_transmat[np.arange(0, len(log_sitewise_transmat), 2)] - log_sitewise_transmat = log_sitewise_transmat[np.arange(1, len(log_sitewise_transmat), 2)] - - return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp - - -def filter_de_genes(exp_counts, x_gene_list, normal_candidate, sample_list=None, sample_ids=None, logfcthreshold=4, quantile_threshold=80): + log_sitewise_transmat = log_sitewise_transmat[ + np.arange(1, len(log_sitewise_transmat), 2) + ] + + return ( + lengths, + single_X, + single_base_nb_mean, + single_total_bb_RD, + log_sitewise_transmat, + df_gene_snp, + ) + + +def filter_de_genes( + exp_counts, + x_gene_list, + normal_candidate, + sample_list=None, + sample_ids=None, + logfcthreshold=4, + quantile_threshold=80, +): adata = anndata.AnnData(exp_counts) adata.layers["count"] = exp_counts.values adata.obs["normal_candidate"] = normal_candidate @@ -1026,7 +1629,7 @@ def filter_de_genes(exp_counts, x_gene_list, normal_candidate, sample_list=None, map_gene_adatavar = {} map_gene_umi = {} list_gene_umi = np.sum(adata.layers["count"], axis=0) - for i,x in enumerate(adata.var.index): + for i, x in enumerate(adata.var.index): map_gene_adatavar[x] = i 
map_gene_umi[x] = list_gene_umi[i] # @@ -1034,18 +1637,20 @@ def filter_de_genes(exp_counts, x_gene_list, normal_candidate, sample_list=None, sample_list = [None] # filtered_out_set = set() - for s,sname in enumerate(sample_list): + for s, sname in enumerate(sample_list): if sname is None: index = np.arange(adata.shape[0]) else: index = np.where(sample_ids == s)[0] tmpadata = adata[index, :].copy() # - umi_threshold = np.percentile( np.sum(tmpadata.layers["count"], axis=0), quantile_threshold ) + umi_threshold = np.percentile( + np.sum(tmpadata.layers["count"], axis=0), quantile_threshold + ) # sc.pp.filter_cells(tmpadata, min_genes=200) sc.pp.filter_genes(tmpadata, min_cells=10) - med = np.median( np.sum(tmpadata.layers["count"], axis=1) ) + med = np.median(np.sum(tmpadata.layers["count"], axis=1)) # sc.pp.normalize_total(tmpadata, target_sum=1e4) sc.pp.normalize_total(tmpadata, target_sum=med) sc.pp.log1p(tmpadata) @@ -1053,29 +1658,56 @@ def filter_de_genes(exp_counts, x_gene_list, normal_candidate, sample_list=None, sc.pp.pca(tmpadata, n_comps=4) kmeans = KMeans(n_clusters=2, random_state=0).fit(tmpadata.obsm["X_pca"]) kmeans_labels = kmeans.predict(tmpadata.obsm["X_pca"]) - idx_kmeans_label = np.argmax(np.bincount( kmeans_labels[tmpadata.obs["normal_candidate"]], minlength=2 )) + idx_kmeans_label = np.argmax( + np.bincount(kmeans_labels[tmpadata.obs["normal_candidate"]], minlength=2) + ) clone = np.array(["normal"] * tmpadata.shape[0]) - clone[ (kmeans_labels != idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) ] = "tumor" + clone[ + (kmeans_labels != idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) + ] = "tumor" tmpadata.obs["clone"] = clone # end added - sc.tl.rank_genes_groups(tmpadata, 'clone', groups=["tumor"], reference="normal", method='wilcoxon') - genenames = np.array([ x[0] for x in tmpadata.uns["rank_genes_groups"]["names"] ]) - logfc = np.array([ x[0] for x in tmpadata.uns["rank_genes_groups"]["logfoldchanges"] ]) - geneumis = np.array([ map_gene_umi[x] for x in genenames]) - this_filtered_out_set = set(list(genenames[ (np.abs(logfc) > logfcthreshold) & (geneumis > umi_threshold) ])) + sc.tl.rank_genes_groups( + tmpadata, "clone", groups=["tumor"], reference="normal", method="wilcoxon" + ) + genenames = np.array([x[0] for x in tmpadata.uns["rank_genes_groups"]["names"]]) + logfc = np.array( + [x[0] for x in tmpadata.uns["rank_genes_groups"]["logfoldchanges"]] + ) + geneumis = np.array([map_gene_umi[x] for x in genenames]) + this_filtered_out_set = set( + list( + genenames[(np.abs(logfc) > logfcthreshold) & (geneumis > umi_threshold)] + ) + ) filtered_out_set = filtered_out_set | this_filtered_out_set print(f"Filter out {len(filtered_out_set)} DE genes") # new_single_X_rdr = np.zeros((len(x_gene_list), adata.shape[0])) - for i,x in enumerate(x_gene_list): + for i, x in enumerate(x_gene_list): g_list = [z for z in x.split() if z != ""] - idx_genes = np.array([ map_gene_adatavar[g] for g in g_list if (not g in filtered_out_set) and (g in map_gene_adatavar)]) + idx_genes = np.array( + [ + map_gene_adatavar[g] + for g in g_list + if (not g in filtered_out_set) and (g in map_gene_adatavar) + ] + ) if len(idx_genes) > 0: new_single_X_rdr[i, :] = np.sum(adata.layers["count"][:, idx_genes], axis=1) return new_single_X_rdr, filtered_out_set -def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=None, sample_ids=None, logfcthreshold_u=2, logfcthreshold_t=4, quantile_threshold=80): +def filter_de_genes_tri( + exp_counts, + df_bininfo, + 
normal_candidate, + sample_list=None, + sample_ids=None, + logfcthreshold_u=2, + logfcthreshold_t=4, + quantile_threshold=80, +): """ Attributes ---------- @@ -1089,7 +1721,7 @@ def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=No map_gene_adatavar = {} map_gene_umi = {} list_gene_umi = np.sum(adata.layers["count"], axis=0) - for i,x in enumerate(adata.var.index): + for i, x in enumerate(adata.var.index): map_gene_adatavar[x] = i map_gene_umi[x] = list_gene_umi[i] # @@ -1097,20 +1729,25 @@ def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=No sample_list = [None] # filtered_out_set = set() - for s,sname in enumerate(sample_list): + for s, sname in enumerate(sample_list): if sname is None: index = np.arange(adata.shape[0]) else: index = np.where(sample_ids == s)[0] tmpadata = adata[index, :].copy() - if np.sum(tmpadata.layers["count"][tmpadata.obs["normal_candidate"], :]) < tmpadata.shape[1] * 10: + if ( + np.sum(tmpadata.layers["count"][tmpadata.obs["normal_candidate"], :]) + < tmpadata.shape[1] * 10 + ): continue # - umi_threshold = np.percentile( np.sum(tmpadata.layers["count"], axis=0), quantile_threshold ) + umi_threshold = np.percentile( + np.sum(tmpadata.layers["count"], axis=0), quantile_threshold + ) # # sc.pp.filter_cells(tmpadata, min_genes=200) sc.pp.filter_genes(tmpadata, min_cells=10) - med = np.median( np.sum(tmpadata.layers["count"], axis=1) ) + med = np.median(np.sum(tmpadata.layers["count"], axis=1)) # sc.pp.normalize_total(tmpadata, target_sum=1e4) sc.pp.normalize_total(tmpadata, target_sum=med) sc.pp.log1p(tmpadata) @@ -1118,11 +1755,17 @@ def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=No sc.pp.pca(tmpadata, n_comps=4) kmeans = KMeans(n_clusters=2, random_state=0).fit(tmpadata.obsm["X_pca"]) kmeans_labels = kmeans.predict(tmpadata.obsm["X_pca"]) - idx_kmeans_label = np.argmax(np.bincount( kmeans_labels[tmpadata.obs["normal_candidate"]], minlength=2 )) + idx_kmeans_label = np.argmax( + np.bincount(kmeans_labels[tmpadata.obs["normal_candidate"]], minlength=2) + ) clone = np.array(["normal"] * tmpadata.shape[0]) - clone[ (kmeans_labels != idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) ] = "tumor" + clone[ + (kmeans_labels != idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) + ] = "tumor" ### third part ### - clone[ (kmeans_labels == idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) ] = "unsure" + clone[ + (kmeans_labels == idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) + ] = "unsure" tmpadata.obs["clone"] = clone # end added # sc.tl.rank_genes_groups(tmpadata, 'clone', groups=["tumor", "unsure"], reference="normal", method='wilcoxon') @@ -1136,21 +1779,48 @@ def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=No # geneumis_u = np.array([ map_gene_umi[x] for x in genenames_u]) # this_filtered_out_set = set(list(genenames_t[ (np.abs(logfc_t) > logfcthreshold) & (geneumis_t > umi_threshold) ])) | set(list(genenames_u[ (np.abs(logfc_u) > logfcthreshold) & (geneumis_u > umi_threshold) ])) # - agg_counts = np.vstack([ np.sum(tmpadata.layers["count"][tmpadata.obs['clone']==c,:], axis=0) for c in ['normal', 'unsure', 'tumor'] ]) + agg_counts = np.vstack( + [ + np.sum(tmpadata.layers["count"][tmpadata.obs["clone"] == c, :], axis=0) + for c in ["normal", "unsure", "tumor"] + ] + ) agg_counts = agg_counts / np.sum(agg_counts, axis=1, keepdims=True) * 1e6 - geneumis = np.array([ map_gene_umi[x] for x in tmpadata.var.index]) - logfc_u = np.where( 
((agg_counts[1,:]==0) | (agg_counts[0,:]==0)), 10, np.log2(agg_counts[1,:] / agg_counts[0,:]) ) - logfc_t = np.where( ((agg_counts[2,:]==0) | (agg_counts[0,:]==0)), 10, np.log2(agg_counts[2,:] / agg_counts[0,:]) ) - this_filtered_out_set = set(list(tmpadata.var.index[ (np.abs(logfc_u)>logfcthreshold_u) & (geneumis>umi_threshold) ])) | set(list(tmpadata.var.index[ (np.abs(logfc_t)>logfcthreshold_t) & (geneumis>umi_threshold) ])) + geneumis = np.array([map_gene_umi[x] for x in tmpadata.var.index]) + logfc_u = np.where( + ((agg_counts[1, :] == 0) | (agg_counts[0, :] == 0)), + 10, + np.log2(agg_counts[1, :] / agg_counts[0, :]), + ) + logfc_t = np.where( + ((agg_counts[2, :] == 0) | (agg_counts[0, :] == 0)), + 10, + np.log2(agg_counts[2, :] / agg_counts[0, :]), + ) + this_filtered_out_set = set( + list( + tmpadata.var.index[ + (np.abs(logfc_u) > logfcthreshold_u) & (geneumis > umi_threshold) + ] + ) + ) | set( + list( + tmpadata.var.index[ + (np.abs(logfc_t) > logfcthreshold_t) & (geneumis > umi_threshold) + ] + ) + ) filtered_out_set = filtered_out_set | this_filtered_out_set print(f"Filter out {len(filtered_out_set)} DE genes") # # remove genes that are in filtered_out_set new_single_X_rdr = np.zeros((df_bininfo.shape[0], adata.shape[0])) - for b,genestr in enumerate(df_bininfo.INCLUDED_GENES.values): + for b, genestr in enumerate(df_bininfo.INCLUDED_GENES.values): # RDR (genes) involved_genes = set(genestr.split(" ")) - filtered_out_set - new_single_X_rdr[b, :] = np.sum( adata.layers['count'][:, adata.var.index.isin(involved_genes)], axis=1 ) + new_single_X_rdr[b, :] = np.sum( + adata.layers["count"][:, adata.var.index.isin(involved_genes)], axis=1 + ) return new_single_X_rdr, filtered_out_set @@ -1161,15 +1831,48 @@ def get_lengths_by_arm(sorted_chr_pos, centromere_file): """ # read and process centromere file unique_chrs = [f"chr{i}" for i in range(1, 23)] - df = pd.read_csv(centromere_file, sep="\t", header=None, index_col=None, names=["CHRNAME", "START", "END", "LABEL", "SOURCE"]) + df = pd.read_csv( + centromere_file, + sep="\t", + header=None, + index_col=None, + names=["CHRNAME", "START", "END", "LABEL", "SOURCE"], + ) df = df[df.CHRNAME.isin(unique_chrs)] df["CHR"] = [int(x[3:]) for x in df.CHRNAME] - df = df.groupby("CHR").agg({"CHRNAME":"first", "START":"min", "END":"min", "LABEL":"first", "SOURCE":"first"}) + df = df.groupby("CHR").agg( + { + "CHRNAME": "first", + "START": "min", + "END": "min", + "LABEL": "first", + "SOURCE": "first", + } + ) df.sort_index(inplace=True) # count lengths - mat_chr_pos = np.vstack([ np.array([x[0] for x in sorted_chr_pos]), np.array([x[1] for x in sorted_chr_pos]) ]).T - armlengths = sum([ [np.sum((mat_chr_pos[:,0] == df.index[i]) & (mat_chr_pos[:,1] <= df.END.iloc[i])), \ - np.sum((mat_chr_pos[:,0] == df.index[i]) & (mat_chr_pos[:,1] > df.END.iloc[i]))] for i in range(df.shape[0])], []) + mat_chr_pos = np.vstack( + [ + np.array([x[0] for x in sorted_chr_pos]), + np.array([x[1] for x in sorted_chr_pos]), + ] + ).T + armlengths = sum( + [ + [ + np.sum( + (mat_chr_pos[:, 0] == df.index[i]) + & (mat_chr_pos[:, 1] <= df.END.iloc[i]) + ), + np.sum( + (mat_chr_pos[:, 0] == df.index[i]) + & (mat_chr_pos[:, 1] > df.END.iloc[i]) + ), + ] + for i in range(df.shape[0]) + ], + [], + ) armlengths = np.array(armlengths, dtype=int) return armlengths @@ -1190,12 +1893,20 @@ def get_lengths_by_arm(sorted_chr_pos, centromere_file): def expand_df_cnv(df_cnv, binsize=2e5, fillmissing=True): # get CHR and its END - df_chr_end = 
df_cnv.groupby("CHR").agg({"END":"max"}).reset_index() + df_chr_end = df_cnv.groupby("CHR").agg({"END": "max"}).reset_index() # initialize df_expand as a dataframe containing CHR, START, END such that END-START = binsize df_expand = [] - for i,c in enumerate(df_chr_end.CHR.values): - df_expand.append( pd.DataFrame({"CHR":c, "START":np.arange(0, df_chr_end.END.values[i], binsize), "END":binsize + np.arange(0, df_chr_end.END.values[i], binsize)}) ) + for i, c in enumerate(df_chr_end.CHR.values): + df_expand.append( + pd.DataFrame( + { + "CHR": c, + "START": np.arange(0, df_chr_end.END.values[i], binsize), + "END": binsize + np.arange(0, df_chr_end.END.values[i], binsize), + } + ) + ) df_expand = pd.concat(df_expand, ignore_index=True) # find the index in df_cnv such that each entry in df_expand overlaps with the largest length @@ -1208,29 +1919,39 @@ def expand_df_cnv(df_cnv, binsize=2e5, fillmissing=True): for i, this_chr in enumerate(df_expand.CHR.values): this_start = df_expand.START.values[i] this_end = df_expand.END.values[i] - while j < df_cnv.shape[0] and (vec_cnv_chr[j] < this_chr or (vec_cnv_chr[j] == this_chr and vec_cnv_end[j] <= this_start)): + while j < df_cnv.shape[0] and ( + vec_cnv_chr[j] < this_chr + or (vec_cnv_chr[j] == this_chr and vec_cnv_end[j] <= this_start) + ): j += 1 # overlap length of the j-th segment to (j+3)-th segment in df_cnv overlap_lengths = [] - for k in range(j, min(j+3, df_cnv.shape[0])): + for k in range(j, min(j + 3, df_cnv.shape[0])): if vec_cnv_chr[k] > this_chr or vec_cnv_start[k] > this_end: break - overlap_lengths.append( min(vec_cnv_end[k], this_end) - max(vec_cnv_start[k], this_start) ) + overlap_lengths.append( + min(vec_cnv_end[k], this_end) - max(vec_cnv_start[k], this_start) + ) if len(overlap_lengths) > 0: seg_index[i] = j + np.argmax(overlap_lengths) for col in df_cnv.columns[df_cnv.columns.str.startswith("clone")]: df_expand[col] = np.nan - df_expand[col].iloc[seg_index>=0] = df_cnv[col].values[ seg_index[seg_index>=0] ] + df_expand[col].iloc[seg_index >= 0] = df_cnv[col].values[ + seg_index[seg_index >= 0] + ] df_expand[col] = df_expand[col].astype("Int64") if fillmissing: # for each nan row, fill it with the closest non-nan row - nan_rows = np.where( df_expand.iloc[:,-1].isnull() )[0] - filled_rows = np.where( ~df_expand.iloc[:,-1].isnull() )[0] + nan_rows = np.where(df_expand.iloc[:, -1].isnull())[0] + filled_rows = np.where(~df_expand.iloc[:, -1].isnull())[0] for i in nan_rows: - candidates = np.where( (~df_expand.iloc[:,-1].isnull()) & (df_expand.CHR.values == df_expand.CHR.values[i]) )[0] - j = candidates[ np.argmin(np.abs(candidates - i)) ] + candidates = np.where( + (~df_expand.iloc[:, -1].isnull()) + & (df_expand.CHR.values == df_expand.CHR.values[i]) + )[0] + j = candidates[np.argmin(np.abs(candidates - i))] df_expand.iloc[i, 3:] = df_expand.iloc[j, 3:].values return df_expand @@ -1241,56 +1962,108 @@ def summary_events(cnv_segfile, rescombinefile, minlength=10): # read rescombine file res_combine = dict(np.load(rescombinefile, allow_pickle=True)) pred_cnv = res_combine["pred_cnv"] - logrdr_profile = np.vstack([ res_combine["new_log_mu"][pred_cnv[:,c], c] for c in range(pred_cnv.shape[1]) ]) - baf_profile = np.vstack([ res_combine["new_p_binom"][pred_cnv[:,c], c] for c in range(pred_cnv.shape[1]) ]) + logrdr_profile = np.vstack( + [res_combine["new_log_mu"][pred_cnv[:, c], c] for c in range(pred_cnv.shape[1])] + ) + baf_profile = np.vstack( + [ + res_combine["new_p_binom"][pred_cnv[:, c], c] + for c in range(pred_cnv.shape[1]) + 
] + ) # read CNV file - df_cnv = pd.read_csv(cnv_segfile, header=0, sep='\t') + df_cnv = pd.read_csv(cnv_segfile, header=0, sep="\t") # get clone names - calico_clones = np.array([ x.split(" ")[0][5:] for x in df_cnv.columns if x.endswith(" A") ]) + calico_clones = np.array( + [x.split(" ")[0][5:] for x in df_cnv.columns if x.endswith(" A")] + ) # retain only the clones that are not entirely diploid - calico_clones = [c for c in calico_clones if np.sum(np.abs(baf_profile[int(c),:] - 0.5) > EPS_BAF) > minlength ] + calico_clones = [ + c + for c in calico_clones + if np.sum(np.abs(baf_profile[int(c), :] - 0.5) > EPS_BAF) > minlength + ] # label CNV states per bin per clone into "neu", "del", "amp", "loh" states for c in calico_clones: - counts = df_cnv.END.values-df_cnv.START.values + counts = df_cnv.END.values - df_cnv.START.values counts = np.maximum(1, counts / 1e4).astype(int) - tmp = strict_convert_copy_to_states(df_cnv[f"clone{c} A"].values, df_cnv[f"clone{c} B"].values, counts=counts) + tmp = strict_convert_copy_to_states( + df_cnv[f"clone{c} A"].values, df_cnv[f"clone{c} B"].values, counts=counts + ) tmp[tmp == "bdel"] = "del" tmp[tmp == "bamp"] = "amp" df_cnv[f"srt_cnstate_clone{c}"] = tmp # partition the genome into segments such that the allele-specific CN across all clones are the same within each segment - segments, labs = get_intervals_nd(df_cnv[["CHR"] + [ f"clone{x} A" for x in calico_clones ] + [ f"clone{x} B" for x in calico_clones ]].values) + segments, labs = get_intervals_nd( + df_cnv[ + ["CHR"] + + [f"clone{x} A" for x in calico_clones] + + [f"clone{x} B" for x in calico_clones] + ].values + ) # collect event, that is labs and segments pair such that the cnstate is not normal events = [] for i, seg in enumerate(segments): if seg[1] - seg[0] < minlength: continue - if np.all(df_cnv[[ f"srt_cnstate_clone{x}" for x in calico_clones ]].iloc[seg[0],:].values == "neu"): + if np.all( + df_cnv[[f"srt_cnstate_clone{x}" for x in calico_clones]] + .iloc[seg[0], :] + .values + == "neu" + ): continue - acn_list = [ (df_cnv[f"srt_cnstate_clone{c}"].values[seg[0]], df_cnv[f"clone{c} A"].values[seg[0]], df_cnv[f"clone{c} B"].values[seg[0]]) for c in calico_clones ] + acn_list = [ + ( + df_cnv[f"srt_cnstate_clone{c}"].values[seg[0]], + df_cnv[f"clone{c} A"].values[seg[0]], + df_cnv[f"clone{c} B"].values[seg[0]], + ) + for c in calico_clones + ] acn_set = set(acn_list) for e in acn_set: if e[0] == "neu": continue - involved_clones = [calico_clones[i] for i in range(len(calico_clones)) if acn_list[i] == e] - events.append( pd.DataFrame({"CHR":df_cnv.CHR.values[seg[0]], "START":df_cnv.START.values[seg[0]], "END":df_cnv.END.values[seg[1]-1], "BinSTART":seg[0], "BinEND":seg[1]-1,\ - "CN":f"{e[1]}|{e[2]}", "Label":e[0], "involved_clones":",".join(involved_clones)}, index=[0]) ) + involved_clones = [ + calico_clones[i] for i in range(len(calico_clones)) if acn_list[i] == e + ] + events.append( + pd.DataFrame( + { + "CHR": df_cnv.CHR.values[seg[0]], + "START": df_cnv.START.values[seg[0]], + "END": df_cnv.END.values[seg[1] - 1], + "BinSTART": seg[0], + "BinEND": seg[1] - 1, + "CN": f"{e[1]}|{e[2]}", + "Label": e[0], + "involved_clones": ",".join(involved_clones), + }, + index=[0], + ) + ) df_events = pd.concat(events, ignore_index=True) - + # merge adjacent events if they have the same involved_clones and same CN unique_ic = np.unique(df_events.involved_clones.values) concise_events = [] - for ic in unique_ic: + for ic in unique_ic: tmpdf = df_events[df_events.involved_clones == ic] # merge 
adjacent rows in tmpdf if they have the same CN END of the previous row is the same as the START of the next row - concise_events.append( tmpdf.iloc[0:1,:] ) + concise_events.append(tmpdf.iloc[0:1, :]) for i in range(1, tmpdf.shape[0]): - if tmpdf.CN.values[i] == concise_events[-1].CN.values[0] and tmpdf.CHR.values[i] == concise_events[-1].CHR.values[0] and tmpdf.START.values[i] == concise_events[-1].END.values[0]: + if ( + tmpdf.CN.values[i] == concise_events[-1].CN.values[0] + and tmpdf.CHR.values[i] == concise_events[-1].CHR.values[0] + and tmpdf.START.values[i] == concise_events[-1].END.values[0] + ): concise_events[-1].END.values[0] = tmpdf.END.values[i] concise_events[-1].BinEND.values[0] = tmpdf.BinEND.values[i] else: - concise_events.append( tmpdf.iloc[i:(i+1),:] ) + concise_events.append(tmpdf.iloc[i : (i + 1), :]) df_concise_events = pd.concat(concise_events, ignore_index=True) # add the RDR abd BAF info @@ -1299,29 +2072,60 @@ def summary_events(cnv_segfile, rescombinefile, minlength=10): rdr_diff = np.nan * np.ones(df_concise_events.shape[0]) baf_diff = np.nan * np.ones(df_concise_events.shape[0]) for i in range(df_concise_events.shape[0]): - involved_clones = np.array([int(c) for c in df_concise_events.involved_clones.values[i].split(",")]) + involved_clones = np.array( + [int(c) for c in df_concise_events.involved_clones.values[i].split(",")] + ) bs = df_concise_events.BinSTART.values[i] be = df_concise_events.BinEND.values[i] # rdr[i] = np.exp(np.mean(res_combine["new_log_mu"][ (pred_cnv[bs:be,:][:,involved_clones].flatten(), np.tile(involved_clones, be-bs)) ])) # baf[i] = np.mean(res_combine["new_p_binom"][ (pred_cnv[bs:be,:][:,involved_clones].flatten(), np.tile(involved_clones, be-bs)) ]) - rdr[i] = np.exp(np.mean( np.concatenate([logrdr_profile[i, bs:be] for i in involved_clones ]) )) - baf[i] = np.mean( np.concatenate([baf_profile[i, bs:be] for i in involved_clones ]) ) + rdr[i] = np.exp( + np.mean(np.concatenate([logrdr_profile[i, bs:be] for i in involved_clones])) + ) + baf[i] = np.mean( + np.concatenate([baf_profile[i, bs:be] for i in involved_clones]) + ) # get the uninvolved clones - uninvolved_clones = np.array([int(c)for c in calico_clones if int(c) not in involved_clones]) + uninvolved_clones = np.array( + [int(c) for c in calico_clones if int(c) not in involved_clones] + ) if len(uninvolved_clones) > 0: # rdr_diff[i] = np.exp(np.mean(res_combine["new_log_mu"][ (pred_cnv[bs:be,:][:,uninvolved_clones].flatten(), np.tile(uninvolved_clones, be-bs)) ])) - rdr[i] # baf_diff[i] = np.mean(res_combine["new_p_binom"][ (pred_cnv[bs:be,:][:,uninvolved_clones].flatten(), np.tile(uninvolved_clones, be-bs)) ]) - baf[i] - rdr_diff[i] = rdr[i] - np.exp(np.mean( np.concatenate([logrdr_profile[i, bs:be] for i in uninvolved_clones ]) )) - baf_diff[i] = baf[i] - np.mean( np.concatenate([baf_profile[i, bs:be] for i in uninvolved_clones ]) ) + rdr_diff[i] = rdr[i] - np.exp( + np.mean( + np.concatenate( + [logrdr_profile[i, bs:be] for i in uninvolved_clones] + ) + ) + ) + baf_diff[i] = baf[i] - np.mean( + np.concatenate([baf_profile[i, bs:be] for i in uninvolved_clones]) + ) df_concise_events["RDR"] = rdr df_concise_events["BAF"] = baf df_concise_events["RDR_diff"] = rdr_diff df_concise_events["BAF_diff"] = baf_diff - return df_concise_events[["CHR", "START", "END", "BinSTART", "BinEND", "RDR", "BAF", "RDR_diff", "BAF_diff", "CN", "Label", "involved_clones"]] - - -def get_best_initialization(output_dir): + return df_concise_events[ + [ + "CHR", + "START", + "END", + "BinSTART", + 
"BinEND", + "RDR", + "BAF", + "RDR_diff", + "BAF_diff", + "CN", + "Label", + "involved_clones", + ] + ] + + +def get_best_initialization(output_dir): """ find the best HMRF initialization random seed """ @@ -1331,7 +2135,12 @@ def get_best_initialization(output_dir): for file in rdrbaf_files: outdir = file.parent res_combine = dict(np.load(str(file)), allow_pickle=True) - df.append( pd.DataFrame({'outdir':str(outdir), "log-likelihood":res_combine["total_llf"]}, index=[0]) ) + df.append( + pd.DataFrame( + {"outdir": str(outdir), "log-likelihood": res_combine["total_llf"]}, + index=[0], + ) + ) df = pd.concat(df, ignore_index=True) idx = np.argmax(df["log-likelihood"]) return df["outdir"].iloc[idx] diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 7e3cbbb..6f6ec02 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -27,8 +27,8 @@ def convert_params(mean, std): See https://mathworld.wolfram.com/NegativeBinomialDistribution.html """ - p = mean/std**2 - n = mean*p/(1.0 - p) + p = mean / std**2 + n = mean * p / (1.0 - p) return n, p @@ -51,11 +51,13 @@ class Weighted_NegativeBinomial(GenericLikelihoodModel): exposure : array, (n_samples,) Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ + def __init__(self, endog, exog, weights, exposure, seed=0, **kwds): super(Weighted_NegativeBinomial, self).__init__(endog, exog, **kwds) self.weights = weights self.exposure = exposure self.seed = seed + # def nloglikeobs(self, params): nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure @@ -64,18 +66,19 @@ def nloglikeobs(self, params): llf = scipy.stats.nbinom.logpmf(self.endog, n, p) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): - self.exog_names.append('alpha') + self.exog_names.append("alpha") if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - return super(Weighted_NegativeBinomial, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) + return super(Weighted_NegativeBinomial, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class Weighted_NegativeBinomial_mix(GenericLikelihoodModel): @@ -85,25 +88,29 @@ def __init__(self, endog, exog, weights, exposure, tumor_prop, seed=0, **kwds): self.exposure = exposure self.seed = seed self.tumor_prop = tumor_prop + # def nloglikeobs(self, params): - nb_mean = self.exposure * (self.tumor_prop * np.exp(self.exog @ params[:-1]) + 1 - self.tumor_prop) + nb_mean = self.exposure * ( + self.tumor_prop * np.exp(self.exog @ params[:-1]) + 1 - self.tumor_prop + ) nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) llf = scipy.stats.nbinom.logpmf(self.endog, n, p) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): - self.exog_names.append('alpha') + self.exog_names.append("alpha") if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - return super(Weighted_NegativeBinomial_mix, self).fit(start_params=start_params, - maxiter=maxiter, 
maxfun=maxfun, - **kwds) + return super(Weighted_NegativeBinomial_mix, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class Weighted_BetaBinom(GenericLikelihoodModel): @@ -125,10 +132,12 @@ class Weighted_BetaBinom(GenericLikelihoodModel): exposure : array, (n_samples,) Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. """ + def __init__(self, endog, exog, weights, exposure, **kwds): super(Weighted_BetaBinom, self).__init__(endog, exog, **kwds) self.weights = weights self.exposure = exposure + # def nloglikeobs(self, params): a = (self.exog @ params[:-1]) * params[-1] @@ -136,17 +145,20 @@ def nloglikeobs(self, params): llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): self.exog_names.append("tau") if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: - start_params = np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) - return super(Weighted_BetaBinom, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) + start_params = np.append( + 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 + ) + return super(Weighted_BetaBinom, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class Weighted_BetaBinom_mix(GenericLikelihoodModel): @@ -155,24 +167,33 @@ def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds): self.weights = weights self.exposure = exposure self.tumor_prop = tumor_prop + # def nloglikeobs(self, params): - a = (self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * params[-1] - b = ((1 - self.exog @ params[:-1]) * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * params[-1] + a = ( + self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop) + ) * params[-1] + b = ( + (1 - self.exog @ params[:-1]) * self.tumor_prop + + 0.5 * (1 - self.tumor_prop) + ) * params[-1] llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): self.exog_names.append("tau") if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: - start_params = np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) - return super(Weighted_BetaBinom_mix, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) + start_params = np.append( + 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 + ) + return super(Weighted_BetaBinom_mix, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): @@ -181,6 +202,7 @@ def __init__(self, endog, exog, tau, weights, exposure, **kwds): self.tau = tau self.weights = weights self.exposure = exposure + # def nloglikeobs(self, params): a = (self.exog @ params) * self.tau @@ -188,17 +210,18 @@ def nloglikeobs(self, params): llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): 
start_params = self.start_params else: start_params = 0.1 * np.ones(self.nparams) - - return super(Weighted_BetaBinom_fixdispersion, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) + + return super(Weighted_BetaBinom_fixdispersion, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class Weighted_BetaBinom_fixdispersion_mix(GenericLikelihoodModel): @@ -208,24 +231,30 @@ def __init__(self, endog, exog, tau, weights, exposure, tumor_prop, **kwds): self.weights = weights self.exposure = exposure self.tumor_prop = tumor_prop + # def nloglikeobs(self, params): - a = (self.exog @ params * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * self.tau - b = ((1 - self.exog @ params) * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * self.tau + a = ( + self.exog @ params * self.tumor_prop + 0.5 * (1 - self.tumor_prop) + ) * self.tau + b = ( + (1 - self.exog @ params) * self.tumor_prop + 0.5 * (1 - self.tumor_prop) + ) * self.tau llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: start_params = 0.1 * np.ones(self.nparams) - - return super(Weighted_BetaBinom_fixdispersion_mix, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) + + return super(Weighted_BetaBinom_fixdispersion_mix, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) class BAF_Binom(GenericLikelihoodModel): @@ -247,12 +276,14 @@ class BAF_Binom(GenericLikelihoodModel): exposure : array, (n_samples,) Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
""" + def __init__(self, endog, exog, weights, exposure, offset, scaling, **kwds): super(BAF_Binom, self).__init__(endog, exog, **kwds) self.weights = weights self.exposure = exposure self.offset = offset self.scaling = scaling + # def nloglikeobs(self, params): linear_term = self.exog @ params @@ -260,13 +291,14 @@ def nloglikeobs(self, params): llf = scipy.stats.binom.logpmf(self.endog, self.exposure, p) neg_sum_llf = -llf.dot(self.weights) return neg_sum_llf + # def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): if start_params is None: - if hasattr(self, 'start_params'): + if hasattr(self, "start_params"): start_params = self.start_params else: - start_params = 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams) - return super(BAF_Binom, self).fit(start_params=start_params, - maxiter=maxiter, maxfun=maxfun, - **kwds) \ No newline at end of file + start_params = 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams) + return super(BAF_Binom, self).fit( + start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + ) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 9c22aaf..2a22f4d 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -27,11 +27,11 @@ def np_max_ax_keep(arr, axis=0): assert arr.ndim == 2 assert axis in [0, 1] if axis == 0: - result = np.zeros( (1, arr.shape[1]) ) + result = np.zeros((1, arr.shape[1])) for i in range(result.shape[1]): result[:, i] = np.max(arr[:, i]) else: - result = np.zeros( (arr.shape[0], 1) ) + result = np.zeros((arr.shape[0], 1)) for i in range(result.shape[0]): result[i, :] = np.max(arr[i, :]) return result @@ -57,11 +57,11 @@ def np_sum_ax_keep(arr, axis=0): assert arr.ndim == 2 assert axis in [0, 1] if axis == 0: - result = np.zeros( (1, arr.shape[1]) ) + result = np.zeros((1, arr.shape[1])) for i in range(result.shape[1]): result[:, i] = np.sum(arr[:, i]) else: - result = np.zeros( (arr.shape[0], 1) ) + result = np.zeros((arr.shape[0], 1)) for i in range(result.shape[0]): result[i, :] = np.sum(arr[i, :]) return result @@ -81,26 +81,27 @@ def np_mean_ax_squeeze(arr, axis=0): result[i] = np.mean(arr[i, :]) return result + @njit def np_mean_ax_keep(arr, axis=0): assert arr.ndim == 2 assert axis in [0, 1] if axis == 0: - result = np.zeros( (1, arr.shape[1]) ) + result = np.zeros((1, arr.shape[1])) for i in range(result.shape[1]): result[:, i] = np.mean(arr[:, i]) else: - result = np.zeros( (arr.shape[0], 1) ) + result = np.zeros((arr.shape[0], 1)) for i in range(result.shape[0]): result[i, :] = np.mean(arr[i, :]) return result -@njit +@njit def mylogsumexp(a): # get max a_max = np.max(a) - if (np.isinf(a_max)): + if np.isinf(a_max): return a_max # exponential tmp = np.exp(a - a_max) @@ -110,7 +111,7 @@ def mylogsumexp(a): return s + a_max -@njit +@njit def mylogsumexp_ax_keep(a, axis): # get max a_max = np_max_ax_keep(a, axis=axis) @@ -132,7 +133,7 @@ def construct_unique_matrix(obs_count, total_count): ---------- allele_count : array, shape (n_observations, n_spots) Observed A allele counts per SNP per spot. - + total_bb_RD : array, shape (n_observations, n_spots) Total SNP-covering reads per SNP per spot. 
""" @@ -142,41 +143,69 @@ def construct_unique_matrix(obs_count, total_count): mapping_matrices = [] for s in range(n_spots): if total_count.dtype == int: - pairs = np.unique( np.vstack([obs_count[:,s], total_count[:,s]]).T, axis=0 ) + pairs = np.unique(np.vstack([obs_count[:, s], total_count[:, s]]).T, axis=0) else: - pairs = np.unique( np.vstack([obs_count[:,s], total_count[:,s]]).T.round(decimals=4), axis=0 ) - unique_values.append( pairs ) - pair_index = {(pairs[i,0], pairs[i,1]):i for i in range(pairs.shape[0])} + pairs = np.unique( + np.vstack([obs_count[:, s], total_count[:, s]]).T.round(decimals=4), + axis=0, + ) + unique_values.append(pairs) + pair_index = {(pairs[i, 0], pairs[i, 1]): i for i in range(pairs.shape[0])} # construct mapping matrix mat_row = np.arange(n_obs) mat_col = np.zeros(n_obs, dtype=int) for i in range(n_obs): if total_count.dtype == int: - tmpidx = pair_index[(obs_count[i,s], total_count[i,s])] + tmpidx = pair_index[(obs_count[i, s], total_count[i, s])] else: - tmpidx = pair_index[(obs_count[i,s], total_count[i,s].round(decimals=4))] + tmpidx = pair_index[ + (obs_count[i, s], total_count[i, s].round(decimals=4)) + ] mat_col[i] = tmpidx - mapping_matrices.append( scipy.sparse.csr_matrix((np.ones(len(mat_row)), (mat_row, mat_col) )) ) + mapping_matrices.append( + scipy.sparse.csr_matrix((np.ones(len(mat_row)), (mat_row, mat_col))) + ) return unique_values, mapping_matrices -def initialization_by_gmm(n_states, X, base_nb_mean, total_bb_RD, params, random_state=None, in_log_space=True, only_minor=True, min_binom_prob=0.1, max_binom_prob=0.9): +def initialization_by_gmm( + n_states, + X, + base_nb_mean, + total_bb_RD, + params, + random_state=None, + in_log_space=True, + only_minor=True, + min_binom_prob=0.1, + max_binom_prob=0.9, +): # prepare gmm input of RDR and BAF separately X_gmm_rdr = None X_gmm_baf = None if "m" in params: if in_log_space: - X_gmm_rdr = np.vstack([ np.log(X[:,0,s]/base_nb_mean[:,s]) for s in range(X.shape[2]) ]).T + X_gmm_rdr = np.vstack( + [np.log(X[:, 0, s] / base_nb_mean[:, s]) for s in range(X.shape[2])] + ).T offset = np.mean(X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))]) - normalizetomax1 = np.max(X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))]) - np.min(X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))]) + normalizetomax1 = np.max( + X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))] + ) - np.min(X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))]) X_gmm_rdr = (X_gmm_rdr - offset) / normalizetomax1 else: - X_gmm_rdr = np.vstack([ X[:,0,s]/base_nb_mean[:,s] for s in range(X.shape[2]) ]).T + X_gmm_rdr = np.vstack( + [X[:, 0, s] / base_nb_mean[:, s] for s in range(X.shape[2])] + ).T offset = 0 - normalizetomax1 = np.max(X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))]) + normalizetomax1 = np.max( + X_gmm_rdr[(~np.isnan(X_gmm_rdr)) & (~np.isinf(X_gmm_rdr))] + ) X_gmm_rdr = (X_gmm_rdr - offset) / normalizetomax1 if "p" in params: - X_gmm_baf = np.vstack([ X[:,1,s] / total_bb_RD[:,s] for s in range(X.shape[2]) ]).T + X_gmm_baf = np.vstack( + [X[:, 1, s] / total_bb_RD[:, s] for s in range(X.shape[2])] + ).T X_gmm_baf[X_gmm_baf < min_binom_prob] = min_binom_prob X_gmm_baf[X_gmm_baf > max_binom_prob] = max_binom_prob # combine RDR and BAF @@ -203,21 +232,31 @@ def initialization_by_gmm(n_states, X, base_nb_mean, total_bb_RD, params, random if random_state is None: gmm = GaussianMixture(n_components=n_states, max_iter=1).fit(X_gmm) else: - gmm = GaussianMixture(n_components=n_states, 
max_iter=1, random_state=random_state).fit(X_gmm) + gmm = GaussianMixture( + n_components=n_states, max_iter=1, random_state=random_state + ).fit(X_gmm) # turn gmm fitted parameters to HMM log_mu and p_binom parameters if ("m" in params) and ("p" in params): - gmm_log_mu = gmm.means_[:,:X.shape[2]] * normalizetomax1 + offset if in_log_space else np.log(gmm.means_[:,:X.shape[2]] * normalizetomax1 + offset) - gmm_p_binom = gmm.means_[:, X.shape[2]:] + gmm_log_mu = ( + gmm.means_[:, : X.shape[2]] * normalizetomax1 + offset + if in_log_space + else np.log(gmm.means_[:, : X.shape[2]] * normalizetomax1 + offset) + ) + gmm_p_binom = gmm.means_[:, X.shape[2] :] if only_minor: - gmm_p_binom = np.where(gmm_p_binom > 0.5, 1-gmm_p_binom, gmm_p_binom) + gmm_p_binom = np.where(gmm_p_binom > 0.5, 1 - gmm_p_binom, gmm_p_binom) elif "m" in params: - gmm_log_mu = gmm.means_ * normalizetomax1 + offset if in_log_space else np.log(gmm.means_[:,:X.shape[2]] * normalizetomax1 + offset) + gmm_log_mu = ( + gmm.means_ * normalizetomax1 + offset + if in_log_space + else np.log(gmm.means_[:, : X.shape[2]] * normalizetomax1 + offset) + ) gmm_p_binom = None elif "p" in params: gmm_log_mu = None gmm_p_binom = gmm.means_ if only_minor: - gmm_p_binom = np.where(gmm_p_binom > 0.5, 1-gmm_p_binom, gmm_p_binom) + gmm_p_binom = np.where(gmm_p_binom > 0.5, 1 - gmm_p_binom, gmm_p_binom) return gmm_log_mu, gmm_p_binom @@ -225,14 +264,15 @@ def initialization_by_gmm(n_states, X, base_nb_mean, total_bb_RD, params, random # E step related ############################################################ + def compute_posterior_obs(log_alpha, log_beta): - ''' + """ Input log_alpha: output from forward_lattice_gaussian. size n_states * n_observations. alpha[j, t] = P(o_1, ... o_t, q_t = j | lambda). log_beta: output from backward_lattice_gaussian. size n_states * n_observations. beta[i, t] = P(o_{t+1}, ..., o_T | q_t = i, lambda). Output: log_gamma: size n_states * n_observations. gamma[i,t] = P(q_t = i | O, lambda). gamma[i, t] propto alpha[i,t] * beta[i,t] - ''' + """ n_states = log_alpha.shape[0] n_obs = log_alpha.shape[1] # initial log_gamma @@ -242,15 +282,17 @@ def compute_posterior_obs(log_alpha, log_beta): # for t in np.arange(n_obs): # log_gamma[j, t] = log_alpha[j, t] + log_beta[j, t] log_gamma = log_alpha + log_beta - if np.any( np.sum(log_gamma, axis=0) == 0 ): + if np.any(np.sum(log_gamma, axis=0) == 0): raise Exception("Sum of posterior probability is zero for some observations!") log_gamma -= scipy.special.logsumexp(log_gamma, axis=0) return log_gamma @njit -def compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log_emission): - ''' +def compute_posterior_transition_sitewise( + log_alpha, log_beta, log_transmat, log_emission +): + """ Input log_alpha: output from forward_lattice_gaussian. size n_states * n_observations. alpha[j, t] = P(o_1, ... o_t, q_t = j | lambda). log_beta: output from backward_lattice_gaussian. size n_states * n_observations. beta[i, t] = P(o_{t+1}, ..., o_T | q_t = i, lambda). @@ -258,27 +300,37 @@ def compute_posterior_transition_sitewise(log_alpha, log_beta, log_transmat, log log_emission: n_states * n_observations * n_spots. Log probability. Output: log_xi: size n_states * n_states * (n_observations-1). 
xi[i,j,t] = P(q_t=i, q_{t+1}=j | O, lambda) - ''' + """ n_states = int(log_alpha.shape[0] / 2) n_obs = log_alpha.shape[1] # initialize log_xi - log_xi = np.zeros((2*n_states, 2*n_states, n_obs-1)) + log_xi = np.zeros((2 * n_states, 2 * n_states, n_obs - 1)) # compute log_xi - for i in np.arange(2*n_states): - for j in np.arange(2*n_states): - for t in np.arange(n_obs-1): - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + for i in np.arange(2 * n_states): + for j in np.arange(2 * n_states): + for t in np.arange(n_obs - 1): + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. - log_xi[i, j, t] = log_alpha[i, t] + log_transmat[i - n_states * int(i/n_states), j - n_states * int(j/n_states)] + np.sum(log_emission[j, t+1, :]) + log_beta[j, t+1] + log_xi[i, j, t] = ( + log_alpha[i, t] + + log_transmat[ + i - n_states * int(i / n_states), + j - n_states * int(j / n_states), + ] + + np.sum(log_emission[j, t + 1, :]) + + log_beta[j, t + 1] + ) # normalize - for t in np.arange(n_obs-1): + for t in np.arange(n_obs - 1): log_xi[:, :, t] -= mylogsumexp(log_xi[:, :, t]) return log_xi @njit -def compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, log_emission): - ''' +def compute_posterior_transition_nophasing( + log_alpha, log_beta, log_transmat, log_emission +): + """ Input log_alpha: output from forward_lattice_gaussian. size n_states * n_observations. alpha[j, t] = P(o_1, ... o_t, q_t = j | lambda). log_beta: output from backward_lattice_gaussian. size n_states * n_observations. beta[i, t] = P(o_{t+1}, ..., o_T | q_t = i, lambda). @@ -286,20 +338,25 @@ def compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, lo log_emission: n_states * n_observations * n_spots. Log probability. Output: log_xi: size n_states * n_states * (n_observations-1). xi[i,j,t] = P(q_t=i, q_{t+1}=j | O, lambda) - ''' + """ n_states = int(log_alpha.shape[0] / 2) n_obs = log_alpha.shape[1] # initialize log_xi - log_xi = np.zeros((n_states, n_states, n_obs-1)) + log_xi = np.zeros((n_states, n_states, n_obs - 1)) # compute log_xi for i in np.arange(n_states): for j in np.arange(n_states): - for t in np.arange(n_obs-1): - # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. + for t in np.arange(n_obs - 1): + # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. # But adding too many spots may lead to a higher weight of the emission rather then transition prob. - log_xi[i, j, t] = log_alpha[i, t] + log_transmat[i, j] + np.sum(log_emission[j, t+1, :]) + log_beta[j, t+1] + log_xi[i, j, t] = ( + log_alpha[i, t] + + log_transmat[i, j] + + np.sum(log_emission[j, t + 1, :]) + + log_beta[j, t + 1] + ) # normalize - for t in np.arange(n_obs-1): + for t in np.arange(n_obs - 1): log_xi[:, :, t] -= mylogsumexp(log_xi[:, :, t]) return log_xi @@ -308,18 +365,21 @@ def compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, lo # M step related (HMM phasing) ############################################################ + @njit def update_startprob_sitewise(lengths, log_gamma): - ''' + """ Input lengths: sum of lengths = n_observations. log_gamma: size 2 * n_states * n_observations. gamma[i,t] = P(q_t = i | O, lambda). 
Output log_startprob: n_states. Start probability after loog transformation. - ''' + """ n_states = int(log_gamma.shape[0] / 2) n_obs = log_gamma.shape[1] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the second dimension of log_gamma!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the second dimension of log_gamma!" # indices of the start of sequences, given that the length of each sequence is in lengths cumlen = 0 indices_start = [] @@ -332,7 +392,7 @@ def update_startprob_sitewise(lengths, log_gamma): # compute log_startprob of 2 * n_states log_startprob = mylogsumexp_ax_keep(log_gamma[:, indices_start], axis=1) # merge (CNV state, phase A) and (CNV state, phase B) - log_startprob = log_startprob.flatten().reshape(2,-1) + log_startprob = log_startprob.flatten().reshape(2, -1) log_startprob = mylogsumexp_ax_keep(log_startprob, axis=0) # normalize such that startprob sums to 1 log_startprob -= mylogsumexp(log_startprob) @@ -340,20 +400,28 @@ def update_startprob_sitewise(lengths, log_gamma): def update_transition_sitewise(log_xi, is_diag=False): - ''' + """ Input log_xi: size (2*n_states) * (2*n_states) * n_observations. xi[i,j,t] = P(q_t=i, q_{t+1}=j | O, lambda) Output log_transmat: n_states * n_states. Transition probability after log transformation. - ''' + """ n_states = int(log_xi.shape[0] / 2) n_obs = log_xi.shape[2] # initialize log_transmat log_transmat = np.zeros((n_states, n_states)) for i in np.arange(n_states): for j in np.arange(n_states): - log_transmat[i, j] = scipy.special.logsumexp( np.concatenate([log_xi[i, j, :], log_xi[i+n_states, j, :], \ - log_xi[i, j+n_states, :], log_xi[i + n_states, j + n_states, :]]) ) + log_transmat[i, j] = scipy.special.logsumexp( + np.concatenate( + [ + log_xi[i, j, :], + log_xi[i + n_states, j, :], + log_xi[i, j + n_states, :], + log_xi[i + n_states, j + n_states, :], + ] + ) + ) # row normalize log_transmat if not is_diag: for i in np.arange(n_states): @@ -363,14 +431,25 @@ def update_transition_sitewise(log_xi, is_diag=False): diagsum = scipy.special.logsumexp(np.diag(log_transmat)) totalsum = scipy.special.logsumexp(log_transmat) t = diagsum - totalsum - rest = np.log( (1 - np.exp(t)) / (n_states-1) ) + rest = np.log((1 - np.exp(t)) / (n_states - 1)) log_transmat = np.ones(log_transmat.shape) * rest np.fill_diagonal(log_transmat, t) return log_transmat -def update_emission_params_nb_sitewise_uniqvalues(unique_values, mapping_matrices, log_gamma, base_nb_mean, alphas, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2, min_estep_weight=0.1): +def update_emission_params_nb_sitewise_uniqvalues( + unique_values, + mapping_matrices, + log_gamma, + base_nb_mean, + alphas, + start_log_mu=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + min_log_rdr=-2, + max_log_rdr=2, + min_estep_weight=0.1, +): """ Attributes ---------- @@ -387,41 +466,79 @@ def update_emission_params_nb_sitewise_uniqvalues(unique_values, mapping_matrice n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) # initialization - new_log_mu = copy.copy(start_log_mu) if not start_log_mu is None else np.zeros((n_states, n_spots)) + new_log_mu = ( + copy.copy(start_log_mu) + if not start_log_mu is None + else np.zeros((n_states, n_spots)) + ) new_alphas = copy.copy(alphas) # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ 
mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - model = sm.GLM(unique_values[s][idx_nonzero,0], np.ones(len(idx_nonzero)).reshape(-1,1), \ - family=sm.families.NegativeBinomial(alpha=alphas[i,s]), \ - exposure=unique_values[s][idx_nonzero,1], var_weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero]) + model = sm.GLM( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + family=sm.families.NegativeBinomial(alpha=alphas[i, s]), + exposure=unique_values[s][idx_nonzero, 1], + var_weights=tmp[i, idx_nonzero] + tmp[i + n_states, idx_nonzero], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array([start_log_mu[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if -model.loglike(res.params) < -model.loglike(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array([start_log_mu[i, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if -model.loglike(res.params) < -model.loglike(res2.params) + else res2.params[0] + ) else: if not shared_NB_dispersion: for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - model = Weighted_NegativeBinomial(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1], \ - penalty=0) + model = Weighted_NegativeBinomial( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=tmp[i, idx_nonzero] + tmp[i + n_states, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + penalty=0, + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] new_alphas[i, s] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_log_mu[i, s]], [alphas[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_alphas[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -429,50 +546,93 @@ def update_emission_params_nb_sitewise_uniqvalues(unique_values, mapping_matrice features = [] state_posweights = [] for s in range(n_spots): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile(unique_values[s][idx_nonzero,1], n_states) - this_y = np.tile(unique_values[s][idx_nonzero,0], n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) + this_y = np.tile(unique_values[s][idx_nonzero, 0], n_states) tmp = 
(scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_weights = np.concatenate([ tmp[i,idx_nonzero] + tmp[i+n_states,idx_nonzero] for i in range(n_states) ]) - this_features = np.zeros((n_states*len(idx_nonzero), n_states)) + this_weights = np.concatenate( + [ + tmp[i, idx_nonzero] + tmp[i + n_states, idx_nonzero] + for i in range(n_states) + ] + ) + this_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 + this_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= min_estep_weight ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) + idx_state_posweight = np.array( + [ + i + for i in range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) + >= min_estep_weight + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - model = Weighted_NegativeBinomial(y, features, weights=weights, exposure=exposure) + model = Weighted_NegativeBinomial( + y, features, weights=weights, exposure=exposure + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] + new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_log_mu[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * alphas[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in 
state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas -def update_emission_params_nb_sitewise_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, base_nb_mean, alphas, tumor_prop, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): +def update_emission_params_nb_sitewise_uniqvalues_mix( + unique_values, + mapping_matrices, + log_gamma, + base_nb_mean, + alphas, + tumor_prop, + start_log_mu=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + min_log_rdr=-2, + max_log_rdr=2, +): """ Attributes ---------- @@ -489,42 +649,85 @@ def update_emission_params_nb_sitewise_uniqvalues_mix(unique_values, mapping_mat n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) # initialization - new_log_mu = copy.copy(start_log_mu) if not start_log_mu is None else np.zeros((n_states, n_spots)) + new_log_mu = ( + copy.copy(start_log_mu) + if not start_log_mu is None + else np.zeros((n_states, n_spots)) + ) new_alphas = copy.copy(alphas) # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - model = sm.GLM(unique_values[s][idx_nonzero,0], np.ones(len(idx_nonzero)).reshape(-1,1), \ - family=sm.families.NegativeBinomial(alpha=alphas[i,s]), \ - exposure=unique_values[s][idx_nonzero,1], var_weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero]) + model = sm.GLM( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + family=sm.families.NegativeBinomial(alpha=alphas[i, s]), + exposure=unique_values[s][idx_nonzero, 1], + var_weights=tmp[i, idx_nonzero] + tmp[i + n_states, idx_nonzero], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array([start_log_mu[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if -model.loglike(res.params) < -model.loglike(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array([start_log_mu[i, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if -model.loglike(res.params) < -model.loglike(res2.params) + else res2.params[0] + ) else: if not shared_NB_dispersion: for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] - model = Weighted_NegativeBinomial_mix(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=tmp[i,idx_nonzero]+tmp[i+n_states,idx_nonzero], exposure=unique_values[s][idx_nonzero,1], \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s], penalty=0) + this_tp = (mapping_matrices[s].T @ tumor_prop[:, s])[ + idx_nonzero + ] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + 
idx_nonzero + ] + model = Weighted_NegativeBinomial_mix( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=tmp[i, idx_nonzero] + tmp[i + n_states, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + tumor_prop=this_tp, + ) + # tumor_prop=tumor_prop[s], penalty=0) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] new_alphas[i, s] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_log_mu[i, s]], [alphas[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_alphas[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -533,56 +736,108 @@ def update_emission_params_nb_sitewise_uniqvalues_mix(unique_values, mapping_mat state_posweights = [] tp = [] for s in range(n_spots): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile(unique_values[s][idx_nonzero,1], n_states) - this_y = np.tile(unique_values[s][idx_nonzero,0], n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) + this_y = np.tile(unique_values[s][idx_nonzero, 0], n_states) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_tp = np.tile( (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero], n_states) - assert np.all(this_tp < 1+1e-4) - this_weights = np.concatenate([ tmp[i,idx_nonzero] + tmp[i+n_states,idx_nonzero] for i in range(n_states) ]) - this_features = np.zeros((n_states*len(idx_nonzero), n_states)) + this_tp = np.tile( + (mapping_matrices[s].T @ tumor_prop[:, s])[idx_nonzero] + / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ], + n_states, + ) + assert np.all(this_tp < 1 + 1e-4) + this_weights = np.concatenate( + [ + tmp[i, idx_nonzero] + tmp[i + n_states, idx_nonzero] + for i in range(n_states) + ] + ) + this_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 + this_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) - tp.append( this_tp[idx_row_posweight] ) + idx_state_posweight = np.array( + [ + i + for i in range(this_features.shape[1]) + if 
np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) + tp.append(this_tp[idx_row_posweight]) # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_NegativeBinomial_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp, penalty=0) + model = Weighted_NegativeBinomial_mix( + y, + features, + weights=weights, + exposure=exposure, + tumor_prop=tp, + penalty=0, + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] + new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_log_mu[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * alphas[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas -def update_emission_params_bb_sitewise_uniqvalues(unique_values, mapping_matrices, log_gamma, total_bb_RD, taus, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): +def update_emission_params_bb_sitewise_uniqvalues( + unique_values, + mapping_matrices, + log_gamma, + total_bb_RD, + taus, + start_p_binom=None, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + percent_threshold=0.99, + min_binom_prob=0.01, + max_binom_prob=0.99, +): """ Attributes ---------- @@ -599,44 +854,106 @@ def update_emission_params_bb_sitewise_uniqvalues(unique_values, mapping_matrice n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) # initialization - new_p_binom = 
copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 + new_p_binom = ( + copy.copy(start_p_binom) + if not start_p_binom is None + else np.ones((n_states, n_spots)) * 0.5 + ) new_taus = copy.copy(taus) if fix_BB_dispersion: for s in np.arange(len(unique_values)): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) + np.sum(tmp[i+n_states,idx_nonzero]) >= 0.1: - model = Weighted_BetaBinom_fixdispersion(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ - np.ones(2*len(idx_nonzero)).reshape(-1,1), \ - taus[i,s], \ - weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ - exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]) ) + if ( + np.sum(tmp[i, idx_nonzero]) + np.sum(tmp[i + n_states, idx_nonzero]) + >= 0.1 + ): + model = Weighted_BetaBinom_fixdispersion( + np.append( + unique_values[s][idx_nonzero, 0], + unique_values[s][idx_nonzero, 1] + - unique_values[s][idx_nonzero, 0], + ), + np.ones(2 * len(idx_nonzero)).reshape(-1, 1), + taus[i, s], + weights=np.append( + tmp[i, idx_nonzero], tmp[i + n_states, idx_nonzero] + ), + exposure=np.append( + unique_values[s][idx_nonzero, 1], + unique_values[s][idx_nonzero, 1], + ), + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array(start_p_binom[i, s]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array(start_p_binom[i, s]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) else: if not shared_BB_dispersion: for s in np.arange(len(unique_values)): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) + np.sum(tmp[i+n_states,idx_nonzero]) >= 0.1: - model = Weighted_BetaBinom(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ - np.ones(2*len(idx_nonzero)).reshape(-1,1), \ - weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ - exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]) ) + if ( + np.sum(tmp[i, idx_nonzero]) + + np.sum(tmp[i + n_states, idx_nonzero]) + >= 0.1 + ): + model = Weighted_BetaBinom( + np.append( + unique_values[s][idx_nonzero, 0], + unique_values[s][idx_nonzero, 1] + - unique_values[s][idx_nonzero, 0], + ), + np.ones(2 * len(idx_nonzero)).reshape(-1, 1), + weights=np.append( + tmp[i, idx_nonzero], tmp[i + n_states, idx_nonzero] + ), + exposure=np.append( + unique_values[s][idx_nonzero, 1], + unique_values[s][idx_nonzero, 1], + ), + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = 
res.params[0] new_taus[i, s] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_p_binom[i, s]], [taus[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_taus[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -644,51 +961,104 @@ def update_emission_params_bb_sitewise_uniqvalues(unique_values, mapping_matrice features = [] state_posweights = [] for s in np.arange(len(unique_values)): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile( np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]), n_states) - this_y = np.tile( np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile( + np.append( + unique_values[s][idx_nonzero, 1], + unique_values[s][idx_nonzero, 1], + ), + n_states, + ) + this_y = np.tile( + np.append( + unique_values[s][idx_nonzero, 0], + unique_values[s][idx_nonzero, 1] + - unique_values[s][idx_nonzero, 0], + ), + n_states, + ) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_weights = np.concatenate([ np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]) for i in range(n_states) ]) - this_features = np.zeros((2*n_states*len(idx_nonzero), n_states)) + this_weights = np.concatenate( + [ + np.append(tmp[i, idx_nonzero], tmp[i + n_states, idx_nonzero]) + for i in range(n_states) + ] + ) + this_features = np.zeros((2 * n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*2*len(idx_nonzero)):((i+1)*2*len(idx_nonzero)), i] = 1 + this_features[ + (i * 2 * len(idx_nonzero)) : ((i + 1) * 2 * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) + idx_state_posweight = np.array( + [ + i + for i in range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) exposure = np.concatenate(exposure) y = np.concatenate(y) 
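# [editor's note] Illustrative sketch, not part of this patch: how the block-diagonal
# design matrix built just below (scipy.linalg.block_diag(*features)) lets a single
# Weighted_BetaBinom fit estimate one p parameter per (spot, state) column while
# sharing a single trailing dispersion tau. All demo_* names are hypothetical.
import numpy as np
import scipy.linalg

demo_n_states = 3
demo_rows_per_spot = [4, 5]                      # observations contributed by each spot
demo_features = []
for n_rows in demo_rows_per_spot:
    block = np.zeros((demo_n_states * n_rows, demo_n_states))
    for i in range(demo_n_states):
        block[i * n_rows:(i + 1) * n_rows, i] = 1.0   # indicator column for state i
    demo_features.append(block)
demo_design = scipy.linalg.block_diag(*demo_features)
print(demo_design.shape)                         # (27, 6): one column per (spot, state)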
weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] + new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_p_binom[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * taus[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob return new_p_binom, new_taus -def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, total_bb_RD, taus, tumor_prop, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): +def update_emission_params_bb_sitewise_uniqvalues_mix( + unique_values, + mapping_matrices, + log_gamma, + total_bb_RD, + taus, + tumor_prop, + start_p_binom=None, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + percent_threshold=0.99, + min_binom_prob=0.01, + max_binom_prob=0.99, +): """ Attributes ---------- @@ -705,52 +1075,122 @@ def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_mat n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) # initialization - new_p_binom = copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 + new_p_binom = ( + copy.copy(start_p_binom) + if not start_p_binom is None + else np.ones((n_states, n_spots)) * 0.5 + ) new_taus = copy.copy(taus) if fix_BB_dispersion: for s in np.arange(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) + np.sum(tmp[i+n_states,idx_nonzero]) >= 0.1: - this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ 
np.ones(tumor_prop.shape[0]))[idx_nonzero] - assert np.all(this_tp < 1+1e-4) - model = Weighted_BetaBinom_fixdispersion_mix(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ - np.ones(2*len(idx_nonzero)).reshape(-1,1), \ - taus[i,s], \ - weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ - exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]), \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s] ) + if ( + np.sum(tmp[i, idx_nonzero]) + np.sum(tmp[i + n_states, idx_nonzero]) + >= 0.1 + ): + this_tp = (mapping_matrices[s].T @ tumor_prop[:, s])[ + idx_nonzero + ] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ] + assert np.all(this_tp < 1 + 1e-4) + model = Weighted_BetaBinom_fixdispersion_mix( + np.append( + unique_values[s][idx_nonzero, 0], + unique_values[s][idx_nonzero, 1] + - unique_values[s][idx_nonzero, 0], + ), + np.ones(2 * len(idx_nonzero)).reshape(-1, 1), + taus[i, s], + weights=np.append( + tmp[i, idx_nonzero], tmp[i + n_states, idx_nonzero] + ), + exposure=np.append( + unique_values[s][idx_nonzero, 1], + unique_values[s][idx_nonzero, 1], + ), + tumor_prop=this_tp, + ) + # tumor_prop=tumor_prop[s] ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array(start_p_binom[i, s]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array(start_p_binom[i, s]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) else: if not shared_BB_dispersion: for s in np.arange(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) + np.sum(tmp[i+n_states,idx_nonzero]) >= 0.1: - this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] - assert np.all(this_tp < 1+1e-4) - model = Weighted_BetaBinom_mix(np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), \ - np.ones(2*len(idx_nonzero)).reshape(-1,1), \ - weights=np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]), \ - exposure=np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]),\ - tumor_prop=this_tp) - # tumor_prop=tumor_prop ) + if ( + np.sum(tmp[i, idx_nonzero]) + + np.sum(tmp[i + n_states, idx_nonzero]) + >= 0.1 + ): + this_tp = (mapping_matrices[s].T @ tumor_prop[:, s])[ + idx_nonzero + ] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ] + assert np.all(this_tp < 1 + 1e-4) + model = Weighted_BetaBinom_mix( + np.append( + unique_values[s][idx_nonzero, 0], + unique_values[s][idx_nonzero, 1] + - unique_values[s][idx_nonzero, 0], + ), + np.ones(2 * len(idx_nonzero)).reshape(-1, 1), + weights=np.append( + tmp[i, idx_nonzero], tmp[i + n_states, idx_nonzero] + ), + exposure=np.append( + unique_values[s][idx_nonzero, 1], + unique_values[s][idx_nonzero, 
1], + ), + tumor_prop=this_tp, + ) + # tumor_prop=tumor_prop ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] new_taus[i, s] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_p_binom[i, s]], [taus[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_taus[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -759,48 +1199,98 @@ def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_mat state_posweights = [] tp = [] for s in np.arange(n_spots): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile( np.append(unique_values[s][idx_nonzero,1], unique_values[s][idx_nonzero,1]), n_states) - this_y = np.tile( np.append(unique_values[s][idx_nonzero,0], unique_values[s][idx_nonzero,1]-unique_values[s][idx_nonzero,0]), n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile( + np.append( + unique_values[s][idx_nonzero, 1], + unique_values[s][idx_nonzero, 1], + ), + n_states, + ) + this_y = np.tile( + np.append( + unique_values[s][idx_nonzero, 0], + unique_values[s][idx_nonzero, 1] + - unique_values[s][idx_nonzero, 0], + ), + n_states, + ) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_tp = np.tile( (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero], n_states) - assert np.all(this_tp < 1+1e-4) - this_weights = np.concatenate([ np.append(tmp[i,idx_nonzero], tmp[i+n_states,idx_nonzero]) for i in range(n_states) ]) - this_features = np.zeros((2*n_states*len(idx_nonzero), n_states)) + this_tp = np.tile( + (mapping_matrices[s].T @ tumor_prop[:, s])[idx_nonzero] + / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ], + n_states, + ) + assert np.all(this_tp < 1 + 1e-4) + this_weights = np.concatenate( + [ + np.append(tmp[i, idx_nonzero], tmp[i + n_states, idx_nonzero]) + for i in range(n_states) + ] + ) + this_features = np.zeros((2 * n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*2*len(idx_nonzero)):((i+1)*2*len(idx_nonzero)), i] = 1 + this_features[ + (i * 2 * len(idx_nonzero)) : ((i + 1) * 2 * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) - tp.append( this_tp[idx_row_posweight] ) + idx_state_posweight = 
np.array( + [ + i + for i in range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) + tp.append(this_tp[idx_row_posweight]) # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_BetaBinom_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp) + model = Weighted_BetaBinom_mix( + y, features, weights=weights, exposure=exposure, tumor_prop=tp + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] + new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_p_binom[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * taus[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob return new_p_binom, new_taus @@ -811,16 +1301,18 @@ def update_emission_params_bb_sitewise_uniqvalues_mix(unique_values, mapping_mat ############################################################ @njit def update_startprob_nophasing(lengths, log_gamma): - ''' + """ Input lengths: sum of lengths = n_observations. log_gamma: size n_states * n_observations. gamma[i,t] = P(q_t = i | O, lambda). Output log_startprob: n_states. Start probability after loog transformation. - ''' + """ n_states = log_gamma.shape[0] n_obs = log_gamma.shape[1] - assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the second dimension of log_gamma!" + assert ( + np.sum(lengths) == n_obs + ), "Sum of lengths must be equal to the second dimension of log_gamma!" 
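# [editor's note] Illustrative sketch, not part of this patch: the start-probability
# M-step carried out below, in plain numpy/scipy. Posteriors (gamma) at the first
# observation of each sequence are summed in log space and renormalized.
# demo_* names are hypothetical.
import numpy as np
import scipy.special

def demo_update_startprob(lengths, log_gamma):
    # first index of each sequence, given per-sequence lengths
    starts = np.concatenate(([0], np.cumsum(lengths)[:-1]))
    log_startprob = scipy.special.logsumexp(log_gamma[:, starts], axis=1)
    # normalize so that start probabilities sum to 1
    return log_startprob - scipy.special.logsumexp(log_startprob)

demo_log_gamma = np.log(np.full((3, 6), 1.0 / 3))   # uniform posteriors, 2 sequences of length 3
print(np.exp(demo_update_startprob(np.array([3, 3]), demo_log_gamma)))   # ~[0.333 0.333 0.333]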
# indices of the start of sequences, given that the length of each sequence is in lengths cumlen = 0 indices_start = [] @@ -838,19 +1330,19 @@ def update_startprob_nophasing(lengths, log_gamma): def update_transition_nophasing(log_xi, is_diag=False): - ''' + """ Input log_xi: size (n_states) * (n_states) * n_observations. xi[i,j,t] = P(q_t=i, q_{t+1}=j | O, lambda) Output log_transmat: n_states * n_states. Transition probability after log transformation. - ''' + """ n_states = log_xi.shape[0] n_obs = log_xi.shape[2] # initialize log_transmat log_transmat = np.zeros((n_states, n_states)) for i in np.arange(n_states): for j in np.arange(n_states): - log_transmat[i, j] = scipy.special.logsumexp( log_xi[i, j, :] ) + log_transmat[i, j] = scipy.special.logsumexp(log_xi[i, j, :]) # row normalize log_transmat if not is_diag: for i in np.arange(n_states): @@ -860,14 +1352,23 @@ def update_transition_nophasing(log_xi, is_diag=False): diagsum = scipy.special.logsumexp(np.diag(log_transmat)) totalsum = scipy.special.logsumexp(log_transmat) t = diagsum - totalsum - rest = np.log( (1 - np.exp(t)) / (n_states-1) ) + rest = np.log((1 - np.exp(t)) / (n_states - 1)) log_transmat = np.ones(log_transmat.shape) * rest np.fill_diagonal(log_transmat, t) return log_transmat -def update_emission_params_nb_nophasing_uniqvalues(unique_values, mapping_matrices, log_gamma, alphas, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): +def update_emission_params_nb_nophasing_uniqvalues( + unique_values, + mapping_matrices, + log_gamma, + alphas, + start_log_mu=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + min_log_rdr=-2, + max_log_rdr=2, +): """ Attributes ---------- @@ -884,41 +1385,79 @@ def update_emission_params_nb_nophasing_uniqvalues(unique_values, mapping_matric n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) # initialization - new_log_mu = copy.copy(start_log_mu) if not start_log_mu is None else np.zeros((n_states, n_spots)) + new_log_mu = ( + copy.copy(start_log_mu) + if not start_log_mu is None + else np.zeros((n_states, n_spots)) + ) new_alphas = copy.copy(alphas) # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - model = sm.GLM(unique_values[s][idx_nonzero,0], np.ones(len(idx_nonzero)).reshape(-1,1), \ - family=sm.families.NegativeBinomial(alpha=alphas[i,s]), \ - exposure=unique_values[s][idx_nonzero,1], var_weights=tmp[i,idx_nonzero]) + model = sm.GLM( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + family=sm.families.NegativeBinomial(alpha=alphas[i, s]), + exposure=unique_values[s][idx_nonzero, 1], + var_weights=tmp[i, idx_nonzero], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array([start_log_mu[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if -model.loglike(res.params) < -model.loglike(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array([start_log_mu[i, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if -model.loglike(res.params) < -model.loglike(res2.params) + else res2.params[0] 
+ ) else: if not shared_NB_dispersion: for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - model = Weighted_NegativeBinomial(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1], \ - penalty=0) + model = Weighted_NegativeBinomial( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=tmp[i, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + penalty=0, + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] new_alphas[i, s] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_log_mu[i, s]], [alphas[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_alphas[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -926,50 +1465,88 @@ def update_emission_params_nb_nophasing_uniqvalues(unique_values, mapping_matric features = [] state_posweights = [] for s in range(n_spots): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile(unique_values[s][idx_nonzero,1], n_states) - this_y = np.tile(unique_values[s][idx_nonzero,0], n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) + this_y = np.tile(unique_values[s][idx_nonzero, 0], n_states) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_weights = np.concatenate([ tmp[i,idx_nonzero] for i in range(n_states) ]) - this_features = np.zeros((n_states*len(idx_nonzero), n_states)) + this_weights = np.concatenate( + [tmp[i, idx_nonzero] for i in range(n_states)] + ) + this_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 + this_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) + idx_state_posweight = np.array( + [ + i + for i in range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in 
idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - model = Weighted_NegativeBinomial(y, features, weights=weights, exposure=exposure) + model = Weighted_NegativeBinomial( + y, features, weights=weights, exposure=exposure + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] + new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_log_mu[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * alphas[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas -def update_emission_params_nb_nophasing_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, alphas, tumor_prop, \ - start_log_mu=None, fix_NB_dispersion=False, shared_NB_dispersion=False, min_log_rdr=-2, max_log_rdr=2): +def update_emission_params_nb_nophasing_uniqvalues_mix( + unique_values, + mapping_matrices, + log_gamma, + alphas, + tumor_prop, + start_log_mu=None, + fix_NB_dispersion=False, + shared_NB_dispersion=False, + min_log_rdr=-2, + max_log_rdr=2, +): """ Attributes ---------- @@ -986,42 +1563,85 @@ def update_emission_params_nb_nophasing_uniqvalues_mix(unique_values, mapping_ma n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) # initialization - new_log_mu = copy.copy(start_log_mu) if not start_log_mu is None else np.zeros((n_states, n_spots)) + new_log_mu = ( + copy.copy(start_log_mu) + if not start_log_mu is None + else np.zeros((n_states, n_spots)) + ) new_alphas = copy.copy(alphas) # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = 
np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - model = sm.GLM(unique_values[s][idx_nonzero,0], np.ones(len(idx_nonzero)).reshape(-1,1), \ - family=sm.families.NegativeBinomial(alpha=alphas[i,s]), \ - exposure=unique_values[s][idx_nonzero,1], var_weights=tmp[i,idx_nonzero]) + model = sm.GLM( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + family=sm.families.NegativeBinomial(alpha=alphas[i, s]), + exposure=unique_values[s][idx_nonzero, 1], + var_weights=tmp[i, idx_nonzero], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array([start_log_mu[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if -model.loglike(res.params) < -model.loglike(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array([start_log_mu[i, s]]), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if -model.loglike(res.params) < -model.loglike(res2.params) + else res2.params[0] + ) else: if not shared_NB_dispersion: for s in range(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): - this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] - model = Weighted_NegativeBinomial_mix(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=tmp[i,idx_nonzero], exposure=unique_values[s][idx_nonzero,1], \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s], penalty=0) + this_tp = (mapping_matrices[s].T @ tumor_prop[:, s])[ + idx_nonzero + ] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ] + model = Weighted_NegativeBinomial_mix( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=tmp[i, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + tumor_prop=this_tp, + ) + # tumor_prop=tumor_prop[s], penalty=0) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_log_mu[i, s] = res.params[0] new_alphas[i, s] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_log_mu[i, s]], [alphas[i, s]]), xtol=1e-4, ftol=1e-4) - new_log_mu[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_alphas[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_log_mu[i, s]], [alphas[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_log_mu[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_alphas[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -1030,56 +1650,104 @@ def update_emission_params_nb_nophasing_uniqvalues_mix(unique_values, mapping_ma state_posweights = [] tp = [] for s in range(n_spots): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile(unique_values[s][idx_nonzero,1], n_states) - this_y = 
np.tile(unique_values[s][idx_nonzero,0], n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) + this_y = np.tile(unique_values[s][idx_nonzero, 0], n_states) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_tp = np.tile( (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero], n_states) + this_tp = np.tile( + (mapping_matrices[s].T @ tumor_prop[:, s])[idx_nonzero] + / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ], + n_states, + ) assert np.all(this_tp < 1 + 1e-4) - this_weights = np.concatenate([ tmp[i,idx_nonzero] for i in range(n_states) ]) - this_features = np.zeros((n_states*len(idx_nonzero), n_states)) + this_weights = np.concatenate( + [tmp[i, idx_nonzero] for i in range(n_states)] + ) + this_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 + this_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) - tp.append( this_tp[idx_row_posweight] ) + idx_state_posweight = np.array( + [ + i + for i in range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) + tp.append(this_tp[idx_row_posweight]) # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_NegativeBinomial_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp, penalty=0) + model = Weighted_NegativeBinomial_mix( + y, + features, + weights=weights, + exposure=exposure, + tumor_prop=tp, + penalty=0, + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_alphas[:,:] = res.params[-1] + new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_log_mu[idx_state_posweight,s] for 
s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * alphas[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_log_mu[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * alphas[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_alphas[:,:] = res2.params[-1] + new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr return new_log_mu, new_alphas -def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matrices, log_gamma, taus, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): +def update_emission_params_bb_nophasing_uniqvalues( + unique_values, + mapping_matrices, + log_gamma, + taus, + start_p_binom=None, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + percent_threshold=0.99, + min_binom_prob=0.01, + max_binom_prob=0.99, +): """ Attributes ---------- @@ -1096,44 +1764,81 @@ def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matric n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) # initialization - new_p_binom = copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 + new_p_binom = ( + copy.copy(start_p_binom) + if not start_p_binom is None + else np.ones((n_states, n_spots)) * 0.5 + ) new_taus = copy.copy(taus) if fix_BB_dispersion: for s in np.arange(len(unique_values)): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) >= 0.1: - model = Weighted_BetaBinom_fixdispersion(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - taus[i,s], \ - weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1] ) + if np.sum(tmp[i, idx_nonzero]) >= 0.1: + model = Weighted_BetaBinom_fixdispersion( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + taus[i, s], + weights=tmp[i, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array(start_p_binom[i, s]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array(start_p_binom[i, s]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) 
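# A standalone sketch of the weighted objective the Weighted_BetaBinom fits below are
# assumed to minimize: each row's Beta-Binomial is parameterized as
# (a, b) = (p * tau, (1 - p) * tau) with p = exog @ params[:-1] and tau = params[-1],
# and the posterior state probabilities act as observation weights. This is an
# illustration under those assumptions, not the library code itself.
#
#   import numpy as np
#   from scipy.stats import betabinom
#
#   def weighted_betabinom_nll(params, endog, exog, exposure, weights):
#       p = exog @ params[:-1]              # per-observation BAF
#       tau = params[-1]                    # dispersion
#       a, b = p * tau, (1.0 - p) * tau
#       return -np.sum(weights * betabinom.logpmf(endog, exposure, a, b))
#
# The *_mix variants in this patch additionally pass a per-observation tumor_prop;
# consistent with how tumor_prop is used here, the effective BAF is assumed to be
# p_eff = tumor_prop * p + 0.5 * (1 - tumor_prop), i.e. normal cells pull BAF toward 0.5.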
else: if not shared_BB_dispersion: for s in np.arange(len(unique_values)): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) >= 0.1: - model = Weighted_BetaBinom(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1] ) + if np.sum(tmp[i, idx_nonzero]) >= 0.1: + model = Weighted_BetaBinom( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=tmp[i, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] new_taus[i, s] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_p_binom[i, s]], [taus[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_taus[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -1141,52 +1846,88 @@ def update_emission_params_bb_nophasing_uniqvalues(unique_values, mapping_matric features = [] state_posweights = [] for s in np.arange(len(unique_values)): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile( unique_values[s][idx_nonzero,1], n_states) - this_y = np.tile( unique_values[s][idx_nonzero,0], n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) + this_y = np.tile(unique_values[s][idx_nonzero, 0], n_states) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_weights = np.concatenate([ tmp[i,idx_nonzero] for i in range(n_states) ]) - this_features = np.zeros((n_states*len(idx_nonzero), n_states)) + this_weights = np.concatenate( + [tmp[i, idx_nonzero] for i in range(n_states)] + ) + this_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 + this_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) + idx_state_posweight = np.array( + [ + i + for i in 
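# A toy sketch of the block-diagonal design built by the shared-dispersion branch
# below: observations are replicated once per state, an indicator feature matrix
# selects one p_binom coefficient per state, and a single trailing parameter (the
# shared tau) applies to every row. Shapes here are illustrative only.
#
#   import numpy as np
#   import scipy.linalg
#
#   n_states, n_snps = 3, 4
#   features = scipy.linalg.block_diag(*[np.ones((n_snps, 1)) for _ in range(n_states)])
#   features.shape   # (12, 3): row block i has a 1 in column i only, so
#                    # params[:-1] are per-state BAFs and params[-1] is the shared tau.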
range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] + new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_p_binom[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * taus[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob return new_p_binom, new_taus -def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_matrices, log_gamma, taus, tumor_prop, \ - start_p_binom=None, fix_BB_dispersion=False, shared_BB_dispersion=False, \ - percent_threshold=0.99, min_binom_prob=0.01, max_binom_prob=0.99): +def update_emission_params_bb_nophasing_uniqvalues_mix( + unique_values, + mapping_matrices, + log_gamma, + taus, + tumor_prop, + start_p_binom=None, + fix_BB_dispersion=False, + shared_BB_dispersion=False, + percent_threshold=0.99, + min_binom_prob=0.01, + max_binom_prob=0.99, +): """ Attributes ---------- @@ -1203,52 +1944,97 @@ def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_ma n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) # initialization - new_p_binom = copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 + new_p_binom = ( + copy.copy(start_p_binom) + if not start_p_binom is None + else np.ones((n_states, n_spots)) * 0.5 + ) new_taus = copy.copy(taus) if fix_BB_dispersion: for s in 
np.arange(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) >= 0.1: - this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] + if np.sum(tmp[i, idx_nonzero]) >= 0.1: + this_tp = (mapping_matrices[s].T @ tumor_prop[:, s])[ + idx_nonzero + ] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ] assert np.all(this_tp < 1 + 1e-4) - model = Weighted_BetaBinom_fixdispersion_mix(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - taus[i,s], \ - weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1], \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s] ) + model = Weighted_BetaBinom_fixdispersion_mix( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + taus[i, s], + weights=tmp[i, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + tumor_prop=this_tp, + ) + # tumor_prop=tumor_prop[s] ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.array(start_p_binom[i, s]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.array(start_p_binom[i, s]), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) else: if not shared_BB_dispersion: for s in np.arange(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] for i in range(n_states): # only optimize for BAF only when the posterior probability >= 0.1 (at least 1 SNP is under this state) - if np.sum(tmp[i,idx_nonzero]) >= 0.1: - this_tp = (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero] + if np.sum(tmp[i, idx_nonzero]) >= 0.1: + this_tp = (mapping_matrices[s].T @ tumor_prop[:, s])[ + idx_nonzero + ] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ] assert np.all(this_tp < 1 + 1e-4) - model = Weighted_BetaBinom_mix(unique_values[s][idx_nonzero,0], \ - np.ones(len(idx_nonzero)).reshape(-1,1), \ - weights=tmp[i,idx_nonzero], \ - exposure=unique_values[s][idx_nonzero,1], \ - tumor_prop=this_tp) - # tumor_prop=tumor_prop[s] ) + model = Weighted_BetaBinom_mix( + unique_values[s][idx_nonzero, 0], + np.ones(len(idx_nonzero)).reshape(-1, 1), + weights=tmp[i, idx_nonzero], + exposure=unique_values[s][idx_nonzero, 1], + tumor_prop=this_tp, + ) + # tumor_prop=tumor_prop[s] ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) new_p_binom[i, s] = res.params[0] new_taus[i, s] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.append([start_p_binom[i, s]], [taus[i, s]]), xtol=1e-4, ftol=1e-4) - new_p_binom[i, s] = res.params[0] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[0] - 
new_taus[i, s] = res.params[-1] if model.nloglikeobs(res.params) < model.nloglikeobs(res2.params) else res2.params[-1] + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.append( + [start_p_binom[i, s]], [taus[i, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) + new_p_binom[i, s] = ( + res.params[0] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[0] + ) + new_taus[i, s] = ( + res.params[-1] + if model.nloglikeobs(res.params) + < model.nloglikeobs(res2.params) + else res2.params[-1] + ) else: exposure = [] y = [] @@ -1257,49 +2043,82 @@ def update_emission_params_bb_nophasing_uniqvalues_mix(unique_values, mapping_ma state_posweights = [] tp = [] for s in np.arange(n_spots): - idx_nonzero = np.where(unique_values[s][:,1] > 0)[0] - this_exposure = np.tile( unique_values[s][idx_nonzero,1], n_states) - this_y = np.tile( unique_values[s][idx_nonzero,0], n_states) + idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] + this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) + this_y = np.tile(unique_values[s][idx_nonzero, 0], n_states) tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A - this_tp = np.tile( (mapping_matrices[s].T @ tumor_prop[:,s])[idx_nonzero] / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[idx_nonzero], n_states) + this_tp = np.tile( + (mapping_matrices[s].T @ tumor_prop[:, s])[idx_nonzero] + / (mapping_matrices[s].T @ np.ones(tumor_prop.shape[0]))[ + idx_nonzero + ], + n_states, + ) assert np.all(this_tp < 1 + 1e-4) - this_weights = np.concatenate([ tmp[i,idx_nonzero] for i in range(n_states) ]) - this_features = np.zeros((n_states*len(idx_nonzero), n_states)) + this_weights = np.concatenate( + [tmp[i, idx_nonzero] for i in range(n_states)] + ) + this_features = np.zeros((n_states * len(idx_nonzero), n_states)) for i in np.arange(n_states): - this_features[(i*len(idx_nonzero)):((i+1)*len(idx_nonzero)), i] = 1 + this_features[ + (i * len(idx_nonzero)) : ((i + 1) * len(idx_nonzero)), i + ] = 1 # only optimize for states where at least 1 SNP belongs to - idx_state_posweight = np.array([ i for i in range(this_features.shape[1]) if np.sum(this_weights[this_features[:,i]==1]) >= 0.1 ]) - idx_row_posweight = np.concatenate([ np.where(this_features[:,k]==1)[0] for k in idx_state_posweight ]) - y.append( this_y[idx_row_posweight] ) - exposure.append( this_exposure[idx_row_posweight] ) - weights.append( this_weights[idx_row_posweight] ) - features.append( this_features[idx_row_posweight, :][:, idx_state_posweight] ) - state_posweights.append( idx_state_posweight ) - tp.append( this_tp[idx_row_posweight] ) + idx_state_posweight = np.array( + [ + i + for i in range(this_features.shape[1]) + if np.sum(this_weights[this_features[:, i] == 1]) >= 0.1 + ] + ) + idx_row_posweight = np.concatenate( + [np.where(this_features[:, k] == 1)[0] for k in idx_state_posweight] + ) + y.append(this_y[idx_row_posweight]) + exposure.append(this_exposure[idx_row_posweight]) + weights.append(this_weights[idx_row_posweight]) + features.append( + this_features[idx_row_posweight, :][:, idx_state_posweight] + ) + state_posweights.append(idx_state_posweight) + tp.append(this_tp[idx_row_posweight]) # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) - model = Weighted_BetaBinom_mix(y, features, weights=weights, exposure=exposure, tumor_prop=tp) + model = 
Weighted_BetaBinom_mix( + y, features, weights=weights, exposure=exposure, tumor_prop=tp + ) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] if res.params[-1] > 0: - new_taus[:,:] = res.params[-1] + new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - res2 = model.fit(disp=0, maxiter=1500, start_params=np.concatenate([start_p_binom[idx_state_posweight,s] for s,idx_state_posweight in enumerate(state_posweights)] + [np.ones(1) * taus[0,s]]), xtol=1e-4, ftol=1e-4) + res2 = model.fit( + disp=0, + maxiter=1500, + start_params=np.concatenate( + [ + start_p_binom[idx_state_posweight, s] + for s, idx_state_posweight in enumerate(state_posweights) + ] + + [np.ones(1) * taus[0, s]] + ), + xtol=1e-4, + ftol=1e-4, + ) if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): - for s,idx_state_posweight in enumerate(state_posweights): - l1 = int( np.sum([len(x) for x in state_posweights[:s]]) ) - l2 = int( np.sum([len(x) for x in state_posweights[:(s+1)]]) ) + for s, idx_state_posweight in enumerate(state_posweights): + l1 = int(np.sum([len(x) for x in state_posweights[:s]])) + l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] if res2.params[-1] > 0: - new_taus[:,:] = res2.params[-1] + new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob return new_p_binom, new_taus - diff --git a/src/calicost/utils_hmrf.py b/src/calicost/utils_hmrf.py index bee9f42..13c6830 100644 --- a/src/calicost/utils_hmrf.py +++ b/src/calicost/utils_hmrf.py @@ -13,13 +13,15 @@ def compute_adjacency_mat(coords, unit_xsquared=9, unit_ysquared=3): # pairwise distance - x_dist = coords[:,0][None,:] - coords[:,0][:,None] - y_dist = coords[:,1][None,:] - coords[:,1][:,None] + x_dist = coords[:, 0][None, :] - coords[:, 0][:, None] + y_dist = coords[:, 1][None, :] - coords[:, 1][:, None] pairwise_squared_dist = x_dist**2 * unit_xsquared + y_dist**2 * unit_ysquared # adjacency - A = np.zeros( (coords.shape[0], coords.shape[0]), dtype=np.int8 ) + A = np.zeros((coords.shape[0], coords.shape[0]), dtype=np.int8) for i in range(coords.shape[0]): - indexes = np.where(pairwise_squared_dist[i,:] <= unit_xsquared + unit_ysquared)[0] + indexes = np.where( + pairwise_squared_dist[i, :] <= unit_xsquared + unit_ysquared + )[0] indexes = np.array([j for j in indexes if j != i]) if len(indexes) > 0: A[i, indexes] = 1 @@ -29,13 +31,15 @@ def compute_adjacency_mat(coords, unit_xsquared=9, unit_ysquared=3): def compute_adjacency_mat_v2(coords, unit_xsquared=9, unit_ysquared=3, ratio=1): # pairwise distance - x_dist = coords[:,0][None,:] - coords[:,0][:,None] - y_dist = coords[:,1][None,:] - coords[:,1][:,None] + x_dist = coords[:, 0][None, :] - coords[:, 0][:, None] + y_dist = coords[:, 1][None, :] - coords[:, 1][:, None] pairwise_squared_dist = x_dist**2 * unit_xsquared + y_dist**2 * unit_ysquared # adjacency - A = np.zeros( (coords.shape[0], coords.shape[0]), dtype=np.int8 ) + A = np.zeros((coords.shape[0], coords.shape[0]), 
dtype=np.int8) for i in range(coords.shape[0]): - indexes = np.where(pairwise_squared_dist[i,:] <= ratio * (unit_xsquared + unit_ysquared))[0] + indexes = np.where( + pairwise_squared_dist[i, :] <= ratio * (unit_xsquared + unit_ysquared) + )[0] indexes = np.array([j for j in indexes if j != i]) if len(indexes) > 0: A[i, indexes] = 1 @@ -43,44 +47,60 @@ def compute_adjacency_mat_v2(coords, unit_xsquared=9, unit_ysquared=3, ratio=1): return A -def compute_weighted_adjacency(coords, unit_xsquared=9, unit_ysquared=3, bandwidth=12, decay=5): +def compute_weighted_adjacency( + coords, unit_xsquared=9, unit_ysquared=3, bandwidth=12, decay=5 +): # pairwise distance - x_dist = coords[:,0][None,:] - coords[:,0][:,None] - y_dist = coords[:,1][None,:] - coords[:,1][:,None] + x_dist = coords[:, 0][None, :] - coords[:, 0][:, None] + y_dist = coords[:, 1][None, :] - coords[:, 1][:, None] pairwise_squared_dist = x_dist**2 * unit_xsquared + y_dist**2 * unit_ysquared - kern = np.exp(-(pairwise_squared_dist / bandwidth)**decay) + kern = np.exp(-((pairwise_squared_dist / bandwidth) ** decay)) # adjacency - A = np.zeros( (coords.shape[0], coords.shape[0]) ) + A = np.zeros((coords.shape[0], coords.shape[0])) for i in range(coords.shape[0]): - indexes = np.where(kern[i,:] > 1e-4)[0] + indexes = np.where(kern[i, :] > 1e-4)[0] indexes = np.array([j for j in indexes if j != i]) if len(indexes) > 0: - A[i, indexes] = kern[i,indexes] + A[i, indexes] = kern[i, indexes] A = scipy.sparse.csr_matrix(A) return A -def choose_adjacency_by_readcounts(coords, single_total_bb_RD, maxspots_pooling=7, unit_xsquared=9, unit_ysquared=3): -# def choose_adjacency_by_readcounts(coords, single_total_bb_RD, count_threshold=4000, unit_xsquared=9, unit_ysquared=3): +def choose_adjacency_by_readcounts( + coords, single_total_bb_RD, maxspots_pooling=7, unit_xsquared=9, unit_ysquared=3 +): + # def choose_adjacency_by_readcounts(coords, single_total_bb_RD, count_threshold=4000, unit_xsquared=9, unit_ysquared=3): # XXX: change from count_threshold 500 to 3000 # pairwise distance - x_dist = coords[:,0][None,:] - coords[:,0][:,None] - y_dist = coords[:,1][None,:] - coords[:,1][:,None] + x_dist = coords[:, 0][None, :] - coords[:, 0][:, None] + y_dist = coords[:, 1][None, :] - coords[:, 1][:, None] tmp_pairwise_squared_dist = x_dist**2 * unit_xsquared + y_dist**2 * unit_ysquared np.fill_diagonal(tmp_pairwise_squared_dist, np.max(tmp_pairwise_squared_dist)) - base_ratio = np.median(np.min(tmp_pairwise_squared_dist, axis=0)) / (unit_xsquared + unit_ysquared) + base_ratio = np.median(np.min(tmp_pairwise_squared_dist, axis=0)) / ( + unit_xsquared + unit_ysquared + ) s_ratio = 0 for ratio in range(0, 10): - smooth_mat = compute_adjacency_mat_v2(coords, unit_xsquared, unit_ysquared, ratio * base_ratio) + smooth_mat = compute_adjacency_mat_v2( + coords, unit_xsquared, unit_ysquared, ratio * base_ratio + ) smooth_mat.setdiag(1) if np.median(np.sum(smooth_mat > 0, axis=0).A.flatten()) > maxspots_pooling: s_ratio = ratio - 1 break s_ratio = ratio - smooth_mat = compute_adjacency_mat_v2(coords, unit_xsquared, unit_ysquared, s_ratio * base_ratio) + smooth_mat = compute_adjacency_mat_v2( + coords, unit_xsquared, unit_ysquared, s_ratio * base_ratio + ) smooth_mat.setdiag(1) - for bandwidth in np.arange(unit_xsquared + unit_ysquared, 15*(unit_xsquared + unit_ysquared), unit_xsquared + unit_ysquared): - adjacency_mat = compute_weighted_adjacency(coords, unit_xsquared, unit_ysquared, bandwidth=bandwidth) + for bandwidth in np.arange( + unit_xsquared + 
unit_ysquared, + 15 * (unit_xsquared + unit_ysquared), + unit_xsquared + unit_ysquared, + ): + adjacency_mat = compute_weighted_adjacency( + coords, unit_xsquared, unit_ysquared, bandwidth=bandwidth + ) adjacency_mat.setdiag(1) adjacency_mat = adjacency_mat - smooth_mat adjacency_mat[adjacency_mat < 0] = 0 @@ -93,7 +113,7 @@ def choose_adjacency_by_readcounts(coords, single_total_bb_RD, maxspots_pooling= def choose_adjacency_by_KNN(coords, exp_counts=None, w=1, maxspots_pooling=7): """ Compute adjacency matrix for pooling and for HMRF by KNN of pairwise spatial distance + pairwise expression distance. - + Attributes ---------- coords : array, shape (n_spots, 2) @@ -111,32 +131,48 @@ def choose_adjacency_by_KNN(coords, exp_counts=None, w=1, maxspots_pooling=7): n_spots = coords.shape[0] # pairwise expression distance if exp_counts is not None - pair_exp_dist = scipy.sparse.csr_matrix( np.zeros((n_spots,n_spots)) ) + pair_exp_dist = scipy.sparse.csr_matrix(np.zeros((n_spots, n_spots))) scaling_factor = 1 if not exp_counts is None: - adata = anndata.AnnData( pd.DataFrame(exp_counts) ) - sc.pp.normalize_total(adata, target_sum=np.median(np.sum(exp_counts.values,axis=1)) ) + adata = anndata.AnnData(pd.DataFrame(exp_counts)) + sc.pp.normalize_total( + adata, target_sum=np.median(np.sum(exp_counts.values, axis=1)) + ) sc.pp.log1p(adata) sc.tl.pca(adata) - pair_exp_dist = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(adata.obsm["X_pca"])) + pair_exp_dist = scipy.spatial.distance.squareform( + scipy.spatial.distance.pdist(adata.obsm["X_pca"]) + ) # compute the scaling factor to normalize coords such that it has the same sum of variance as PCA var_coord = np.sum(np.var(coords, axis=0)) var_pca = np.sum(np.var(adata.obsm["X_pca"], axis=0)) EPS = 1e-4 - scaling_factor = np.sqrt(var_coord / var_pca) if var_coord > EPS and var_pca > EPS else 1 + scaling_factor = ( + np.sqrt(var_coord / var_pca) if var_coord > EPS and var_pca > EPS else 1 + ) # pairwise spatial distance - pair_spatial_dist = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(coords / scaling_factor)) + pair_spatial_dist = scipy.spatial.distance.squareform( + scipy.spatial.distance.pdist(coords / scaling_factor) + ) # adjacency for pooling - smooth_mat = NearestNeighbors(n_neighbors=maxspots_pooling, metric='precomputed').fit(w * pair_spatial_dist + (1-w) * pair_exp_dist).kneighbors_graph() - smooth_mat.setdiag(1) # include self adjacency + smooth_mat = ( + NearestNeighbors(n_neighbors=maxspots_pooling, metric="precomputed") + .fit(w * pair_spatial_dist + (1 - w) * pair_exp_dist) + .kneighbors_graph() + ) + smooth_mat.setdiag(1) # include self adjacency # adjacency for HMRF - adjacency_mat = NearestNeighbors(n_neighbors=maxspots_pooling + 6, metric='precomputed').fit(w * pair_spatial_dist + (1-w) * pair_exp_dist).kneighbors_graph() + adjacency_mat = ( + NearestNeighbors(n_neighbors=maxspots_pooling + 6, metric="precomputed") + .fit(w * pair_spatial_dist + (1 - w) * pair_exp_dist) + .kneighbors_graph() + ) adjacency_mat = adjacency_mat - smooth_mat adjacency_mat[adjacency_mat < 0] = 0 - adjacency_mat.setdiag(1) # include self adjacency + adjacency_mat.setdiag(1) # include self adjacency return smooth_mat, adjacency_mat @@ -150,34 +186,53 @@ def choose_adjacency_by_readcounts_slidedna(coords, maxspots_pooling=30): return smooth_mat, adjacency_mat -def multislice_adjacency(sample_ids, sample_list, coords, single_total_bb_RD, exp_counts, across_slice_adjacency_mat, construct_adjacency_method, maxspots_pooling, 
construct_adjacency_w): +def multislice_adjacency( + sample_ids, + sample_list, + coords, + single_total_bb_RD, + exp_counts, + across_slice_adjacency_mat, + construct_adjacency_method, + maxspots_pooling, + construct_adjacency_w, +): adjacency_mat = [] smooth_mat = [] - for i,sname in enumerate(sample_list): + for i, sname in enumerate(sample_list): index = np.where(sample_ids == i)[0] - this_coords = np.array(coords[index,:]) + this_coords = np.array(coords[index, :]) if construct_adjacency_method == "hexagon": - tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_readcounts(this_coords, single_total_bb_RD[:,index], maxspots_pooling=maxspots_pooling) + tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_readcounts( + this_coords, + single_total_bb_RD[:, index], + maxspots_pooling=maxspots_pooling, + ) elif construct_adjacency_method == "KNN": - tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_KNN(this_coords, exp_counts.iloc[index,:], w=construct_adjacency_w, maxspots_pooling=maxspots_pooling) + tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_KNN( + this_coords, + exp_counts.iloc[index, :], + w=construct_adjacency_w, + maxspots_pooling=maxspots_pooling, + ) else: - raise("Unknown adjacency construction method") + raise ("Unknown adjacency construction method") # tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_readcounts_slidedna(this_coords, maxspots_pooling=config["maxspots_pooling"]) - adjacency_mat.append( tmpadjacency_mat.A ) - smooth_mat.append( tmpsmooth_mat.A ) + adjacency_mat.append(tmpadjacency_mat.A) + smooth_mat.append(tmpsmooth_mat.A) adjacency_mat = scipy.linalg.block_diag(*adjacency_mat) - adjacency_mat = scipy.sparse.csr_matrix( adjacency_mat ) + adjacency_mat = scipy.sparse.csr_matrix(adjacency_mat) if not across_slice_adjacency_mat is None: adjacency_mat += across_slice_adjacency_mat smooth_mat = scipy.linalg.block_diag(*smooth_mat) - smooth_mat = scipy.sparse.csr_matrix( smooth_mat ) + smooth_mat = scipy.sparse.csr_matrix(smooth_mat) return adjacency_mat, smooth_mat def rectangle_initialize_initial_clone(coords, n_clones, random_state=0): """ Initialize clone assignment by partition space into p * p blocks (s.t. p * p >= n_clones), and assign each block a clone id. 
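    A hypothetical call under assumed shapes, returning one spot-index array per
    clone (the function name comes from this file; the coordinates are made up):

        >>> import numpy as np
        >>> coords = np.random.rand(100, 2) * 50
        >>> initial_clone_index = rectangle_initialize_initial_clone(coords, n_clones=3)
        >>> len(initial_clone_index)
        3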
- + Attributes ---------- coords : array, shape (n_spots, 2) @@ -194,18 +249,18 @@ def rectangle_initialize_initial_clone(coords, n_clones, random_state=0): np.random.seed(random_state) p = int(np.ceil(np.sqrt(n_clones))) # partition the range of x and y axes - px = np.random.dirichlet( np.ones(p) * 10 ) + px = np.random.dirichlet(np.ones(p) * 10) px[-1] += 1e-4 - xrange = [np.percentile(coords[:,0], 5), np.percentile(coords[:,0], 95)] + xrange = [np.percentile(coords[:, 0], 5), np.percentile(coords[:, 0], 95)] xboundary = xrange[0] + (xrange[1] - xrange[0]) * np.cumsum(px) - xboundary[-1] = np.max(coords[:,0]) + 1 - xdigit = np.digitize(coords[:,0], xboundary, right=True) - py = np.random.dirichlet( np.ones(p) * 10 ) + xboundary[-1] = np.max(coords[:, 0]) + 1 + xdigit = np.digitize(coords[:, 0], xboundary, right=True) + py = np.random.dirichlet(np.ones(p) * 10) py[-1] += 1e-4 - yrange = [np.percentile(coords[:,1], 5), np.percentile(coords[:,1], 95)] + yrange = [np.percentile(coords[:, 1], 5), np.percentile(coords[:, 1], 95)] yboundary = yrange[0] + (yrange[1] - yrange[0]) * np.cumsum(py) - yboundary[-1] = np.max(coords[:,1]) + 1 - ydigit = np.digitize(coords[:,1], yboundary, right=True) + yboundary[-1] = np.max(coords[:, 1]) + 1 + ydigit = np.digitize(coords[:, 1], yboundary, right=True) block_id = xdigit * p + ydigit # assigning blocks to clone (note that if sqrt(n_clone) is not an integer, multiple blocks can be assigneed to one clone) # block_clone_map = np.random.randint(low=0, high=n_clones, size=p**2) @@ -220,109 +275,137 @@ def rectangle_initialize_initial_clone(coords, n_clones, random_state=0): block_clone_map = np.random.randint(low=0, high=n_clones, size=p**2) while len(np.unique(block_clone_map)) < n_clones: bc = np.bincount(block_clone_map, minlength=n_clones) - assert np.any(bc==0) - block_clone_map[np.where(block_clone_map==np.argmax(bc))[0][0]] = np.where(bc==0)[0][0] - block_clone_map = {i:block_clone_map[i] for i in range(len(block_clone_map))} + assert np.any(bc == 0) + block_clone_map[np.where(block_clone_map == np.argmax(bc))[0][0]] = ( + np.where(bc == 0)[0][0] + ) + block_clone_map = {i: block_clone_map[i] for i in range(len(block_clone_map))} clone_id = np.array([block_clone_map[i] for i in block_id]) initial_clone_index = [np.where(clone_id == i)[0] for i in range(n_clones)] - if np.min([len(x) for x in initial_clone_index]) > 0.2 * coords.shape[0] / n_clones: + if ( + np.min([len(x) for x in initial_clone_index]) + > 0.2 * coords.shape[0] / n_clones + ): break return initial_clone_index def fixed_rectangle_initialization(coords, x_part, y_part): # - px = np.linspace(0, 1, x_part+1) + px = np.linspace(0, 1, x_part + 1) px[-1] += 0.01 px = px[1:] - xrange = [np.min(coords[:,0]), np.max(coords[:,0])] - xdigit = np.digitize(coords[:,0], xrange[0] + (xrange[1] - xrange[0]) * px, right=True) + xrange = [np.min(coords[:, 0]), np.max(coords[:, 0])] + xdigit = np.digitize( + coords[:, 0], xrange[0] + (xrange[1] - xrange[0]) * px, right=True + ) # - py = np.linspace(0, 1, y_part+1) + py = np.linspace(0, 1, y_part + 1) py[-1] += 0.01 py = py[1:] - yrange = [np.min(coords[:,1]), np.max(coords[:,1])] - ydigit = np.digitize(coords[:,1], yrange[0] + (yrange[1] - yrange[0]) * py, right=True) + yrange = [np.min(coords[:, 1]), np.max(coords[:, 1])] + ydigit = np.digitize( + coords[:, 1], yrange[0] + (yrange[1] - yrange[0]) * py, right=True + ) # initial_clone_index = [] for xid in range(x_part): for yid in range(y_part): - initial_clone_index.append( np.where((xdigit == xid) & 
(ydigit == yid))[0] ) + initial_clone_index.append(np.where((xdigit == xid) & (ydigit == yid))[0]) return initial_clone_index -def merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index): +def merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index +): n_obs = single_X.shape[0] n_spots = len(clone_index) X = np.zeros((n_obs, 2, n_spots)) base_nb_mean = np.zeros((n_obs, n_spots)) total_bb_RD = np.zeros((n_obs, n_spots)) - for k,idx in enumerate(clone_index): + for k, idx in enumerate(clone_index): if len(idx) == 0: continue - X[:,:, k] = np.sum(single_X[:,:,idx], axis=2) + X[:, :, k] = np.sum(single_X[:, :, idx], axis=2) base_nb_mean[:, k] = np.sum(single_base_nb_mean[:, idx], axis=1) total_bb_RD[:, k] = np.sum(single_total_bb_RD[:, idx], axis=1) return X, base_nb_mean, total_bb_RD -def rectangle_initialize_initial_clone_mix(coords, n_clones, single_tumor_prop, threshold=0.5, random_state=0, EPS=1e-8): +def rectangle_initialize_initial_clone_mix( + coords, n_clones, single_tumor_prop, threshold=0.5, random_state=0, EPS=1e-8 +): np.random.seed(random_state) p = int(np.ceil(np.sqrt(n_clones))) # partition the range of x and y axes based on tumor spots coordinates idx_tumor = np.where(single_tumor_prop > threshold)[0] - px = np.random.dirichlet( np.ones(p) * 10 ) + px = np.random.dirichlet(np.ones(p) * 10) px[-1] -= EPS - xboundary = np.percentile(coords[idx_tumor, 0], 100*np.cumsum(px)) - xboundary[-1] = np.max(coords[:,0]) + 1 - xdigit = np.digitize(coords[:,0], xboundary, right=True) + xboundary = np.percentile(coords[idx_tumor, 0], 100 * np.cumsum(px)) + xboundary[-1] = np.max(coords[:, 0]) + 1 + xdigit = np.digitize(coords[:, 0], xboundary, right=True) ydigit = np.zeros(coords.shape[0], dtype=int) for x in range(p): - idx_tumor = np.where((single_tumor_prop > threshold) & (xdigit==x))[0] + idx_tumor = np.where((single_tumor_prop > threshold) & (xdigit == x))[0] idx_both = np.where(xdigit == x)[0] - py = np.random.dirichlet( np.ones(p) * 10 ) + py = np.random.dirichlet(np.ones(p) * 10) py[-1] -= EPS - yboundary = np.percentile(coords[idx_tumor, 1], 100*np.cumsum(py)) - yboundary[-1] = np.max(coords[:,1]) + 1 - ydigit[idx_both] = np.digitize(coords[idx_both,1], yboundary, right=True) + yboundary = np.percentile(coords[idx_tumor, 1], 100 * np.cumsum(py)) + yboundary[-1] = np.max(coords[:, 1]) + 1 + ydigit[idx_both] = np.digitize(coords[idx_both, 1], yboundary, right=True) block_id = xdigit * p + ydigit # assigning blocks to clone (note that if sqrt(n_clone) is not an integer, multiple blocks can be assigneed to one clone) block_clone_map = np.random.randint(low=0, high=n_clones, size=p**2) while len(np.unique(block_clone_map)) < n_clones: bc = np.bincount(block_clone_map, minlength=n_clones) - assert np.any(bc==0) - block_clone_map[np.where(block_clone_map==np.argmax(bc))[0][0]] = np.where(bc==0)[0][0] - block_clone_map = {i:block_clone_map[i] for i in range(len(block_clone_map))} + assert np.any(bc == 0) + block_clone_map[np.where(block_clone_map == np.argmax(bc))[0][0]] = np.where( + bc == 0 + )[0][0] + block_clone_map = {i: block_clone_map[i] for i in range(len(block_clone_map))} clone_id = np.array([block_clone_map[i] for i in block_id]) initial_clone_index = [np.where(clone_id == i)[0] for i in range(n_clones)] return initial_clone_index -def fixed_rectangle_initialization_mix(coords, x_part, y_part, single_tumor_prop, threshold=0.5): +def fixed_rectangle_initialization_mix( + coords, x_part, y_part, 
single_tumor_prop, threshold=0.5 +): idx_tumor = np.where(single_tumor_prop > threshold)[0] # - px = np.linspace(0, 1, x_part+1) + px = np.linspace(0, 1, x_part + 1) px[-1] += 0.01 px = px[1:] - xrange = [np.min(coords[idx_tumor,0]), np.max(coords[idx_tumor,0])] - xdigit = np.digitize(coords[:,0], xrange[0] + (xrange[1] - xrange[0]) * px, right=True) + xrange = [np.min(coords[idx_tumor, 0]), np.max(coords[idx_tumor, 0])] + xdigit = np.digitize( + coords[:, 0], xrange[0] + (xrange[1] - xrange[0]) * px, right=True + ) # - py = np.linspace(0, 1, y_part+1) + py = np.linspace(0, 1, y_part + 1) py[-1] += 0.01 py = py[1:] - yrange = [np.min(coords[idx_tumor,1]), np.max(coords[idx_tumor,1])] - ydigit = np.digitize(coords[:,1], yrange[0] + (yrange[1] - yrange[0]) * py, right=True) + yrange = [np.min(coords[idx_tumor, 1]), np.max(coords[idx_tumor, 1])] + ydigit = np.digitize( + coords[:, 1], yrange[0] + (yrange[1] - yrange[0]) * py, right=True + ) # initial_clone_index = [] for xid in range(x_part): for yid in range(y_part): - initial_clone_index.append( np.where((xdigit == xid) & (ydigit == yid))[0] ) + initial_clone_index.append(np.where((xdigit == xid) & (ydigit == yid))[0]) return initial_clone_index -def merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, clone_index, single_tumor_prop, threshold=0.5): +def merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + threshold=0.5, +): n_obs = single_X.shape[0] n_spots = len(clone_index) X = np.zeros((n_obs, 2, n_spots)) @@ -330,11 +413,11 @@ def merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb total_bb_RD = np.zeros((n_obs, n_spots)) tumor_prop = np.zeros(n_spots) - for k,idx in enumerate(clone_index): + for k, idx in enumerate(clone_index): if len(idx) == 0: continue idx = idx[np.where(single_tumor_prop[idx] > threshold)[0]] - X[:,:, k] = np.sum(single_X[:,:,idx], axis=2) + X[:, :, k] = np.sum(single_X[:, :, idx], axis=2) base_nb_mean[:, k] = np.sum(single_base_nb_mean[:, idx], axis=1) total_bb_RD[:, k] = np.sum(single_total_bb_RD[:, idx], axis=1) tumor_prop[k] = np.mean(single_tumor_prop[idx]) if len(idx) > 0 else 0 @@ -352,13 +435,19 @@ def reorder_results(res_combine, posterior, single_tumor_prop): if single_tumor_prop is None: # select near-normal clone and set to clone 0 pred_cnv = res_combine["pred_cnv"] - baf_profiles = np.array([ res_combine["new_p_binom"][pred_cnv[:,c], c] for c in range(n_clones) ]) - cid_normal = np.argmin(np.sum( np.maximum(np.abs(baf_profiles - 0.5)-EPS_BAF, 0), axis=1)) + baf_profiles = np.array( + [res_combine["new_p_binom"][pred_cnv[:, c], c] for c in range(n_clones)] + ) + cid_normal = np.argmin( + np.sum(np.maximum(np.abs(baf_profiles - 0.5) - EPS_BAF, 0), axis=1) + ) cid_rest = np.array([c for c in range(n_clones) if c != cid_normal]).astype(int) reidx = np.append(cid_normal, cid_rest) - map_reidx = {cid:i for i,cid in enumerate(reidx)} + map_reidx = {cid: i for i, cid in enumerate(reidx)} # re-order entries in res_combine - new_res_combine["new_assignment"] = np.array([ map_reidx[c] for c in res_combine["new_assignment"] ]) + new_res_combine["new_assignment"] = np.array( + [map_reidx[c] for c in res_combine["new_assignment"]] + ) new_res_combine["new_log_mu"] = res_combine["new_log_mu"][:, reidx] new_res_combine["new_alphas"] = res_combine["new_alphas"][:, reidx] new_res_combine["new_p_binom"] = res_combine["new_p_binom"][:, reidx] @@ -369,59 +458,109 @@ def reorder_results(res_combine, 
posterior, single_tumor_prop): else: # add normal clone as clone 0 new_res_combine["new_assignment"] = new_res_combine["new_assignment"] + 1 - new_res_combine["new_log_mu"] = np.hstack([np.zeros((n_states,1)), res_combine["new_log_mu"]]) - new_res_combine["new_alphas"] = np.hstack([np.zeros((n_states,1)), res_combine["new_alphas"]]) - new_res_combine["new_p_binom"] = np.hstack([0.5 * np.ones((n_states,1)), res_combine["new_p_binom"]]) - new_res_combine["new_taus"] = np.hstack([np.zeros((n_states,1)), res_combine["new_taus"]]) - new_res_combine["log_gamma"] = np.dstack([np.zeros((n_states, n_obs, 1)), res_combine["log_gamma"]]) - new_res_combine["pred_cnv"] = np.hstack([np.zeros((n_obs,1), dtype=int), res_combine["pred_cnv"]]) - new_posterior = np.hstack([np.ones((n_spots,1)) * np.nan, posterior]) + new_res_combine["new_log_mu"] = np.hstack( + [np.zeros((n_states, 1)), res_combine["new_log_mu"]] + ) + new_res_combine["new_alphas"] = np.hstack( + [np.zeros((n_states, 1)), res_combine["new_alphas"]] + ) + new_res_combine["new_p_binom"] = np.hstack( + [0.5 * np.ones((n_states, 1)), res_combine["new_p_binom"]] + ) + new_res_combine["new_taus"] = np.hstack( + [np.zeros((n_states, 1)), res_combine["new_taus"]] + ) + new_res_combine["log_gamma"] = np.dstack( + [np.zeros((n_states, n_obs, 1)), res_combine["log_gamma"]] + ) + new_res_combine["pred_cnv"] = np.hstack( + [np.zeros((n_obs, 1), dtype=int), res_combine["pred_cnv"]] + ) + new_posterior = np.hstack([np.ones((n_spots, 1)) * np.nan, posterior]) return new_res_combine, new_posterior def reorder_results_merged(res, n_obs): n_clones = int(len(res["pred_cnv"]) / n_obs) EPS_BAF = 0.05 - pred_cnv = np.array([ res["pred_cnv"][(c*n_obs):(c*n_obs + n_obs)] for c in range(n_clones) ]).T - baf_profiles = np.array([ res["new_p_binom"][pred_cnv[:,c], 0] for c in range(n_clones) ]) - cid_normal = np.argmin(np.sum( np.maximum(np.abs(baf_profiles - 0.5)-EPS_BAF, 0), axis=1)) + pred_cnv = np.array( + [res["pred_cnv"][(c * n_obs) : (c * n_obs + n_obs)] for c in range(n_clones)] + ).T + baf_profiles = np.array( + [res["new_p_binom"][pred_cnv[:, c], 0] for c in range(n_clones)] + ) + cid_normal = np.argmin( + np.sum(np.maximum(np.abs(baf_profiles - 0.5) - EPS_BAF, 0), axis=1) + ) cid_rest = np.array([c for c in range(n_clones) if c != cid_normal]) reidx = np.append(cid_normal, cid_rest) - map_reidx = {cid:i for i,cid in enumerate(reidx)} + map_reidx = {cid: i for i, cid in enumerate(reidx)} # re-order entries in res new_res = copy.copy(res) - new_res["new_assignment"] = np.array([ map_reidx[c] for c in res["new_assignment"] ]) - new_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c*n_obs):(c*n_obs + n_obs)] for c in reidx ]) - new_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c*n_obs):(c*n_obs + n_obs)] for c in reidx ]) + new_res["new_assignment"] = np.array([map_reidx[c] for c in res["new_assignment"]]) + new_res["log_gamma"] = np.hstack( + [res["log_gamma"][:, (c * n_obs) : (c * n_obs + n_obs)] for c in reidx] + ) + new_res["pred_cnv"] = np.concatenate( + [res["pred_cnv"][(c * n_obs) : (c * n_obs + n_obs)] for c in reidx] + ) return new_res - + def load_hmrf_last_iteration(filename): - allres = dict( np.load(filename, allow_pickle=True) ) + allres = dict(np.load(filename, allow_pickle=True)) r = allres["num_iterations"] - 1 - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - 
"new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } if "barcodes" in allres.keys(): res["barcodes"] = allres["barcodes"] return res def load_hmrf_given_iteration(filename, r): - allres = dict( np.load(filename, allow_pickle=True) ) - res = {"new_log_mu":allres[f"round{r}_new_log_mu"], "new_alphas":allres[f"round{r}_new_alphas"], \ - "new_p_binom":allres[f"round{r}_new_p_binom"], "new_taus":allres[f"round{r}_new_taus"], \ - "new_log_startprob":allres[f"round{r}_new_log_startprob"], "new_log_transmat":allres[f"round{r}_new_log_transmat"], "log_gamma":allres[f"round{r}_log_gamma"], \ - "pred_cnv":allres[f"round{r}_pred_cnv"], "llf":allres[f"round{r}_llf"], "total_llf":allres[f"round{r}_total_llf"], \ - "prev_assignment":allres[f"round{r-1}_assignment"], "new_assignment":allres[f"round{r}_assignment"]} + allres = dict(np.load(filename, allow_pickle=True)) + res = { + "new_log_mu": allres[f"round{r}_new_log_mu"], + "new_alphas": allres[f"round{r}_new_alphas"], + "new_p_binom": allres[f"round{r}_new_p_binom"], + "new_taus": allres[f"round{r}_new_taus"], + "new_log_startprob": allres[f"round{r}_new_log_startprob"], + "new_log_transmat": allres[f"round{r}_new_log_transmat"], + "log_gamma": allres[f"round{r}_log_gamma"], + "pred_cnv": allres[f"round{r}_pred_cnv"], + "llf": allres[f"round{r}_llf"], + "total_llf": allres[f"round{r}_total_llf"], + "prev_assignment": allres[f"round{r-1}_assignment"], + "new_assignment": allres[f"round{r}_assignment"], + } if "barcodes" in allres.keys(): res["barcodes"] = allres["barcodes"] return res -def identify_normal_spots(single_X, single_total_bb_RD, new_assignment, pred_cnv, p_binom, min_count, EPS_BAF=0.05, COUNT_QUANTILE=0.05, MIN_TOTAL=10): +def identify_normal_spots( + single_X, + single_total_bb_RD, + new_assignment, + pred_cnv, + p_binom, + min_count, + EPS_BAF=0.05, + COUNT_QUANTILE=0.05, + MIN_TOTAL=10, +): """ Attributes ---------- @@ -443,29 +582,47 @@ def identify_normal_spots(single_X, single_total_bb_RD, new_assignment, pred_cnv n_spots = single_X.shape[2] n_clones = int(len(pred_cnv) / n_obs) n_states = p_binom.shape[0] - reshaped_pred_cnv = pred_cnv.reshape((n_obs, n_clones), order='F') + reshaped_pred_cnv = pred_cnv.reshape((n_obs, n_clones), order="F") baf_profiles = p_binom[reshaped_pred_cnv, 0].T - id_nearnormal_clone = np.argmin(np.sum( np.maximum(np.abs(baf_profiles - 0.5)-EPS_BAF, 0), axis=1)) - umi_quantile = np.quantile(np.sum(single_X[:,0,:], axis=0), COUNT_QUANTILE) - + id_nearnormal_clone = np.argmin( + np.sum(np.maximum(np.abs(baf_profiles - 0.5) - EPS_BAF, 0), axis=1) + ) + umi_quantile = 
np.quantile(np.sum(single_X[:, 0, :], axis=0), COUNT_QUANTILE) + baf_deviations = np.ones(n_spots) for i in range(n_spots): - if new_assignment[i] == id_nearnormal_clone and np.sum(single_X[:,0,i]) >= umi_quantile: + if ( + new_assignment[i] == id_nearnormal_clone + and np.sum(single_X[:, 0, i]) >= umi_quantile + ): # enumerate the partition of all clones to aggregate counts, and list the BAF of each partition this_bafs = [] for c in range(n_clones): - agg_b_count = np.array([ np.sum(single_X[reshaped_pred_cnv[:,c]==s, 1, i]) for s in range(n_states) ]) - agg_t_count = np.array([ np.sum(single_total_bb_RD[reshaped_pred_cnv[:,c]==s, i]) for s in range(n_states) ]) - this_bafs.append( agg_b_count[agg_t_count>=MIN_TOTAL] / agg_t_count[agg_t_count>=MIN_TOTAL] ) + agg_b_count = np.array( + [ + np.sum(single_X[reshaped_pred_cnv[:, c] == s, 1, i]) + for s in range(n_states) + ] + ) + agg_t_count = np.array( + [ + np.sum(single_total_bb_RD[reshaped_pred_cnv[:, c] == s, i]) + for s in range(n_states) + ] + ) + this_bafs.append( + agg_b_count[agg_t_count >= MIN_TOTAL] + / agg_t_count[agg_t_count >= MIN_TOTAL] + ) this_bafs = np.concatenate(this_bafs) baf_deviations[i] = np.max(np.abs(this_bafs - 0.5)) sorted_idx = np.argsort(baf_deviations) - summed_counts = np.cumsum( np.sum(single_X[:,0,sorted_idx], axis=0) ) + summed_counts = np.cumsum(np.sum(single_X[:, 0, sorted_idx], axis=0)) n_normal = np.where(summed_counts >= min_count)[0][0] - return (baf_deviations <= baf_deviations[sorted_idx[n_normal]]) + return baf_deviations <= baf_deviations[sorted_idx[n_normal]] # def identify_loh_per_clone(single_X, new_assignment, pred_cnv, p_binom, normal_candidate, MIN_BAF_DEVIATION_RANGE=[0.25, 0.12], MIN_BINS_PER_STATE=10, MIN_BINS_ALL=50): @@ -477,7 +634,7 @@ def identify_normal_spots(single_X, single_total_bb_RD, new_assignment, pred_cnv # new_assignment : array, shape (n_spots,) # Clone assignment for each spot. - + # pred_cnv : array, shape (n_obs * n_clones) # Copy number states across bins for each clone. @@ -532,17 +689,29 @@ def identify_normal_spots(single_X, single_total_bb_RD, new_assignment, pred_cnv # Update ideas: why not finding high purity clone and loh states together by varying BAF deviation threshold? # Current we first identify high purity clone using BAF deviation threshold = 0.15, then identify loh states. # But we can vary BAF deviation threshold from the large to small, identify high purity clones and loh states based on the same threshold. -# At very large threshold value, there will be no high purity clone, which is unreasonable. +# At very large threshold value, there will be no high purity clone, which is unreasonable. # While lowering the threshold, purity clone(s) will appear, and we terminate once we are able to find one high purity clone. -# Another update idea: identification of loh states is unaware of RDR. +# Another update idea: identification of loh states is unaware of RDR. # We can first find low-copy-number loh states first by thresholding RDR. If we can't find any, increase RDR threshold. 
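# A minimal sketch of that joint search (illustrative only; the helper names below are
# hypothetical and not defined in this module):
# for threshold in np.arange(0.25, 0.11, -0.02):
#     clones_hightumor = find_high_purity_clones(k_baf_deviation, threshold)
#     loh_states = find_loh_states(p_binom, pred_cnv, rdr_values, threshold)
#     if len(clones_hightumor) > 0 and has_enough_loh_bins(loh_states, clones_hightumor):
#         break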
# """ # return loh_states, is_B_lost, rdr_values, clones_hightumor -def identify_loh_per_clone(single_X, new_assignment, pred_cnv, p_binom, normal_candidate, single_total_bb_RD, MIN_SNPUMI=10, MAX_RDR=1, MIN_BAF_DEVIATION_RANGE=[0.25, 0.12], MIN_BINS_PER_STATE=10, MIN_BINS_ALL=25): +def identify_loh_per_clone( + single_X, + new_assignment, + pred_cnv, + p_binom, + normal_candidate, + single_total_bb_RD, + MIN_SNPUMI=10, + MAX_RDR=1, + MIN_BAF_DEVIATION_RANGE=[0.25, 0.12], + MIN_BINS_PER_STATE=10, + MIN_BINS_ALL=25, +): """ Attributes ---------- @@ -551,7 +720,7 @@ def identify_loh_per_clone(single_X, new_assignment, pred_cnv, p_binom, normal_c new_assignment : array, shape (n_spots,) Clone assignment for each spot. - + pred_cnv : array, shape (n_obs * n_clones) Copy number states across bins for each clone. @@ -572,38 +741,66 @@ def identify_loh_per_clone(single_X, new_assignment, pred_cnv, p_binom, normal_c n_obs = single_X.shape[0] n_clones = int(len(pred_cnv) / n_obs) n_states = p_binom.shape[0] - reshaped_pred_cnv = pred_cnv.reshape((n_obs, n_clones), order='F') - + reshaped_pred_cnv = pred_cnv.reshape((n_obs, n_clones), order="F") + # per-state RDR values # first get the normal baseline expression per spot per bin - simple_rdr_normal = np.sum(single_X[:, 0, (normal_candidate==True)], axis=1) + simple_rdr_normal = np.sum(single_X[:, 0, (normal_candidate == True)], axis=1) simple_rdr_normal = simple_rdr_normal / np.sum(simple_rdr_normal) - simple_single_base_nb_mean = simple_rdr_normal.reshape(-1,1) @ np.sum(single_X[:,0,:], axis=0).reshape(1,-1) + simple_single_base_nb_mean = simple_rdr_normal.reshape(-1, 1) @ np.sum( + single_X[:, 0, :], axis=0 + ).reshape(1, -1) # then aggregate to clones clone_index = [np.where(new_assignment == c)[0] for c in range(n_clones)] - X, base_nb_mean, _ = merge_pseudobulk_by_index(single_X, simple_single_base_nb_mean, np.zeros(simple_single_base_nb_mean.shape), clone_index) + X, base_nb_mean, _ = merge_pseudobulk_by_index( + single_X, + simple_single_base_nb_mean, + np.zeros(simple_single_base_nb_mean.shape), + clone_index, + ) rdr_values = [] for s in np.arange(n_states): - rdr_values.append( np.sum(X[:,0,:][reshaped_pred_cnv==s]) / np.sum(base_nb_mean[reshaped_pred_cnv==s]) ) + rdr_values.append( + np.sum(X[:, 0, :][reshaped_pred_cnv == s]) + / np.sum(base_nb_mean[reshaped_pred_cnv == s]) + ) rdr_values = np.array(rdr_values) # SNP-covering UMI per clone - clone_snpumi = np.array([np.sum(single_total_bb_RD[:,new_assignment==c]) for c in range(n_clones)]) + clone_snpumi = np.array( + [np.sum(single_total_bb_RD[:, new_assignment == c]) for c in range(n_clones)] + ) # clones that have a decent tumor proportion # for each clone, if the clones_hightumor-th BAF deviation is large enough - k_baf_deviation = np.sort( np.abs(p_binom[reshaped_pred_cnv, 0]-0.5), axis=0)[-MIN_BINS_ALL,:] + k_baf_deviation = np.sort(np.abs(p_binom[reshaped_pred_cnv, 0] - 0.5), axis=0)[ + -MIN_BINS_ALL, : + ] # LOH states - for threshold in np.arange(MIN_BAF_DEVIATION_RANGE[0], MIN_BAF_DEVIATION_RANGE[1]-0.01, -0.02): - clones_hightumor = np.where( (k_baf_deviation >= threshold) & (clone_snpumi >= MIN_SNPUMI*n_obs) )[0] + for threshold in np.arange( + MIN_BAF_DEVIATION_RANGE[0], MIN_BAF_DEVIATION_RANGE[1] - 0.01, -0.02 + ): + clones_hightumor = np.where( + (k_baf_deviation >= threshold) & (clone_snpumi >= MIN_SNPUMI * n_obs) + )[0] if len(clones_hightumor) == 0: continue if len(clones_hightumor) == n_clones: clones_hightumor = np.argsort(k_baf_deviation)[1:] # LOH states - loh_states 
= np.where( (np.abs(p_binom[:,0] - 0.5) > threshold) & (np.bincount(pred_cnv, minlength=n_states) >= MIN_BINS_PER_STATE) & (rdr_values <= MAX_RDR) )[0] - is_B_lost = (p_binom[loh_states,0] < 0.5) - if np.all([ np.sum(pd.Series(reshaped_pred_cnv[:,c]).isin(loh_states)) >= MIN_BINS_ALL for c in clones_hightumor ]): + loh_states = np.where( + (np.abs(p_binom[:, 0] - 0.5) > threshold) + & (np.bincount(pred_cnv, minlength=n_states) >= MIN_BINS_PER_STATE) + & (rdr_values <= MAX_RDR) + )[0] + is_B_lost = p_binom[loh_states, 0] < 0.5 + if np.all( + [ + np.sum(pd.Series(reshaped_pred_cnv[:, c]).isin(loh_states)) + >= MIN_BINS_ALL + for c in clones_hightumor + ] + ): print(f"threshold = {threshold}") print(f"clones with high tumor proportion: {clones_hightumor}") print(f"BAF deviation threshold = {threshold}, LOH states: {loh_states}") @@ -623,7 +820,18 @@ def identify_loh_per_clone(single_X, new_assignment, pred_cnv, p_binom, normal_c return loh_states, is_B_lost, rdr_values[loh_states], clones_hightumor -def estimator_tumor_proportion(single_X, single_total_bb_RD, assignments, pred_cnv, loh_states, is_B_lost, rdr_values, clone_to_consider, smooth_mat=None, MIN_TOTAL=10): +def estimator_tumor_proportion( + single_X, + single_total_bb_RD, + assignments, + pred_cnv, + loh_states, + is_B_lost, + rdr_values, + clone_to_consider, + smooth_mat=None, + MIN_TOTAL=10, +): """ Attributes ---------- @@ -633,12 +841,12 @@ def estimator_tumor_proportion(single_X, single_total_bb_RD, assignments, pred_c single_total_bb_RD : array, shape (n_obs, n_spots) Total allele count per bin per spot. - assignments : pd.DataFrame of size n_spots with columns "coarse", "combined" + assignments : pd.DataFrame of size n_spots with columns "coarse", "combined" Clone assignment for each spot. pred_cnv : array, shape (n_obs * n_clones) Copy number states across bins for each clone. - + loh_states, is_B_lost, rdr_values: array Copy number states and RDR values corresponding to LOH. @@ -646,29 +854,40 @@ def estimator_tumor_proportion(single_X, single_total_bb_RD, assignments, pred_c ---------- 0.5 ( 1-theta ) / (theta * RDR + 1 - theta) = B_count / Total_count for each LOH state. 
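    For reference, rearranging the relation above for the tumor proportion theta gives

        theta = (Total_count / 2 - B_count) / (Total_count / 2 + (RDR - 1) * B_count),

    which is the closed-form target of the commented-out least-squares estimate_purity
    below; the active estimate_purity fits the same relation with the BAF_Binom model
    (exposure = Total_count, offset = log(RDR)) and recovers theta as
    1 / (1 + exp(fitted coefficient)).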
""" + # def estimate_purity(T_loh, B_loh, rdr_values): # features =(T_loh / 2.0 + rdr_values * B_loh - B_loh)[T_loh>0].reshape(-1,1) # y = (T_loh / 2.0 - B_loh)[T_loh>0] # return np.linalg.lstsq(features, y, rcond=None)[0] def estimate_purity(T_loh, B_loh, rdr_values): idx = np.where(T_loh > 0)[0] - model = BAF_Binom(endog=B_loh[idx], exog=np.ones((len(idx),1)), weights=np.ones(len(idx)), exposure=T_loh[idx], offset=np.log(rdr_values[idx]), scaling=0.5) + model = BAF_Binom( + endog=B_loh[idx], + exog=np.ones((len(idx), 1)), + weights=np.ones(len(idx)), + exposure=T_loh[idx], + offset=np.log(rdr_values[idx]), + scaling=0.5, + ) res = model.fit(disp=False) return 1.0 / (1.0 + np.exp(res.params)) + # n_obs = single_X.shape[0] n_spots = single_X.shape[2] n_clones = int(len(pred_cnv) / n_obs) - reshaped_pred_cnv = pred_cnv.reshape((n_obs, n_clones), order='F') + reshaped_pred_cnv = pred_cnv.reshape((n_obs, n_clones), order="F") - clone_mapping = assignments.groupby(['coarse', 'combined']).agg('first').reset_index() + clone_mapping = ( + assignments.groupby(["coarse", "combined"]).agg("first").reset_index() + ) tumor_proportion = np.zeros(n_spots) full_tumor_proportion = np.zeros((n_spots, n_clones)) for i in range(n_spots): # get adjacent spots for smoothing if smooth_mat is not None: - idx_adj = smooth_mat[i,:].nonzero()[1] + idx_adj = smooth_mat[i, :].nonzero()[1] else: idx_adj = np.array([i]) estimation_based_on_clones_single = np.ones(n_clones) * np.nan @@ -677,26 +896,74 @@ def estimate_purity(T_loh, B_loh, rdr_values): summed_T_smoothed = np.ones(n_clones) for c in clone_to_consider: # single - B_loh = np.array([ np.sum(single_X[:,1,i][reshaped_pred_cnv[:,c]==s]) if is_B_lost[j] else np.sum(single_total_bb_RD[:,i][reshaped_pred_cnv[:,c]==s]) - np.sum(single_X[:,1,i][reshaped_pred_cnv[:,c]==s]) for j,s in enumerate(loh_states)]) - T_loh = np.array([ np.sum(single_total_bb_RD[:,i][reshaped_pred_cnv[:,c]==s]) for s in loh_states]) + B_loh = np.array( + [ + ( + np.sum(single_X[:, 1, i][reshaped_pred_cnv[:, c] == s]) + if is_B_lost[j] + else np.sum( + single_total_bb_RD[:, i][reshaped_pred_cnv[:, c] == s] + ) + - np.sum(single_X[:, 1, i][reshaped_pred_cnv[:, c] == s]) + ) + for j, s in enumerate(loh_states) + ] + ) + T_loh = np.array( + [ + np.sum(single_total_bb_RD[:, i][reshaped_pred_cnv[:, c] == s]) + for s in loh_states + ] + ) if np.all(T_loh == 0): continue - estimation_based_on_clones_single[c] = estimate_purity(T_loh, B_loh, rdr_values) + estimation_based_on_clones_single[c] = estimate_purity( + T_loh, B_loh, rdr_values + ) summed_T_single[c] = np.sum(T_loh) # smoothed - B_loh = np.array([ np.sum(single_X[:,1,idx_adj][reshaped_pred_cnv[:,c]==s]) if is_B_lost[j] else np.sum(single_total_bb_RD[:,idx_adj][reshaped_pred_cnv[:,c]==s]) - np.sum(single_X[:,1,idx_adj][reshaped_pred_cnv[:,c]==s]) for j,s in enumerate(loh_states)]) - T_loh = np.array([ np.sum(single_total_bb_RD[:,idx_adj][reshaped_pred_cnv[:,c]==s]) for s in loh_states]) + B_loh = np.array( + [ + ( + np.sum(single_X[:, 1, idx_adj][reshaped_pred_cnv[:, c] == s]) + if is_B_lost[j] + else np.sum( + single_total_bb_RD[:, idx_adj][reshaped_pred_cnv[:, c] == s] + ) + - np.sum(single_X[:, 1, idx_adj][reshaped_pred_cnv[:, c] == s]) + ) + for j, s in enumerate(loh_states) + ] + ) + T_loh = np.array( + [ + np.sum(single_total_bb_RD[:, idx_adj][reshaped_pred_cnv[:, c] == s]) + for s in loh_states + ] + ) if np.all(T_loh == 0): continue - estimation_based_on_clones_smoothed[c] = estimate_purity(T_loh, B_loh, rdr_values) + 
estimation_based_on_clones_smoothed[c] = estimate_purity( + T_loh, B_loh, rdr_values + ) summed_T_smoothed[c] = np.sum(T_loh) - full_tumor_proportion[i,:] = estimation_based_on_clones_single - if (assignments.combined.values[i] in clone_to_consider) and summed_T_single[assignments.combined.values[i]] >= MIN_TOTAL: - tumor_proportion[i] = estimation_based_on_clones_single[ assignments.combined.values[i] ] - elif (assignments.combined.values[i] in clone_to_consider) and summed_T_smoothed[assignments.combined.values[i]] >= MIN_TOTAL: - tumor_proportion[i] = estimation_based_on_clones_smoothed[ assignments.combined.values[i] ] + full_tumor_proportion[i, :] = estimation_based_on_clones_single + if (assignments.combined.values[i] in clone_to_consider) and summed_T_single[ + assignments.combined.values[i] + ] >= MIN_TOTAL: + tumor_proportion[i] = estimation_based_on_clones_single[ + assignments.combined.values[i] + ] + elif ( + assignments.combined.values[i] in clone_to_consider + ) and summed_T_smoothed[assignments.combined.values[i]] >= MIN_TOTAL: + tumor_proportion[i] = estimation_based_on_clones_smoothed[ + assignments.combined.values[i] + ] elif not assignments.combined.values[i] in clone_to_consider: - tumor_proportion[i] = estimation_based_on_clones_single[np.argmax(summed_T_single)] + tumor_proportion[i] = estimation_based_on_clones_single[ + np.argmax(summed_T_single) + ] else: tumor_proportion[i] = np.nan diff --git a/src/calicost/utils_phase_switch.py b/src/calicost/utils_phase_switch.py index aed6e11..2b30fa3 100644 --- a/src/calicost/utils_phase_switch.py +++ b/src/calicost/utils_phase_switch.py @@ -15,7 +15,7 @@ def get_position_cM_table(chr_pos_vector, geneticmap_file): """ df = pd.read_csv(geneticmap_file, header=0, sep="\t") # remove chrX - df = df[df.chrom.isin( [f"chr{i}" for i in range(1,23)] )] + df = df[df.chrom.isin([f"chr{i}" for i in range(1, 23)])] # check the chromosome names if not ("chr" in str(chr_pos_vector[0][0])): df["chrom"] = [int(x[3:]) for x in df.chrom] @@ -28,22 +28,28 @@ def get_position_cM_table(chr_pos_vector, geneticmap_file): # find the centimorgan values (interpolate between (k-1)-th and k-th rows in centimorgan tables) position_cM = np.ones(len(chr_pos_vector)) * np.nan k = 0 - for i,x in enumerate(chr_pos_vector): + for i, x in enumerate(chr_pos_vector): chrname = x[0] pos = x[1] - while k < len(ref_chrom) and (ref_chrom[k] < chrname or (ref_chrom[k] == chrname and ref_pos[k] < pos)): + while k < len(ref_chrom) and ( + ref_chrom[k] < chrname or (ref_chrom[k] == chrname and ref_pos[k] < pos) + ): k += 1 if k < len(ref_chrom) and ref_chrom[k] == chrname and ref_pos[k] >= pos: - if k > 0 and ref_chrom[k-1] == chrname: - position_cM[i] = ref_cm[k-1] + (pos - ref_pos[k-1]) / (ref_pos[k] - ref_pos[k-1]) * (ref_cm[k] - ref_cm[k-1]) + if k > 0 and ref_chrom[k - 1] == chrname: + position_cM[i] = ref_cm[k - 1] + (pos - ref_pos[k - 1]) / ( + ref_pos[k] - ref_pos[k - 1] + ) * (ref_cm[k] - ref_cm[k - 1]) else: position_cM[i] = (pos - 0) / (ref_pos[k] - 0) * (ref_cm[k] - 0) else: - position_cM[i] = ref_cm[k-1] + position_cM[i] = ref_cm[k - 1] return position_cM -def compute_phase_switch_probability_position(position_cM, chr_pos_vector, nu = 1, min_prob=1e-20): +def compute_phase_switch_probability_position( + position_cM, chr_pos_vector, nu=1, min_prob=1e-20 +): """ Attributes ---------- @@ -54,9 +60,13 @@ def compute_phase_switch_probability_position(position_cM, chr_pos_vector, nu = list of (chr, pos) pairs of SNPs. It is used to identify start of a new chr. 
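    Returns
    ----------
    phase_switch_prob : array, shape (len(position_cM),)
        Probability of a phase switch between each SNP and the next, derived from the
        genetic distance d = cm_next - cm between them; entries remain at the small
        floor value when either SNP has no genetic-map position or the pair spans a
        chromosome boundary.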
""" phase_switch_prob = np.ones(len(position_cM)) * 1e-20 - for i,cm in enumerate(position_cM[:-1]): - cm_next = position_cM[i+1] - if np.isnan(cm) or np.isnan(cm_next) or chr_pos_vector[i][0] != chr_pos_vector[i+1][0]: + for i, cm in enumerate(position_cM[:-1]): + cm_next = position_cM[i + 1] + if ( + np.isnan(cm) + or np.isnan(cm_next) + or chr_pos_vector[i][0] != chr_pos_vector[i + 1][0] + ): continue assert cm <= cm_next d = cm_next - cm @@ -70,25 +80,42 @@ def duplicate_RD(chr_baf, pos_baf, chr_rd, start_rd, end_rd, tumor_rd, normal_rd normal_reads = np.ones(len(chr_baf)) * np.nan idx = 0 for i in range(len(chr_baf)): - while idx < len(chr_rd) and (chr_rd[idx] < chr_baf[i] or (chr_rd[idx] == chr_baf[i] and end_rd[idx] < pos_baf[i])): + while idx < len(chr_rd) and ( + chr_rd[idx] < chr_baf[i] + or (chr_rd[idx] == chr_baf[i] and end_rd[idx] < pos_baf[i]) + ): idx += 1 - if idx < len(chr_rd) and chr_rd[idx] == chr_baf[i] and end_rd[idx] >= pos_baf[i] and start_rd[idx] <= pos_baf[i]: + if ( + idx < len(chr_rd) + and chr_rd[idx] == chr_baf[i] + and end_rd[idx] >= pos_baf[i] + and start_rd[idx] <= pos_baf[i] + ): tumor_reads[i] = tumor_rd[idx] normal_reads[i] = normal_rd[idx] return tumor_reads, normal_reads -def generate_input_from_HATCHet(hatchetdir, output_picklefile, rdrfile="abin/bulk.bb", baffile="baf/bulk.1bed", phasefile="phase/phased.vcf.gz", with_chr_prefix=True): +def generate_input_from_HATCHet( + hatchetdir, + output_picklefile, + rdrfile="abin/bulk.bb", + baffile="baf/bulk.1bed", + phasefile="phase/phased.vcf.gz", + with_chr_prefix=True, +): if with_chr_prefix: unique_chrs = [f"chr{i}" for i in range(1, 23)] else: unique_chrs = np.arange(1, 23) - + ### load hatchet outputs ### if Path(output_picklefile).exists(): # RDR file df_all = pd.read_csv(f"{hatchetdir}/{rdrfile}", header=0, sep="\t") - df_all.iloc[:,0] = pd.Categorical(df_all.iloc[:,0], categories=unique_chrs, ordered=True) + df_all.iloc[:, 0] = pd.Categorical( + df_all.iloc[:, 0], categories=unique_chrs, ordered=True + ) df_all.sort_values(by=["#CHR", "START"], inplace=True) # samples unique_samples = np.unique(df_all["SAMPLE"]) @@ -97,29 +124,61 @@ def generate_input_from_HATCHet(hatchetdir, output_picklefile, rdrfile="abin/bul else: # RDR file df_all = pd.read_csv(f"{hatchetdir}/{rdrfile}", header=0, sep="\t") - df_all.iloc[:,0] = pd.Categorical(df_all.iloc[:,0], categories=unique_chrs, ordered=True) + df_all.iloc[:, 0] = pd.Categorical( + df_all.iloc[:, 0], categories=unique_chrs, ordered=True + ) df_all.sort_values(by=["#CHR", "START"], inplace=True) # samples unique_samples = np.unique(df_all["SAMPLE"]) + # allele counts for individual SNPs def load_shared_BAF(hatchetdir, baffile, unique_chrs, unique_samples): - tmpdf = pd.read_csv(f"{hatchetdir}/{baffile}", header=None, sep="\t", names=["CHR", "POS", "SAMPLE", "REF", "ALT"]) + tmpdf = pd.read_csv( + f"{hatchetdir}/{baffile}", + header=None, + sep="\t", + names=["CHR", "POS", "SAMPLE", "REF", "ALT"], + ) df_baf = [] for chrname in unique_chrs: tmp = tmpdf[tmpdf.CHR == chrname] - list_pos = [set(list(tmp[tmp["SAMPLE"] == s].POS)) for s in unique_samples] # SNP set of each individual sample - shared_pos = set.intersection(*list_pos) # SNPs that are shared across samples - index = np.array([i for i in range(tmp.shape[0]) if tmp.iloc[i,1] in shared_pos]) - tmp = tmp.iloc[index,:] + list_pos = [ + set(list(tmp[tmp["SAMPLE"] == s].POS)) for s in unique_samples + ] # SNP set of each individual sample + shared_pos = set.intersection( + *list_pos + ) # SNPs that are shared 
across samples + index = np.array( + [i for i in range(tmp.shape[0]) if tmp.iloc[i, 1] in shared_pos] + ) + tmp = tmp.iloc[index, :] tmp.sort_values(by=["POS", "SAMPLE"], inplace=True) - df_baf.append( tmp ) + df_baf.append(tmp) df_baf = pd.concat(df_baf, ignore_index=True) return df_baf + df_baf = load_shared_BAF(hatchetdir, baffile, unique_chrs, unique_samples) # reference-based phasing results - df_phase = pd.read_csv(f"{hatchetdir}/{phasefile}", comment="#", sep="\t", \ - names=["CHR", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLENAME"]) - df_phase = df_phase[(df_phase.SAMPLENAME=="0|1") | (df_phase.SAMPLENAME=="1|0")] + df_phase = pd.read_csv( + f"{hatchetdir}/{phasefile}", + comment="#", + sep="\t", + names=[ + "CHR", + "POS", + "ID", + "REF", + "ALT", + "QUAL", + "FILTER", + "INFO", + "FORMAT", + "SAMPLENAME", + ], + ) + df_phase = df_phase[ + (df_phase.SAMPLENAME == "0|1") | (df_phase.SAMPLENAME == "1|0") + ] print("HATCHet dataframes loaded.") ### gather phased BAF info ### @@ -127,13 +186,17 @@ def load_shared_BAF(hatchetdir, baffile, unique_chrs, unique_samples): for chrname in unique_chrs: tmpdf_baf = df_baf[df_baf.CHR == chrname] tmpdf_phase = df_phase[df_phase.CHR == chrname][["POS", "SAMPLENAME"]] - tmpdf_baf = tmpdf_baf.join( tmpdf_phase.set_index("POS"), on="POS") + tmpdf_baf = tmpdf_baf.join(tmpdf_phase.set_index("POS"), on="POS") tmpdf_baf = tmpdf_baf[~tmpdf_baf.SAMPLENAME.isnull()] - tmpdf_baf["B_count"] = np.where(tmpdf_baf.SAMPLENAME=="0|1", tmpdf_baf.REF, tmpdf_baf.ALT) + tmpdf_baf["B_count"] = np.where( + tmpdf_baf.SAMPLENAME == "0|1", tmpdf_baf.REF, tmpdf_baf.ALT + ) tmpdf_baf["DP"] = tmpdf_baf.REF + tmpdf_baf.ALT - df_combined_baf.append( tmpdf_baf ) + df_combined_baf.append(tmpdf_baf) df_combined_baf = pd.concat(df_combined_baf, ignore_index=True) - df_combined_baf.iloc[:,0] = pd.Categorical(df_combined_baf.CHR, categories=unique_chrs, ordered=True) + df_combined_baf.iloc[:, 0] = pd.Categorical( + df_combined_baf.CHR, categories=unique_chrs, ordered=True + ) df_combined_baf.sort_values(by=["CHR", "POS"], inplace=True) df_baf = df_combined_baf @@ -143,51 +206,81 @@ def load_shared_BAF(hatchetdir, baffile, unique_chrs, unique_samples): for s in unique_samples: index = np.where(df_baf["SAMPLE"] == s)[0] index_rd = np.where(df_all["SAMPLE"] == s)[0] - tumor_reads, normal_reads = duplicate_RD(np.array(df_baf.iloc[index,:].CHR.cat.codes), np.array(df_baf.iloc[index,:].POS), \ - np.array(df_all.iloc[index_rd,0].cat.codes), np.array(df_all.iloc[index_rd,:].START), np.array(df_all.iloc[index_rd,:].END), \ - np.array(df_all.iloc[index_rd,:].TOTAL_READS), np.array(df_all.iloc[index_rd,:].NORMAL_READS)) + tumor_reads, normal_reads = duplicate_RD( + np.array(df_baf.iloc[index, :].CHR.cat.codes), + np.array(df_baf.iloc[index, :].POS), + np.array(df_all.iloc[index_rd, 0].cat.codes), + np.array(df_all.iloc[index_rd, :].START), + np.array(df_all.iloc[index_rd, :].END), + np.array(df_all.iloc[index_rd, :].TOTAL_READS), + np.array(df_all.iloc[index_rd, :].NORMAL_READS), + ) df_baf.iloc[index, -2] = tumor_reads df_baf.iloc[index, -1] = normal_reads + # remove SNP positions with TOTAL_READS=NAN (if NAN occurs in one sample, remove the corresponding SNPs for the other samples too) def remove_nan_RD(df_baf): - idx_nan = np.where(np.logical_or( df_baf.TOTAL_READS.isnull(), df_baf.NORMAL_READS.isnull() ))[0] + idx_nan = np.where( + np.logical_or(df_baf.TOTAL_READS.isnull(), df_baf.NORMAL_READS.isnull()) + )[0] chr = np.array(df_baf.CHR) pos = np.array(df_baf.POS) 
chr_pos = np.array([f"{chr[i]}_{pos[i]}" for i in range(len(chr))]) nan_chr_pos = set(list(chr_pos[idx_nan])) - idx_remain = np.array([i for i,snpid in enumerate(chr_pos) if not (snpid in nan_chr_pos)]) + idx_remain = np.array( + [i for i, snpid in enumerate(chr_pos) if not (snpid in nan_chr_pos)] + ) df_baf = df_baf.iloc[idx_remain, :] return df_baf + df_baf = remove_nan_RD(df_baf) df_baf.to_pickle(output_picklefile) print("SNP-level BAF and bin-level RDR paired up.") ### from BAF, RDR table, generate HMM input ### - lengths = np.array([ np.sum(np.logical_and(df_baf["CHR"]==chrname, df_baf["SAMPLE"]==unique_samples[0])) for chrname in unique_chrs ]) + lengths = np.array( + [ + np.sum( + np.logical_and( + df_baf["CHR"] == chrname, df_baf["SAMPLE"] == unique_samples[0] + ) + ) + for chrname in unique_chrs + ] + ) - X = np.zeros(( np.sum(lengths), 2, len(unique_samples) )) - base_nb_mean = np.zeros((np.sum(lengths), len(unique_samples) )) - total_bb_RD = np.zeros((np.sum(lengths), len(unique_samples) )) + X = np.zeros((np.sum(lengths), 2, len(unique_samples))) + base_nb_mean = np.zeros((np.sum(lengths), len(unique_samples))) + total_bb_RD = np.zeros((np.sum(lengths), len(unique_samples))) - for k,s in enumerate(unique_samples): + for k, s in enumerate(unique_samples): df = df_baf[df_baf["SAMPLE"] == s] - X[:,0,k] = df.TOTAL_READS - X[:,1,k] = df.B_count + X[:, 0, k] = df.TOTAL_READS + X[:, 1, k] = df.B_count - total_bb_RD[:,k] = np.array(df.DP) + total_bb_RD[:, k] = np.array(df.DP) df2 = df_all[df_all["SAMPLE"] == s] - base_nb_mean[:,k] = np.array(df.NORMAL_READS / np.sum(df2.NORMAL_READS) * np.sum(df2.TOTAL_READS)) + base_nb_mean[:, k] = np.array( + df.NORMAL_READS / np.sum(df2.NORMAL_READS) * np.sum(df2.TOTAL_READS) + ) # site-wise transition matrix - chr_pos_vector = [(df_baf.CHR.iloc[i], df_baf.POS.iloc[i]) for i in np.where(df_baf["SAMPLE"]==unique_samples[0])[0]] + chr_pos_vector = [ + (df_baf.CHR.iloc[i], df_baf.POS.iloc[i]) + for i in np.where(df_baf["SAMPLE"] == unique_samples[0])[0] + ] position_cM = get_position_cM_table(chr_pos_vector) - phase_switch_prob = compute_phase_switch_probability_position(position_cM, chr_pos_vector) + phase_switch_prob = compute_phase_switch_probability_position( + position_cM, chr_pos_vector + ) log_sitewise_transmat = np.log(phase_switch_prob) return X, lengths, base_nb_mean, total_bb_RD, log_sitewise_transmat -def distance_between_p_binom(state_pred1, clone_pred1, p_binom1, state_pred2, clone_pred2, p_binom2): +def distance_between_p_binom( + state_pred1, clone_pred1, p_binom1, state_pred2, clone_pred2, p_binom2 +): import networkx as nx # matching predicted CNV states @@ -201,7 +294,22 @@ def distance_between_p_binom(state_pred1, clone_pred1, p_binom1, state_pred2, cl # tmp = nx.max_weight_matching(G) # state_matching = {x[0]:x[1] for x in tmp} # state_matching.update( {x[1]:x[0] for x in tmp} ) - G.add_weighted_edges_from( [(f"A{i}", f"B{j}", len(state_pred1) - np.sum(np.logical_and(state_pred1==uniq_pred1[i], state_pred2==uniq_pred2[j]))) for i in uniq_pred1 for j in uniq_pred2] ) + G.add_weighted_edges_from( + [ + ( + f"A{i}", + f"B{j}", + len(state_pred1) + - np.sum( + np.logical_and( + state_pred1 == uniq_pred1[i], state_pred2 == uniq_pred2[j] + ) + ), + ) + for i in uniq_pred1 + for j in uniq_pred2 + ] + ) state_matching = nx.bipartite.minimum_weight_full_matching(G) # matching predicted clones @@ -215,16 +323,38 @@ def distance_between_p_binom(state_pred1, clone_pred1, p_binom1, state_pred2, cl # tmp = nx.max_weight_matching(G) # 
clone_matching = {x[0]:x[1] for x in tmp} # clone_matching.update( {x[1]:x[0] for x in tmp} ) - G.add_weighted_edges_from( [(f"A{i}", f"B{j}", len(clone_pred1) - np.sum(np.logical_and(clone_pred1==uniq_pred1[i], clone_pred2==uniq_pred2[j]))) for i in uniq_pred1 for j in uniq_pred2] ) + G.add_weighted_edges_from( + [ + ( + f"A{i}", + f"B{j}", + len(clone_pred1) + - np.sum( + np.logical_and( + clone_pred1 == uniq_pred1[i], clone_pred2 == uniq_pred2[j] + ) + ), + ) + for i in uniq_pred1 + for j in uniq_pred2 + ] + ) clone_matching = nx.bipartite.minimum_weight_full_matching(G) # l2 distance between corresponding CNV at corresponding clone # reorder p_binom2 based on state_matching and clone_matching - reorder_p_binom2 = p_binom2[:, np.array([ int(clone_matching[f"A{i}"][1:]) for i in range(n_clones)])] - reorder_p_binom2 = reorder_p_binom2[np.array([ int(state_matching[f"A{i}"][1:]) for i in range(n_states) ]), :] + reorder_p_binom2 = p_binom2[ + :, np.array([int(clone_matching[f"A{i}"][1:]) for i in range(n_clones)]) + ] + reorder_p_binom2 = reorder_p_binom2[ + np.array([int(state_matching[f"A{i}"][1:]) for i in range(n_states)]), : + ] l2 = 0 for i in range(p_binom1.shape[0]): - l2 += min( np.sum(np.square(p_binom1[i,:] - reorder_p_binom2[i,:])), np.sum(np.square(p_binom1[i,:] - 1 + reorder_p_binom2[i,:])) ) + l2 += min( + np.sum(np.square(p_binom1[i, :] - reorder_p_binom2[i, :])), + np.sum(np.square(p_binom1[i, :] - 1 + reorder_p_binom2[i, :])), + ) return l2 @@ -235,14 +365,14 @@ def get_intervals(pred_cnv): while s < len(pred_cnv): t = np.where(pred_cnv[s:] != pred_cnv[s])[0] if len(t) == 0: - intervals.append( (s, len(pred_cnv)) ) - labs.append( pred_cnv[s] ) + intervals.append((s, len(pred_cnv))) + labs.append(pred_cnv[s]) s = len(pred_cnv) else: t = t[0] - intervals.append( (s,s+t) ) - labs.append( pred_cnv[s] ) - s = s+t + intervals.append((s, s + t)) + labs.append(pred_cnv[s]) + s = s + t return intervals, labs @@ -256,14 +386,14 @@ def get_intervals_nd(pred_cnv): while s < len(pred_cnv): t = np.where(np.any(pred_cnv[s:] != pred_cnv[s], axis=1))[0] if len(t) == 0: - intervals.append( (s, len(pred_cnv)) ) - labs.append( pred_cnv[s] ) + intervals.append((s, len(pred_cnv))) + labs.append(pred_cnv[s]) s = len(pred_cnv) else: t = t[0] - intervals.append( (s,s+t) ) - labs.append( pred_cnv[s] ) - s = s+t + intervals.append((s, s + t)) + labs.append(pred_cnv[s]) + s = s + t return intervals, labs @@ -276,14 +406,14 @@ def postbinning_forvisual(X, base_nb_mean, total_bb_RD, lengths, res, binsize=2) nextlen = lengths[chrname] s = 0 while s < X.shape[0]: - t = min(s+binsize, nextlen) - intervals.append( [s,t] ) + t = min(s + binsize, nextlen) + intervals.append([s, t]) s = t if s >= nextlen: if s < X.shape[0]: chrname += 1 nextlen += lengths[chrname] - bin_lengths.append( len(intervals) ) + bin_lengths.append(len(intervals)) bin_lengths = np.array(bin_lengths) bin_lengths[1:] = bin_lengths[1:] - bin_lengths[:-1] @@ -295,11 +425,21 @@ def postbinning_forvisual(X, base_nb_mean, total_bb_RD, lengths, res, binsize=2) bin_total_bb_RD = np.zeros((len(intervals), total_bb_RD.shape[1]), dtype=int) bin_pred_cnv = np.zeros(len(intervals), dtype=int) for i, intvl in enumerate(intervals): - s,t = intvl - bin_X[i,0,:] = np.sum(X[s:t, 0,:], axis=0) - bin_X[i,1,:] = np.sum( phase_prob[s:t].dot(X[s:t, 1,:]) + (1-phase_prob[s:t]).dot(total_bb_RD[s:t,:] - X[s:t,1,:]) ) - bin_base_nb_mean[i,:] = np.sum(base_nb_mean[s:t,:], axis=0) - bin_total_bb_RD[i,:] = np.sum(total_bb_RD[s:t,:], axis=0) + s, t = intvl + 
bin_X[i, 0, :] = np.sum(X[s:t, 0, :], axis=0) + bin_X[i, 1, :] = np.sum( + phase_prob[s:t].dot(X[s:t, 1, :]) + + (1 - phase_prob[s:t]).dot(total_bb_RD[s:t, :] - X[s:t, 1, :]) + ) + bin_base_nb_mean[i, :] = np.sum(base_nb_mean[s:t, :], axis=0) + bin_total_bb_RD[i, :] = np.sum(total_bb_RD[s:t, :], axis=0) bin_pred_cnv[i] = res["pred_cnv"][s] - - return bin_X, bin_base_nb_mean, bin_total_bb_RD, bin_pred_cnv, bin_lengths, intervals \ No newline at end of file + + return ( + bin_X, + bin_base_nb_mean, + bin_total_bb_RD, + bin_pred_cnv, + bin_lengths, + intervals, + ) diff --git a/src/calicost/utils_plotting.py b/src/calicost/utils_plotting.py index 079278a..4544e76 100644 --- a/src/calicost/utils_plotting.py +++ b/src/calicost/utils_plotting.py @@ -1,4 +1,3 @@ - import sys import argparse @@ -22,409 +21,1098 @@ def get_full_palette(): palette = {} - palette.update({(0, 0) : 'darkblue'}) - palette.update({(1, 0) : 'lightblue'}) - palette.update({(1, 1) : 'lightgray', (2, 0) : 'dimgray'}) - palette.update({(2, 1) : 'lightgoldenrodyellow', (3, 0) : 'gold'}) + palette.update({(0, 0): "darkblue"}) + palette.update({(1, 0): "lightblue"}) + palette.update({(1, 1): "lightgray", (2, 0): "dimgray"}) + palette.update({(2, 1): "lightgoldenrodyellow", (3, 0): "gold"}) # palette.update({(2, 1) : 'greenyellow', (3, 0) : 'darkseagreen'}) - palette.update({(2, 2) : 'navajowhite', (3, 1) : 'orange', (4, 0) : 'darkorange'}) - palette.update({(3, 2) : 'salmon', (4, 1) : 'red', (5, 0) : 'darkred'}) - palette.update({(3, 3) : 'plum', (4, 2) : 'orchid', (5, 1) : 'purple', (6, 0) : 'indigo'}) - ordered_acn = [(0, 0), (1, 0), (1, 1), (2, 0), (2, 1), (3, 0), \ - (2, 2), (3, 1), (4, 0), (3, 2), (4, 1), (5, 0), \ - (3, 3), (4, 2), (5, 1), (6, 0)] + palette.update({(2, 2): "navajowhite", (3, 1): "orange", (4, 0): "darkorange"}) + palette.update({(3, 2): "salmon", (4, 1): "red", (5, 0): "darkred"}) + palette.update( + {(3, 3): "plum", (4, 2): "orchid", (5, 1): "purple", (6, 0): "indigo"} + ) + ordered_acn = [ + (0, 0), + (1, 0), + (1, 1), + (2, 0), + (2, 1), + (3, 0), + (2, 2), + (3, 1), + (4, 0), + (3, 2), + (4, 1), + (5, 0), + (3, 3), + (4, 2), + (5, 1), + (6, 0), + ] return palette, ordered_acn -def plot_acn(cn_file, ax_handle, clone_ids=None, clone_names=None, add_chrbar=True, add_arrow=True, chrbar_thickness=0.1, add_legend=True, remove_xticks=True): +def plot_acn( + cn_file, + ax_handle, + clone_ids=None, + clone_names=None, + add_chrbar=True, + add_arrow=True, + chrbar_thickness=0.1, + add_legend=True, + remove_xticks=True, +): # full color palette - palette,_ = get_full_palette() + palette, _ = get_full_palette() # read CN profiles df_cnv = pd.read_csv(cn_file, header=0, sep="\t") - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) print(final_clone_ids) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) found = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) found += list(zip(major, minor)) found = list(set(found)) found.sort() # map CN to single 
digit number - map_cn = {x:i for i,x in enumerate(found)} + map_cn = {x: i for i, x in enumerate(found)} cnv_mapped = [] ploidy = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - cnv_mapped.append( [map_cn[(major[i], minor[i])] for i in range(len(major))] ) - ploidy.append( np.mean(major + minor) ) - cnv_mapped = pd.DataFrame( np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids]) - ploidy = pd.DataFrame(np.around(np.array(ploidy), decimals=2).reshape(-1,1), index=[f"clone {cid}" for cid in final_clone_ids]) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + cnv_mapped.append([map_cn[(major[i], minor[i])] for i in range(len(major))]) + ploidy.append(np.mean(major + minor)) + cnv_mapped = pd.DataFrame( + np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids] + ) + ploidy = pd.DataFrame( + np.around(np.array(ploidy), decimals=2).reshape(-1, 1), + index=[f"clone {cid}" for cid in final_clone_ids], + ) chr_ids = df_cnv.CHR colors = [palette[c] for c in found] if clone_ids is None: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in final_clone_ids] - rename_cnv_mapped = pd.DataFrame(cnv_mapped.values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(final_clone_ids)]) - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(final_clone_ids) + ], + ) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) else: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in clone_ids] if clone_names is None: - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ], + ) else: - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)]) - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[ + f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ], + ) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) # indicate allele switches if add_arrow: if clone_ids is None: # find regions where there exist both clones with A > B and clones with A < B - has_up = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values for cid in final_clone_ids]), axis=0) - 
has_down = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values for cid in final_clone_ids]), axis=0) - intervals, labs = get_intervals( (has_up & has_down) ) + has_up = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values + for cid in final_clone_ids + ] + ), + axis=0, + ) + has_down = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values + for cid in final_clone_ids + ] + ), + axis=0, + ) + intervals, labs = get_intervals((has_up & has_down)) # for each intervals, find the corresponding clones with A > B to plot up-arrow, and corresponding clones with A < B to plot down-arrow for i in range(len(intervals)): if not labs[i]: continue - for c,cid in enumerate(final_clone_ids): + for c, cid in enumerate(final_clone_ids): y1 = c - y2 = c+1 + y2 = c + 1 # up-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] > df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + > df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y2+0.1*y1, dx=0, dy=0.7*(y1-y2), head_width=0.3*(sub_int[1] - sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y2 + 0.1 * y1, + dx=0, + dy=0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) # down-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] < df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + < df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y1+0.1*y2, dx=0, dy=-0.7*(y1-y2), head_width=0.3*(sub_int[1]-sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y1 + 0.1 * y2, + dx=0, + dy=-0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) else: # find regions where there exist both clones with A > B and clones with A < B - has_up = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values for cid in clone_ids]), axis=0) - has_down = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values for cid in clone_ids]), axis=0) - intervals, labs = get_intervals( (has_up & has_down) ) + has_up = np.any( + 
np.vstack( + [ + df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values + for cid in clone_ids + ] + ), + axis=0, + ) + has_down = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values + for cid in clone_ids + ] + ), + axis=0, + ) + intervals, labs = get_intervals((has_up & has_down)) # for each intervals, find the corresponding clones with A > B to plot up-arrow, and corresponding clones with A < B to plot down-arrow for i in range(len(intervals)): if not labs[i]: continue - for c,cid in enumerate(clone_ids): + for c, cid in enumerate(clone_ids): y1 = c - y2 = c+1 + y2 = c + 1 # up-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] > df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + > df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y2+0.1*y1, dx=0, dy=0.7*(y1-y2), head_width=0.3*(sub_int[1] - sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y2 + 0.1 * y1, + dx=0, + dy=0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) # down-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] < df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + < df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y1+0.1*y2, dx=0, dy=-0.7*(y1-y2), head_width=0.3*(sub_int[1] - sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y1 + 0.1 * y2, + dx=0, + dy=-0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) if add_chrbar: # add chr color - chr_palette = cycle(['#525252', '#969696', '#cccccc']) - lut = {c:next(chr_palette) for c in np.unique(chr_ids.values)} + chr_palette = cycle(["#525252", "#969696", "#cccccc"]) + lut = {c: next(chr_palette) for c in np.unique(chr_ids.values)} col_colors = chr_ids.map(lut) for i, color in enumerate(col_colors): - ax_handle.add_patch(plt.Rectangle(xy=(i, 1.01), width=1, height=chrbar_thickness, color=color, lw=0, transform=ax_handle.get_xaxis_transform(), clip_on=False, rasterized=True)) + ax_handle.add_patch( + plt.Rectangle( + xy=(i, 1.01), + width=1, + height=chrbar_thickness, + color=color, + lw=0, + 
transform=ax_handle.get_xaxis_transform(), + clip_on=False, + rasterized=True, + ) + ) for c in np.unique(chr_ids.values): interval = np.where(chr_ids.values == c)[0] mid = np.percentile(interval, 45) - ax_handle.text(mid-10, 1.04, str(c), transform=ax_handle.get_xaxis_transform()) + ax_handle.text( + mid - 10, 1.04, str(c), transform=ax_handle.get_xaxis_transform() + ) ax_handle.set_yticklabels(ax_handle.get_yticklabels(), rotation=0) if remove_xticks: ax_handle.set_xticks([]) if add_legend: - a00 = plt.arrow(0,0, 0,0, color='darkblue') - a10 = plt.arrow(0,0, 0,0, color='lightblue') - a11 = plt.arrow(0,0, 0,0, color='lightgray') - a20 = plt.arrow(0,0, 0,0, color='dimgray') - a21 = plt.arrow(0,0, 0,0, color='lightgoldenrodyellow') - a30 = plt.arrow(0,0, 0,0, color='gold') - a22 = plt.arrow(0,0, 0,0, color='navajowhite') - a31 = plt.arrow(0,0, 0,0, color='orange') - a40 = plt.arrow(0,0, 0,0, color='darkorange') - a32 = plt.arrow(0,0, 0,0, color='salmon') - a41 = plt.arrow(0,0, 0,0, color='red') - a50 = plt.arrow(0,0, 0,0, color='darkred') - a33 = plt.arrow(0,0, 0,0, color='plum') - a42 = plt.arrow(0,0, 0,0, color='orchid') - a51 = plt.arrow(0,0, 0,0, color='purple') - a60 = plt.arrow(0,0, 0,0, color='indigo') - ax_handle.legend([a00, a10, a11, a20, a21, a30, a22, a31, a40, a32, a41, a50, a33, a42, a51, a60], \ - ['(0, 0)','(1, 0)','(1, 1)','(2, 0)', '(2, 1)','(3, 0)', '(2, 2)','(3, 1)','(4, 0)','(3, 2)', \ - '(4, 1)','(5, 0)', '(3, 3)','(4, 2)','(5, 1)','(6, 0)'], ncol=2, loc='upper left', bbox_to_anchor=(1,1)) + a00 = plt.arrow(0, 0, 0, 0, color="darkblue") + a10 = plt.arrow(0, 0, 0, 0, color="lightblue") + a11 = plt.arrow(0, 0, 0, 0, color="lightgray") + a20 = plt.arrow(0, 0, 0, 0, color="dimgray") + a21 = plt.arrow(0, 0, 0, 0, color="lightgoldenrodyellow") + a30 = plt.arrow(0, 0, 0, 0, color="gold") + a22 = plt.arrow(0, 0, 0, 0, color="navajowhite") + a31 = plt.arrow(0, 0, 0, 0, color="orange") + a40 = plt.arrow(0, 0, 0, 0, color="darkorange") + a32 = plt.arrow(0, 0, 0, 0, color="salmon") + a41 = plt.arrow(0, 0, 0, 0, color="red") + a50 = plt.arrow(0, 0, 0, 0, color="darkred") + a33 = plt.arrow(0, 0, 0, 0, color="plum") + a42 = plt.arrow(0, 0, 0, 0, color="orchid") + a51 = plt.arrow(0, 0, 0, 0, color="purple") + a60 = plt.arrow(0, 0, 0, 0, color="indigo") + ax_handle.legend( + [ + a00, + a10, + a11, + a20, + a21, + a30, + a22, + a31, + a40, + a32, + a41, + a50, + a33, + a42, + a51, + a60, + ], + [ + "(0, 0)", + "(1, 0)", + "(1, 1)", + "(2, 0)", + "(2, 1)", + "(3, 0)", + "(2, 2)", + "(3, 1)", + "(4, 0)", + "(3, 2)", + "(4, 1)", + "(5, 0)", + "(3, 3)", + "(4, 2)", + "(5, 1)", + "(6, 0)", + ], + ncol=2, + loc="upper left", + bbox_to_anchor=(1, 1), + ) return ax_handle -def plot_acn_from_df(df_cnv, ax_handle, clone_ids=None, clone_names=None, add_chrbar=True, add_arrow=True, chrbar_thickness=0.1, add_legend=True, remove_xticks=True, rasterized=True): +def plot_acn_from_df( + df_cnv, + ax_handle, + clone_ids=None, + clone_names=None, + add_chrbar=True, + add_arrow=True, + chrbar_thickness=0.1, + add_legend=True, + remove_xticks=True, + rasterized=True, +): # full color palette - palette,_ = get_full_palette() + palette, _ = get_full_palette() # read CN profiles - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) print(final_clone_ids) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + assert (clone_ids is None) or np.all( + [(cid in 
final_clone_ids) for cid in clone_ids] + ) found = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) found += list(zip(major, minor)) found = list(set(found)) found.sort() # map CN to single digit number - map_cn = {x:i for i,x in enumerate(found)} + map_cn = {x: i for i, x in enumerate(found)} cnv_mapped = [] ploidy = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - cnv_mapped.append( [map_cn[(major[i], minor[i])] for i in range(len(major))] ) - ploidy.append( np.mean(major + minor) ) - cnv_mapped = pd.DataFrame( np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids]) - ploidy = pd.DataFrame(np.around(np.array(ploidy), decimals=2).reshape(-1,1), index=[f"clone {cid}" for cid in final_clone_ids]) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + cnv_mapped.append([map_cn[(major[i], minor[i])] for i in range(len(major))]) + ploidy.append(np.mean(major + minor)) + cnv_mapped = pd.DataFrame( + np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids] + ) + ploidy = pd.DataFrame( + np.around(np.array(ploidy), decimals=2).reshape(-1, 1), + index=[f"clone {cid}" for cid in final_clone_ids], + ) chr_ids = df_cnv.CHR colors = [palette[c] for c in found] if clone_ids is None: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in final_clone_ids] - rename_cnv_mapped = pd.DataFrame(cnv_mapped.values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(final_clone_ids)]) - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=rasterized, ax=ax_handle) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(final_clone_ids) + ], + ) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=rasterized, + ax=ax_handle, + ) else: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in clone_ids] if clone_names is None: - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ], + ) else: - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)]) - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=rasterized, ax=ax_handle) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[ + 
f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ], + ) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=rasterized, + ax=ax_handle, + ) # indicate allele switches if add_arrow: if clone_ids is None: # find regions where there exist both clones with A > B and clones with A < B - has_up = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values for cid in final_clone_ids]), axis=0) - has_down = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values for cid in final_clone_ids]), axis=0) - intervals, labs = get_intervals( (has_up & has_down) ) + has_up = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values + for cid in final_clone_ids + ] + ), + axis=0, + ) + has_down = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values + for cid in final_clone_ids + ] + ), + axis=0, + ) + intervals, labs = get_intervals((has_up & has_down)) # for each intervals, find the corresponding clones with A > B to plot up-arrow, and corresponding clones with A < B to plot down-arrow for i in range(len(intervals)): if not labs[i]: continue - for c,cid in enumerate(final_clone_ids): + for c, cid in enumerate(final_clone_ids): y1 = c - y2 = c+1 + y2 = c + 1 # up-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] > df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + > df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y2+0.1*y1, dx=0, dy=0.7*(y1-y2), head_width=0.3*(sub_int[1] - sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y2 + 0.1 * y1, + dx=0, + dy=0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) # down-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] < df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + < df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y1+0.1*y2, dx=0, dy=-0.7*(y1-y2), head_width=0.3*(sub_int[1]-sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y1 + 0.1 * y2, 
+ dx=0, + dy=-0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) else: # find regions where there exist both clones with A > B and clones with A < B - has_up = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values for cid in clone_ids]), axis=0) - has_down = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values for cid in clone_ids]), axis=0) - intervals, labs = get_intervals( (has_up & has_down) ) + has_up = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values + for cid in clone_ids + ] + ), + axis=0, + ) + has_down = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values + for cid in clone_ids + ] + ), + axis=0, + ) + intervals, labs = get_intervals((has_up & has_down)) # for each intervals, find the corresponding clones with A > B to plot up-arrow, and corresponding clones with A < B to plot down-arrow for i in range(len(intervals)): if not labs[i]: continue - for c,cid in enumerate(clone_ids): + for c, cid in enumerate(clone_ids): y1 = c - y2 = c+1 + y2 = c + 1 # up-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] > df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + > df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y2+0.1*y1, dx=0, dy=0.7*(y1-y2), head_width=0.3*(sub_int[1] - sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y2 + 0.1 * y1, + dx=0, + dy=0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) # down-arrow - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] < df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + < df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black") - ax_handle.arrow(x=intervals[i][0]+np.mean(sub_int), y=0.9*y1+0.1*y2, dx=0, dy=-0.7*(y1-y2), head_width=0.3*(sub_int[1] - sub_int[0]), head_length=0.1*np.abs(y1-y2), fc="black") + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + ) + ax_handle.arrow( + x=intervals[i][0] + np.mean(sub_int), + y=0.9 * y1 + 0.1 * y2, + dx=0, + dy=-0.7 * (y1 - y2), + head_width=0.3 * (sub_int[1] - sub_int[0]), + head_length=0.1 * np.abs(y1 - y2), + fc="black", + ) if add_chrbar: # add chr color - chr_palette = cycle(['#525252', '#969696', '#cccccc']) - lut = 
{c:next(chr_palette) for c in np.unique(chr_ids.values)} + chr_palette = cycle(["#525252", "#969696", "#cccccc"]) + lut = {c: next(chr_palette) for c in np.unique(chr_ids.values)} col_colors = chr_ids.map(lut) for i, color in enumerate(col_colors): - ax_handle.add_patch(plt.Rectangle(xy=(i, 1 + 0.02*chrbar_thickness), width=1, height=chrbar_thickness, color=color, lw=0, transform=ax_handle.get_xaxis_transform(), clip_on=False, rasterized=rasterized)) + ax_handle.add_patch( + plt.Rectangle( + xy=(i, 1 + 0.02 * chrbar_thickness), + width=1, + height=chrbar_thickness, + color=color, + lw=0, + transform=ax_handle.get_xaxis_transform(), + clip_on=False, + rasterized=rasterized, + ) + ) for c in np.unique(chr_ids.values): interval = np.where(chr_ids.values == c)[0] mid = np.percentile(interval, 45) - ax_handle.text(mid-10, 1 + 0.2*chrbar_thickness, str(c), transform=ax_handle.get_xaxis_transform()) + ax_handle.text( + mid - 10, + 1 + 0.2 * chrbar_thickness, + str(c), + transform=ax_handle.get_xaxis_transform(), + ) ax_handle.set_yticklabels(ax_handle.get_yticklabels(), rotation=0) if remove_xticks: ax_handle.set_xticks([]) if add_legend: - a00 = plt.arrow(0,0, 0,0, color='darkblue') - a10 = plt.arrow(0,0, 0,0, color='lightblue') - a11 = plt.arrow(0,0, 0,0, color='lightgray') - a20 = plt.arrow(0,0, 0,0, color='dimgray') - a21 = plt.arrow(0,0, 0,0, color='lightgoldenrodyellow') - a30 = plt.arrow(0,0, 0,0, color='gold') - a22 = plt.arrow(0,0, 0,0, color='navajowhite') - a31 = plt.arrow(0,0, 0,0, color='orange') - a40 = plt.arrow(0,0, 0,0, color='darkorange') - a32 = plt.arrow(0,0, 0,0, color='salmon') - a41 = plt.arrow(0,0, 0,0, color='red') - a50 = plt.arrow(0,0, 0,0, color='darkred') - a33 = plt.arrow(0,0, 0,0, color='plum') - a42 = plt.arrow(0,0, 0,0, color='orchid') - a51 = plt.arrow(0,0, 0,0, color='purple') - a60 = plt.arrow(0,0, 0,0, color='indigo') - ax_handle.legend([a00, a10, a11, a20, a21, a30, a22, a31, a40, a32, a41, a50, a33, a42, a51, a60], \ - ['(0, 0)','(1, 0)','(1, 1)','(2, 0)', '(2, 1)','(3, 0)', '(2, 2)','(3, 1)','(4, 0)','(3, 2)', \ - '(4, 1)','(5, 0)', '(3, 3)','(4, 2)','(5, 1)','(6, 0)'], ncol=2, loc='upper left', bbox_to_anchor=(1,1)) + a00 = plt.arrow(0, 0, 0, 0, color="darkblue") + a10 = plt.arrow(0, 0, 0, 0, color="lightblue") + a11 = plt.arrow(0, 0, 0, 0, color="lightgray") + a20 = plt.arrow(0, 0, 0, 0, color="dimgray") + a21 = plt.arrow(0, 0, 0, 0, color="lightgoldenrodyellow") + a30 = plt.arrow(0, 0, 0, 0, color="gold") + a22 = plt.arrow(0, 0, 0, 0, color="navajowhite") + a31 = plt.arrow(0, 0, 0, 0, color="orange") + a40 = plt.arrow(0, 0, 0, 0, color="darkorange") + a32 = plt.arrow(0, 0, 0, 0, color="salmon") + a41 = plt.arrow(0, 0, 0, 0, color="red") + a50 = plt.arrow(0, 0, 0, 0, color="darkred") + a33 = plt.arrow(0, 0, 0, 0, color="plum") + a42 = plt.arrow(0, 0, 0, 0, color="orchid") + a51 = plt.arrow(0, 0, 0, 0, color="purple") + a60 = plt.arrow(0, 0, 0, 0, color="indigo") + ax_handle.legend( + [ + a00, + a10, + a11, + a20, + a21, + a30, + a22, + a31, + a40, + a32, + a41, + a50, + a33, + a42, + a51, + a60, + ], + [ + "(0, 0)", + "(1, 0)", + "(1, 1)", + "(2, 0)", + "(2, 1)", + "(3, 0)", + "(2, 2)", + "(3, 1)", + "(4, 0)", + "(3, 2)", + "(4, 1)", + "(5, 0)", + "(3, 3)", + "(4, 2)", + "(5, 1)", + "(6, 0)", + ], + ncol=2, + loc="upper left", + bbox_to_anchor=(1, 1), + ) return ax_handle -def plot_acn_from_df_anotherscheme(df_cnv, ax_handle, clone_ids=None, clone_names=None, clone_proportions=None, chrbar_pos=None, add_arrow=True, border_linewidth=1, 
chrbar_thickness=0.1, add_legend=True, remove_xticks=True, rasterized=True): +def plot_acn_from_df_anotherscheme( + df_cnv, + ax_handle, + clone_ids=None, + clone_names=None, + clone_proportions=None, + chrbar_pos=None, + add_arrow=True, + border_linewidth=1, + chrbar_thickness=0.1, + add_legend=True, + remove_xticks=True, + rasterized=True, +): # full color palette - palette,_ = get_full_palette() + palette, _ = get_full_palette() # read CN profiles - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) print(final_clone_ids) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) found = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) found += list(zip(major, minor)) found = list(set(found)) found.sort() # map CN to single digit number - map_cn = {x:i for i,x in enumerate(found)} + map_cn = {x: i for i, x in enumerate(found)} cnv_mapped = [] ploidy = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - cnv_mapped.append( [map_cn[(major[i], minor[i])] for i in range(len(major))] ) - ploidy.append( np.mean(major + minor) ) - cnv_mapped = pd.DataFrame( np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids]) - ploidy = pd.DataFrame(np.around(np.array(ploidy), decimals=2).reshape(-1,1), index=[f"clone {cid}" for cid in final_clone_ids]) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + cnv_mapped.append([map_cn[(major[i], minor[i])] for i in range(len(major))]) + ploidy.append(np.mean(major + minor)) + cnv_mapped = pd.DataFrame( + np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids] + ) + ploidy = pd.DataFrame( + np.around(np.array(ploidy), decimals=2).reshape(-1, 1), + index=[f"clone {cid}" for cid in final_clone_ids], + ) chr_ids = df_cnv.CHR colors = [palette[c] for c in found] if clone_ids is None: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in final_clone_ids] - rename_cnv_mapped = pd.DataFrame(cnv_mapped.values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(final_clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(final_clone_ids) + ], + ) if len(np.unique(rename_cnv_mapped.values)) == 1: colors = colors + colors - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=rasterized, ax=ax_handle) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=rasterized, + ax=ax_handle, + ) else: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in clone_ids] if clone_names is None: - 
index_str = [f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)] + index_str = [ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ] else: - index_str = [f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)] + index_str = [ + f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ] if not clone_proportions is None: - index_str = [f"{index_str[c]}\nu={clone_proportions[c]:.2f}" for c in range(len(clone_ids))] - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=index_str) + index_str = [ + f"{index_str[c]}\nu={clone_proportions[c]:.2f}" + for c in range(len(clone_ids)) + ] + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=index_str, + ) if len(np.unique(rename_cnv_mapped.values)) == 1: colors = colors + colors - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=rasterized, ax=ax_handle) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=rasterized, + ax=ax_handle, + ) # indicate allele switches if add_arrow: if clone_ids is None: # find regions where there exist both clones with A > B and clones with A < B - has_up = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values for cid in final_clone_ids]), axis=0) - has_down = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values for cid in final_clone_ids]), axis=0) - intervals, labs = get_intervals( (has_up & has_down) ) + has_up = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values + for cid in final_clone_ids + ] + ), + axis=0, + ) + has_down = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values + for cid in final_clone_ids + ] + ), + axis=0, + ) + intervals, labs = get_intervals((has_up & has_down)) # for each intervals, find the corresponding clones with A > B to plot up-arrow, and corresponding clones with A < B to plot down-arrow for i in range(len(intervals)): if not labs[i]: continue - for c,cid in enumerate(final_clone_ids): + for c, cid in enumerate(final_clone_ids): y1 = c - y2 = c+1 + y2 = c + 1 # up-arrow - y_diverge1 = 0.8*y2+0.2*y1 - y_diverge2 = 0.6*y2+0.4*y1 - y_merge = 0.7*y2+0.3*y1 - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] > df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + y_diverge1 = 0.8 * y2 + 0.2 * y1 + y_diverge2 = 0.6 * y2 + 0.4 * y1 + y_merge = 0.7 * y2 + 0.3 * y1 + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + > df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: # bounding box - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black", linewidth=border_linewidth) + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + linewidth=border_linewidth, + ) # arrow - ax_handle.fill_between( [intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]], [y_diverge1,y_merge], 
[y_diverge2,y_merge], color="black", edgecolor="black") + ax_handle.fill_between( + [ + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ], + [y_diverge1, y_merge], + [y_diverge2, y_merge], + color="black", + edgecolor="black", + ) # down-arrow - y_diverge1 = 0.2*y2+0.8*y1 - y_diverge2 = 0.4*y2+0.6*y1 - y_merge = 0.3*y2+0.7*y1 - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] < df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + y_diverge1 = 0.2 * y2 + 0.8 * y1 + y_diverge2 = 0.4 * y2 + 0.6 * y1 + y_merge = 0.3 * y2 + 0.7 * y1 + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + < df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: # bounding box - ax_handle.fill_between( np.arange(intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]), y1, y2, color="none", edgecolor="black", linewidth=border_linewidth) + ax_handle.fill_between( + np.arange( + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ), + y1, + y2, + color="none", + edgecolor="black", + linewidth=border_linewidth, + ) # arrow - ax_handle.fill_between( [intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]], [y_merge,y_diverge1], [y_merge,y_diverge2], color="black", edgecolor="black") + ax_handle.fill_between( + [ + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ], + [y_merge, y_diverge1], + [y_merge, y_diverge2], + color="black", + edgecolor="black", + ) else: # find regions where there exist both clones with A > B and clones with A < B - has_up = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values for cid in clone_ids]), axis=0) - has_down = np.any(np.vstack([ df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values for cid in clone_ids]), axis=0) - intervals, labs = get_intervals( (has_up & has_down) ) + has_up = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values > df_cnv[f"clone{cid} B"].values + for cid in clone_ids + ] + ), + axis=0, + ) + has_down = np.any( + np.vstack( + [ + df_cnv[f"clone{cid} A"].values < df_cnv[f"clone{cid} B"].values + for cid in clone_ids + ] + ), + axis=0, + ) + intervals, labs = get_intervals((has_up & has_down)) # for each intervals, find the corresponding clones with A > B to plot up-arrow, and corresponding clones with A < B to plot down-arrow for i in range(len(intervals)): if not labs[i]: continue - for c,cid in enumerate(clone_ids): + for c, cid in enumerate(clone_ids): y1 = c - y2 = c+1 + y2 = c + 1 # up-arrow - y_diverge1 = 0.8*y2+0.2*y1 - y_diverge2 = 0.6*y2+0.4*y1 - y_merge = 0.7*y2+0.3*y1 - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] > df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + y_diverge1 = 0.8 * y2 + 0.2 * y1 + y_diverge2 = 0.6 * y2 + 0.4 * y1 + y_merge = 0.7 * y2 + 0.3 * y1 + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + > df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: # bounding box - ax_handle.fill_between( [intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]], y1, y2, color="none", edgecolor="black", linewidth=border_linewidth) + ax_handle.fill_between( + [ + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ], + 
y1, + y2, + color="none", + edgecolor="black", + linewidth=border_linewidth, + ) # arrow - ax_handle.fill_between( [intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]], [y_diverge1,y_merge], [y_diverge2,y_merge], color="black", edgecolor="black") + ax_handle.fill_between( + [ + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ], + [y_diverge1, y_merge], + [y_diverge2, y_merge], + color="black", + edgecolor="black", + ) # down-arrow - y_diverge1 = 0.2*y2+0.8*y1 - y_diverge2 = 0.4*y2+0.6*y1 - y_merge = 0.3*y2+0.7*y1 - sub_intervals, sub_labs = get_intervals( df_cnv[f"clone{cid} A"].values[intervals[i][0]:intervals[i][1]] < df_cnv[f"clone{cid} B"].values[intervals[i][0]:intervals[i][1]] ) + y_diverge1 = 0.2 * y2 + 0.8 * y1 + y_diverge2 = 0.4 * y2 + 0.6 * y1 + y_merge = 0.3 * y2 + 0.7 * y1 + sub_intervals, sub_labs = get_intervals( + df_cnv[f"clone{cid} A"].values[ + intervals[i][0] : intervals[i][1] + ] + < df_cnv[f"clone{cid} B"].values[ + intervals[i][0] : intervals[i][1] + ] + ) for j, sub_int in enumerate(sub_intervals): if sub_labs[j]: # bounding box - ax_handle.fill_between( [intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]], y1, y2, color="none", edgecolor="black", linewidth=border_linewidth) + ax_handle.fill_between( + [ + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ], + y1, + y2, + color="none", + edgecolor="black", + linewidth=border_linewidth, + ) # arrow - ax_handle.fill_between( [intervals[i][0]+sub_int[0], intervals[i][0]+sub_int[1]], [y_merge,y_diverge1], [y_merge,y_diverge2], color="black", edgecolor="black") + ax_handle.fill_between( + [ + intervals[i][0] + sub_int[0], + intervals[i][0] + sub_int[1], + ], + [y_merge, y_diverge1], + [y_merge, y_diverge2], + color="black", + edgecolor="black", + ) # # horizontal separation between clones # for c,cid in enumerate(clone_ids[:-1]): @@ -435,224 +1123,506 @@ def plot_acn_from_df_anotherscheme(df_cnv, ax_handle, clone_ids=None, clone_name h = len(final_clone_ids) if clone_ids is None else len(clone_ids) # ax_handle.add_patch(plt.Rectangle(xy=(0, h + chrbar_thickness), width=df_cnv.shape[0], height=chrbar_thickness, color='white', lw=0, transform=ax_handle.transData, clip_on=False, rasterized=rasterized)) - for i,c in enumerate(np.unique(chr_ids.values)): + for i, c in enumerate(np.unique(chr_ids.values)): interval = np.where(chr_ids.values == c)[0] # add vertical separation between chromosomes if not np.max(interval) + 1 >= df_cnv.shape[0]: - ax_handle.axvline(x=np.max(interval), color='black', lw=0.5, ymin=-0.5/(h+1), clip_on = False) + ax_handle.axvline( + x=np.max(interval), + color="black", + lw=0.5, + ymin=-0.5 / (h + 1), + clip_on=False, + ) mid = np.percentile(interval, 45) if i % 2 == 0: - ax_handle.text(mid, h + chrbar_thickness, str(c), ha='center', transform=ax_handle.transData) + ax_handle.text( + mid, + h + chrbar_thickness, + str(c), + ha="center", + transform=ax_handle.transData, + ) else: - ax_handle.text(mid, h + 2*chrbar_thickness, str(c), ha='center', transform=ax_handle.transData) + ax_handle.text( + mid, + h + 2 * chrbar_thickness, + str(c), + ha="center", + transform=ax_handle.transData, + ) elif chrbar_pos == "top": chr_ids = df_cnv.CHR h = len(final_clone_ids) if clone_ids is None else len(clone_ids) # ax_handle.add_patch(plt.Rectangle(xy=(0, h + chrbar_thickness), width=df_cnv.shape[0], height=chrbar_thickness, color='white', lw=0, transform=ax_handle.transData, clip_on=False, rasterized=rasterized)) - for i,c in enumerate(np.unique(chr_ids.values)): + for i, c in 
enumerate(np.unique(chr_ids.values)): interval = np.where(chr_ids.values == c)[0] # add vertical separation between chromosomes if not np.max(interval) + 1 >= df_cnv.shape[0]: - ax_handle.axvline(x=np.max(interval), color='black', lw=0.5, ymax=1+0.5/(h+1), clip_on = False) + ax_handle.axvline( + x=np.max(interval), + color="black", + lw=0.5, + ymax=1 + 0.5 / (h + 1), + clip_on=False, + ) mid = np.percentile(interval, 45) if i % 2 == 0: - ax_handle.text(mid, -0.1*chrbar_thickness, str(c), ha='center', transform=ax_handle.transData) + ax_handle.text( + mid, + -0.1 * chrbar_thickness, + str(c), + ha="center", + transform=ax_handle.transData, + ) else: - ax_handle.text(mid, -0.8*chrbar_thickness, str(c), ha='center', transform=ax_handle.transData) + ax_handle.text( + mid, + -0.8 * chrbar_thickness, + str(c), + ha="center", + transform=ax_handle.transData, + ) ax_handle.set_yticklabels(ax_handle.get_yticklabels(), rotation=0) if remove_xticks: ax_handle.set_xticks([]) if add_legend: - a00 = plt.arrow(0,0, 0,0, color='darkblue') - a10 = plt.arrow(0,0, 0,0, color='lightblue') - a11 = plt.arrow(0,0, 0,0, color='lightgray') - a20 = plt.arrow(0,0, 0,0, color='dimgray') - a21 = plt.arrow(0,0, 0,0, color='lightgoldenrodyellow') - a30 = plt.arrow(0,0, 0,0, color='gold') - a22 = plt.arrow(0,0, 0,0, color='navajowhite') - a31 = plt.arrow(0,0, 0,0, color='orange') - a40 = plt.arrow(0,0, 0,0, color='darkorange') - a32 = plt.arrow(0,0, 0,0, color='salmon') - a41 = plt.arrow(0,0, 0,0, color='red') - a50 = plt.arrow(0,0, 0,0, color='darkred') - a33 = plt.arrow(0,0, 0,0, color='plum') - a42 = plt.arrow(0,0, 0,0, color='orchid') - a51 = plt.arrow(0,0, 0,0, color='purple') - a60 = plt.arrow(0,0, 0,0, color='indigo') - ax_handle.legend([a00, a10, a11, a20, a21, a30, a22, a31, a40, a32, a41, a50, a33, a42, a51, a60], \ - ['(0, 0)','(1, 0)','(1, 1)','(2, 0)', '(2, 1)','(3, 0)', '(2, 2)','(3, 1)','(4, 0)','(3, 2)', \ - '(4, 1)','(5, 0)', '(3, 3)','(4, 2)','(5, 1)','(6, 0)'], ncol=2, loc='upper left', bbox_to_anchor=(1,1)) + a00 = plt.arrow(0, 0, 0, 0, color="darkblue") + a10 = plt.arrow(0, 0, 0, 0, color="lightblue") + a11 = plt.arrow(0, 0, 0, 0, color="lightgray") + a20 = plt.arrow(0, 0, 0, 0, color="dimgray") + a21 = plt.arrow(0, 0, 0, 0, color="lightgoldenrodyellow") + a30 = plt.arrow(0, 0, 0, 0, color="gold") + a22 = plt.arrow(0, 0, 0, 0, color="navajowhite") + a31 = plt.arrow(0, 0, 0, 0, color="orange") + a40 = plt.arrow(0, 0, 0, 0, color="darkorange") + a32 = plt.arrow(0, 0, 0, 0, color="salmon") + a41 = plt.arrow(0, 0, 0, 0, color="red") + a50 = plt.arrow(0, 0, 0, 0, color="darkred") + a33 = plt.arrow(0, 0, 0, 0, color="plum") + a42 = plt.arrow(0, 0, 0, 0, color="orchid") + a51 = plt.arrow(0, 0, 0, 0, color="purple") + a60 = plt.arrow(0, 0, 0, 0, color="indigo") + ax_handle.legend( + [ + a00, + a10, + a11, + a20, + a21, + a30, + a22, + a31, + a40, + a32, + a41, + a50, + a33, + a42, + a51, + a60, + ], + [ + "(0, 0)", + "(1, 0)", + "(1, 1)", + "(2, 0)", + "(2, 1)", + "(3, 0)", + "(2, 2)", + "(3, 1)", + "(4, 0)", + "(3, 2)", + "(4, 1)", + "(5, 0)", + "(3, 3)", + "(4, 2)", + "(5, 1)", + "(6, 0)", + ], + ncol=2, + loc="upper left", + bbox_to_anchor=(1, 1), + ) return ax_handle - def plot_acn_legend(fig, shift_y=0.3): # full palette palette, ordered_acn = get_full_palette() - map_cn = {x:i for i,x in enumerate(ordered_acn)} + map_cn = {x: i for i, x in enumerate(ordered_acn)} colors = [palette[c] for c in ordered_acn] - cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)) + cmap = 
LinearSegmentedColormap.from_list("multi-level", colors, len(colors)) - n_total_cn = np.max([x[0]+x[1] for x in ordered_acn]) + 1 - gs = GridSpec(2*n_total_cn-1, 1, figure=fig) + n_total_cn = np.max([x[0] + x[1] for x in ordered_acn]) + 1 + gs = GridSpec(2 * n_total_cn - 1, 1, figure=fig) # total cn = 0 - ax = fig.add_subplot(gs[2*n_total_cn-2, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(0,0)]]).reshape((1,-1)), columns=["{0,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 2, :]) + seaborn.heatmap( + pd.DataFrame(np.array([map_cn[(0, 0)]]).reshape((1, -1)), columns=["{0,0}"]), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - ax.set_xticklabels(ax.get_xticklabels(), position=(0,shift_y)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, shift_y)) # total cn = 1 - ax = fig.add_subplot(gs[2*n_total_cn-4, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(1,0)]]).reshape((1,-1)), columns=["{1,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 4, :]) + seaborn.heatmap( + pd.DataFrame(np.array([map_cn[(1, 0)]]).reshape((1, -1)), columns=["{1,0}"]), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - ax.set_xticklabels(ax.get_xticklabels(), position=(0,shift_y)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, shift_y)) # total cn = 2 - ax = fig.add_subplot(gs[2*n_total_cn-6, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(1,1)], map_cn[(2,0)]]).reshape((1,-1)), columns=["{1,1}", "{2,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 6, :]) + seaborn.heatmap( + pd.DataFrame( + np.array([map_cn[(1, 1)], map_cn[(2, 0)]]).reshape((1, -1)), + columns=["{1,1}", "{2,0}"], + ), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - ax.set_xticklabels(ax.get_xticklabels(), position=(0,0.3)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, 0.3)) # total cn = 3 - ax = fig.add_subplot(gs[2*n_total_cn-8, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(2,1)], map_cn[(3,0)]]).reshape((1,-1)), columns=["{2,1}", "{3,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 8, :]) + seaborn.heatmap( + pd.DataFrame( + np.array([map_cn[(2, 1)], map_cn[(3, 0)]]).reshape((1, -1)), + columns=["{2,1}", "{3,0}"], + ), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - ax.set_xticklabels(ax.get_xticklabels(), position=(0,shift_y)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, shift_y)) # total cn = 4 - ax = fig.add_subplot(gs[2*n_total_cn-10, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(2,2)], map_cn[(3,1)], map_cn[(4,0)]]).reshape((1,-1)), columns=["{2,2}", "{3,1}", "{4,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 10, :]) + seaborn.heatmap( + pd.DataFrame( + np.array([map_cn[(2, 2)], map_cn[(3, 1)], map_cn[(4, 0)]]).reshape((1, -1)), + columns=["{2,2}", "{3,1}", "{4,0}"], + ), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - 
ax.set_xticklabels(ax.get_xticklabels(), position=(0,shift_y)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, shift_y)) # total cn = 5 - ax = fig.add_subplot(gs[2*n_total_cn-12, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(3,2)], map_cn[(4,1)], map_cn[(5,0)]]).reshape((1,-1)), columns=["{3,2}", "{4,1}", "{5,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 12, :]) + seaborn.heatmap( + pd.DataFrame( + np.array([map_cn[(3, 2)], map_cn[(4, 1)], map_cn[(5, 0)]]).reshape((1, -1)), + columns=["{3,2}", "{4,1}", "{5,0}"], + ), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - ax.set_xticklabels(ax.get_xticklabels(), position=(0,shift_y)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, shift_y)) # total cn = 6 - ax = fig.add_subplot(gs[2*n_total_cn-14, :]) - seaborn.heatmap( pd.DataFrame(np.array([map_cn[(3,3)], map_cn[(4,2)], map_cn[(5,1)], map_cn[(6,0)]]).reshape((1,-1)), columns=["{3,3}", "{4,2}", "{5,1}", "{6,0}"]), vmin=0, vmax=len(colors), cmap=cmap, cbar=False, linewidths=1, linecolor="black" ) + ax = fig.add_subplot(gs[2 * n_total_cn - 14, :]) + seaborn.heatmap( + pd.DataFrame( + np.array( + [map_cn[(3, 3)], map_cn[(4, 2)], map_cn[(5, 1)], map_cn[(6, 0)]] + ).reshape((1, -1)), + columns=["{3,3}", "{4,2}", "{5,1}", "{6,0}"], + ), + vmin=0, + vmax=len(colors), + cmap=cmap, + cbar=False, + linewidths=1, + linecolor="black", + ) ax.set_yticks([]) - ax.set_xticklabels(ax.get_xticklabels(), position=(0,shift_y)) + ax.set_xticklabels(ax.get_xticklabels(), position=(0, shift_y)) return fig -def plot_acn_withhighlight(cn_file, df_highlight_events, ax_handle, clone_ids=None, clone_names=None, add_chrbar=True, chrbar_thickness=0.1, add_legend=True, remove_xticks=True): +def plot_acn_withhighlight( + cn_file, + df_highlight_events, + ax_handle, + clone_ids=None, + clone_names=None, + add_chrbar=True, + chrbar_thickness=0.1, + add_legend=True, + remove_xticks=True, +): """ df_highlight_events: dataframe with columns: ["BinSTART", "BinEND", "involved_clones"] """ # full color palette - palette,_ = get_full_palette() + palette, _ = get_full_palette() # read CN profiles df_cnv = pd.read_csv(cn_file, header=0, sep="\t") - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) print(final_clone_ids) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) found = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) found += list(zip(major, minor)) found = list(set(found)) found.sort() # map CN to single digit number - map_cn = {x:i for i,x in enumerate(found)} + map_cn = {x: i for i, x in enumerate(found)} cnv_mapped = [] ploidy = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - cnv_mapped.append( [map_cn[(major[i], minor[i])] for 
i in range(len(major))] ) - ploidy.append( np.mean(major + minor) ) - cnv_mapped = pd.DataFrame( np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids]) - ploidy = pd.DataFrame(np.around(np.array(ploidy), decimals=2).reshape(-1,1), index=[f"clone {cid}" for cid in final_clone_ids]) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + cnv_mapped.append([map_cn[(major[i], minor[i])] for i in range(len(major))]) + ploidy.append(np.mean(major + minor)) + cnv_mapped = pd.DataFrame( + np.array(cnv_mapped), index=[f"clone {cid}" for cid in final_clone_ids] + ) + ploidy = pd.DataFrame( + np.around(np.array(ploidy), decimals=2).reshape(-1, 1), + index=[f"clone {cid}" for cid in final_clone_ids], + ) chr_ids = df_cnv.CHR colors = [palette[c] for c in found] if clone_ids is None: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in final_clone_ids] - rename_cnv_mapped = pd.DataFrame(cnv_mapped.values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(final_clone_ids)]) - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(final_clone_ids) + ], + ) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) else: tmp_ploidy = [ploidy.loc[f"clone {cid}"].values[0] for cid in clone_ids] if clone_names is None: - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"clone {cid}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[ + f"clone {cid}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ], + ) else: - rename_cnv_mapped = pd.DataFrame(cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" for c,cid in enumerate(clone_ids)]) - seaborn.heatmap(rename_cnv_mapped, cmap=LinearSegmentedColormap.from_list('multi-level', colors, len(colors)), linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + rename_cnv_mapped = pd.DataFrame( + cnv_mapped.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[ + f"{clone_names[c]}\nploidy {tmp_ploidy[c]}" + for c, cid in enumerate(clone_ids) + ], + ) + seaborn.heatmap( + rename_cnv_mapped, + cmap=LinearSegmentedColormap.from_list("multi-level", colors, len(colors)), + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) for i in range(df_highlight_events.shape[0]): involved_clones = df_highlight_events.involved_clones.values[i] # interval start and end - interval = [df_highlight_events.BinSTART.values[i], df_highlight_events.BinEND.values[i]] + interval = [ + df_highlight_events.BinSTART.values[i], + df_highlight_events.BinEND.values[i], + ] if clone_ids is None: for c, cid in enumerate(final_clone_ids): if not cid in involved_clones: continue y1 = c - y2 = c+1 - ax_handle.fill_between( np.arange(interval[0], interval[1]), y1, y2, color="none", edgecolor="black", linewidth=2) + y2 = c + 1 + ax_handle.fill_between( + np.arange(interval[0], interval[1]), + y1, + y2, + 
color="none", + edgecolor="black", + linewidth=2, + ) else: for c, cid in enumerate(clone_ids): if not cid in involved_clones: continue y1 = c - y2 = c+1 - ax_handle.fill_between( np.arange(interval[0], interval[1]), y1, y2, color="none", edgecolor="black", linewidth=2) - + y2 = c + 1 + ax_handle.fill_between( + np.arange(interval[0], interval[1]), + y1, + y2, + color="none", + edgecolor="black", + linewidth=2, + ) + if add_chrbar: # add chr color - chr_palette = cycle(['#525252', '#969696', '#cccccc']) - lut = {c:next(chr_palette) for c in np.unique(chr_ids.values)} + chr_palette = cycle(["#525252", "#969696", "#cccccc"]) + lut = {c: next(chr_palette) for c in np.unique(chr_ids.values)} col_colors = chr_ids.map(lut) for i, color in enumerate(col_colors): - ax_handle.add_patch(plt.Rectangle(xy=(i, 1.01), width=1, height=chrbar_thickness, color=color, lw=0, transform=ax_handle.get_xaxis_transform(), clip_on=False, rasterized=True)) + ax_handle.add_patch( + plt.Rectangle( + xy=(i, 1.01), + width=1, + height=chrbar_thickness, + color=color, + lw=0, + transform=ax_handle.get_xaxis_transform(), + clip_on=False, + rasterized=True, + ) + ) for c in np.unique(chr_ids.values): interval = np.where(chr_ids.values == c)[0] mid = np.percentile(interval, 45) - ax_handle.text(mid-10, 1.04, str(c), transform=ax_handle.get_xaxis_transform()) + ax_handle.text( + mid - 10, 1.04, str(c), transform=ax_handle.get_xaxis_transform() + ) ax_handle.set_yticklabels(ax_handle.get_yticklabels(), rotation=0) if remove_xticks: ax_handle.set_xticks([]) if add_legend: - a00 = plt.arrow(0,0, 0,0, - color='darkblue') - a10 = plt.arrow(0,0, 0,0, color='lightblue') - a11 = plt.arrow(0,0, 0,0, color='lightgray') - a20 = plt.arrow(0,0, 0,0, color='dimgray') - a21 = plt.arrow(0,0, 0,0, color='lightgoldenrodyellow') - a30 = plt.arrow(0,0, 0,0, color='gold') - a22 = plt.arrow(0,0, 0,0, color='navajowhite') - a31 = plt.arrow(0,0, 0,0, color='orange') - a40 = plt.arrow(0,0, 0,0, color='darkorange') - a32 = plt.arrow(0,0, 0,0, color='salmon') - a41 = plt.arrow(0,0, 0,0, color='red') - a50 = plt.arrow(0,0, 0,0, color='darkred') - a33 = plt.arrow(0,0, 0,0, color='plum') - a42 = plt.arrow(0,0, 0,0, color='orchid') - a51 = plt.arrow(0,0, 0,0, color='purple') - a60 = plt.arrow(0,0, 0,0, color='indigo') - ax_handle.legend([a00, a10, a11, a20, a21, a30, a22, a31, a40, a32, a41, a50, a33, a42, a51, a60], \ - ['(0, 0)','(1, 0)','(1, 1)','(2, 0)', '(2, 1)','(3, 0)', '(2, 2)','(3, 1)','(4, 0)','(3, 2)', \ - '(4, 1)','(5, 0)', '(3, 3)','(4, 2)','(5, 1)','(6, 0)'], ncol=2, loc='upper left', bbox_to_anchor=(1,1 - 0.1 * min(0, rename_cnv_mapped.shape[0]-6))) + a00 = plt.arrow(0, 0, 0, 0, color="darkblue") + a10 = plt.arrow(0, 0, 0, 0, color="lightblue") + a11 = plt.arrow(0, 0, 0, 0, color="lightgray") + a20 = plt.arrow(0, 0, 0, 0, color="dimgray") + a21 = plt.arrow(0, 0, 0, 0, color="lightgoldenrodyellow") + a30 = plt.arrow(0, 0, 0, 0, color="gold") + a22 = plt.arrow(0, 0, 0, 0, color="navajowhite") + a31 = plt.arrow(0, 0, 0, 0, color="orange") + a40 = plt.arrow(0, 0, 0, 0, color="darkorange") + a32 = plt.arrow(0, 0, 0, 0, color="salmon") + a41 = plt.arrow(0, 0, 0, 0, color="red") + a50 = plt.arrow(0, 0, 0, 0, color="darkred") + a33 = plt.arrow(0, 0, 0, 0, color="plum") + a42 = plt.arrow(0, 0, 0, 0, color="orchid") + a51 = plt.arrow(0, 0, 0, 0, color="purple") + a60 = plt.arrow(0, 0, 0, 0, color="indigo") + ax_handle.legend( + [ + a00, + a10, + a11, + a20, + a21, + a30, + a22, + a31, + a40, + a32, + a41, + a50, + a33, + a42, + a51, + a60, + ], + [ 
+ "(0, 0)", + "(1, 0)", + "(1, 1)", + "(2, 0)", + "(2, 1)", + "(3, 0)", + "(2, 2)", + "(3, 1)", + "(4, 0)", + "(3, 2)", + "(4, 1)", + "(5, 0)", + "(3, 3)", + "(4, 2)", + "(5, 1)", + "(6, 0)", + ], + ncol=2, + loc="upper left", + bbox_to_anchor=(1, 1 - 0.1 * min(0, rename_cnv_mapped.shape[0] - 6)), + ) return ax_handle -def plot_total_cn(df_cnv, ax_handle, df_highlight_events=None, palette_mode=6, clone_ids=None, clone_names=None, add_chrbar=True, chrbar_thickness=0.1, add_legend=True, legend_position="upper left", remove_xticks=True): +def plot_total_cn( + df_cnv, + ax_handle, + df_highlight_events=None, + palette_mode=6, + clone_ids=None, + clone_names=None, + add_chrbar=True, + chrbar_thickness=0.1, + add_legend=True, + legend_position="upper left", + remove_xticks=True, +): """ df_cnv : pandas.DataFrame Each row is a genomic bin, containing columns "CHR", "clone {cid}" for each clone id. @@ -663,60 +1633,127 @@ def plot_total_cn(df_cnv, ax_handle, df_highlight_events=None, palette_mode=6, c # create a cmap that map "amp" to #B44F3D, "bamp" to #E18073, "bdel" to #A0CEEA, "del" to #4F69DF, "loh" to #738B2D if palette_mode == 6: - full_palette = {"amp":"#B44F3D", "bamp":"#E18073", "bdel":"#A0CEEA", "del":"#4F69DF", "loh":"#738B2D", "neu":"lightgrey"} + full_palette = { + "amp": "#B44F3D", + "bamp": "#E18073", + "bdel": "#A0CEEA", + "del": "#4F69DF", + "loh": "#738B2D", + "neu": "lightgrey", + } else: - full_palette = {"amp":"#B44F3D", "del":"#4F69DF", "neu":"lightgrey"} + full_palette = {"amp": "#B44F3D", "del": "#4F69DF", "neu": "lightgrey"} if clone_ids is None: - found = np.unique(df_cnv.iloc[:, df_cnv.columns.str.startswith("clone")].values.flatten()) - lut = {x:i for i,x in enumerate(found)} + found = np.unique( + df_cnv.iloc[:, df_cnv.columns.str.startswith("clone")].values.flatten() + ) + lut = {x: i for i, x in enumerate(found)} palette = matplotlib.colors.ListedColormap([full_palette[x] for x in found]) - df_cnv_mapped = df_cnv.iloc[:, df_cnv.columns.str.startswith("clone")].replace(lut) + df_cnv_mapped = df_cnv.iloc[:, df_cnv.columns.str.startswith("clone")].replace( + lut + ) df_cnv_mapped = df_cnv_mapped.T - seaborn.heatmap(df_cnv_mapped, cmap=palette, linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + seaborn.heatmap( + df_cnv_mapped, + cmap=palette, + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) else: - found = np.unique(df_cnv[[f"clone {cid}" for cid in clone_ids]].values.flatten()) - lut = {x:i for i,x in enumerate(found)} + found = np.unique( + df_cnv[[f"clone {cid}" for cid in clone_ids]].values.flatten() + ) + lut = {x: i for i, x in enumerate(found)} palette = matplotlib.colors.ListedColormap([full_palette[x] for x in found]) df_cnv_mapped = df_cnv[[f"clone {cid}" for cid in clone_ids]].replace(lut) df_cnv_mapped = df_cnv_mapped.T if not clone_names is None: - df_cnv_mapped.rename(index={f"clone {cid}":clone_names[i] for i,cid in enumerate(clone_ids)}, inplace=True) - seaborn.heatmap(df_cnv_mapped, cmap=palette, linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + df_cnv_mapped.rename( + index={ + f"clone {cid}": clone_names[i] for i, cid in enumerate(clone_ids) + }, + inplace=True, + ) + seaborn.heatmap( + df_cnv_mapped, + cmap=palette, + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) if not df_highlight_events is None: - final_clone_ids = [x.split(" ")[1] for x in df_cnv.columns if x.startswith("clone")] + final_clone_ids = [ + x.split(" ")[1] for x in df_cnv.columns if x.startswith("clone") + ] for i in 
range(df_highlight_events.shape[0]): involved_clones = df_highlight_events.involved_clones.values[i] # interval start and end - interval = [df_highlight_events.BinSTART.values[i], df_highlight_events.BinEND.values[i]] + interval = [ + df_highlight_events.BinSTART.values[i], + df_highlight_events.BinEND.values[i], + ] if clone_ids is None: for c, cid in enumerate(final_clone_ids): if not cid in involved_clones: continue y1 = c - y2 = c+1 - ax_handle.fill_between( np.arange(interval[0], interval[1]), y1, y2, color="none", edgecolor="black", linewidth=2) + y2 = c + 1 + ax_handle.fill_between( + np.arange(interval[0], interval[1]), + y1, + y2, + color="none", + edgecolor="black", + linewidth=2, + ) else: for c, cid in enumerate(clone_ids): if not cid in involved_clones: continue y1 = c - y2 = c+1 - ax_handle.fill_between( np.arange(interval[0], interval[1]), y1, y2, color="none", edgecolor="black", linewidth=2) + y2 = c + 1 + ax_handle.fill_between( + np.arange(interval[0], interval[1]), + y1, + y2, + color="none", + edgecolor="black", + linewidth=2, + ) if add_chrbar: # add chr color - chr_palette = cycle(['#525252', '#969696', '#cccccc']) - lut = {c:next(chr_palette) for c in np.unique(chr_ids.values)} + chr_palette = cycle(["#525252", "#969696", "#cccccc"]) + lut = {c: next(chr_palette) for c in np.unique(chr_ids.values)} col_colors = chr_ids.map(lut) for i, color in enumerate(col_colors): - ax_handle.add_patch(plt.Rectangle(xy=(i, 1 + 0.02*chrbar_thickness), width=1, height=chrbar_thickness, color=color, lw=0, transform=ax_handle.get_xaxis_transform(), clip_on=False, rasterized=True)) + ax_handle.add_patch( + plt.Rectangle( + xy=(i, 1 + 0.02 * chrbar_thickness), + width=1, + height=chrbar_thickness, + color=color, + lw=0, + transform=ax_handle.get_xaxis_transform(), + clip_on=False, + rasterized=True, + ) + ) for c in np.unique(chr_ids.values): interval = np.where(chr_ids.values == c)[0] mid = np.percentile(interval, 45) - ax_handle.text(mid-10, 1 + 0.2*chrbar_thickness, str(c), transform=ax_handle.get_xaxis_transform()) + ax_handle.text( + mid - 10, + 1 + 0.2 * chrbar_thickness, + str(c), + transform=ax_handle.get_xaxis_transform(), + ) ax_handle.set_yticklabels(ax_handle.get_yticklabels(), rotation=0) if remove_xticks: @@ -724,73 +1761,150 @@ def plot_total_cn(df_cnv, ax_handle, df_highlight_events=None, palette_mode=6, c if add_legend: if palette_mode == 6: - a0 = plt.arrow(0,0, 0,0, color='#B44F3D') - a1 = plt.arrow(0,0, 0,0, color='#E18073') - a2 = plt.arrow(0,0, 0,0, color='lightgrey') - a3 = plt.arrow(0,0, 0,0, color='#A0CEEA') - a4 = plt.arrow(0,0, 0,0, color='#4F69DF') - a5 = plt.arrow(0,0, 0,0, color='#738B2D') + a0 = plt.arrow(0, 0, 0, 0, color="#B44F3D") + a1 = plt.arrow(0, 0, 0, 0, color="#E18073") + a2 = plt.arrow(0, 0, 0, 0, color="lightgrey") + a3 = plt.arrow(0, 0, 0, 0, color="#A0CEEA") + a4 = plt.arrow(0, 0, 0, 0, color="#4F69DF") + a5 = plt.arrow(0, 0, 0, 0, color="#738B2D") if legend_position == "upper left": - ax_handle.legend([a0, a1, a2, a3, a4, a5], ["amp", "bamp", "neu", "bdel", "del", "loh"], loc='upper left', bbox_to_anchor=(1,1 - 0.1 * min(0, df_cnv_mapped.shape[0]-5))) + ax_handle.legend( + [a0, a1, a2, a3, a4, a5], + ["amp", "bamp", "neu", "bdel", "del", "loh"], + loc="upper left", + bbox_to_anchor=(1, 1 - 0.1 * min(0, df_cnv_mapped.shape[0] - 5)), + ) else: - ax_handle.legend([a0, a1, a2, a3, a4, a5], ["amp", "bamp", "neu", "bdel", "del", "loh"], loc='lower center', bbox_to_anchor=(0.5, -0.25), ncol=6) + ax_handle.legend( + [a0, a1, a2, a3, a4, a5], + ["amp", 
"bamp", "neu", "bdel", "del", "loh"], + loc="lower center", + bbox_to_anchor=(0.5, -0.25), + ncol=6, + ) else: - a0 = plt.arrow(0,0, 0,0, color='#B44F3D') - a1 = plt.arrow(0,0, 0,0, color='lightgrey') - a2 = plt.arrow(0,0, 0,0, color='#4F69DF') + a0 = plt.arrow(0, 0, 0, 0, color="#B44F3D") + a1 = plt.arrow(0, 0, 0, 0, color="lightgrey") + a2 = plt.arrow(0, 0, 0, 0, color="#4F69DF") if legend_position == "upper left": - ax_handle.legend([a0, a1, a2], ["amp", "neu", "del"], loc='upper left', bbox_to_anchor=(1,1 - 0.1 * min(0, df_cnv_mapped.shape[0]-2))) + ax_handle.legend( + [a0, a1, a2], + ["amp", "neu", "del"], + loc="upper left", + bbox_to_anchor=(1, 1 - 0.1 * min(0, df_cnv_mapped.shape[0] - 2)), + ) else: - ax_handle.legend([a0, a1, a2], ["amp", "neu", "del"], loc='lower center', bbox_to_anchor=(0.5, -0.25), ncol=3) + ax_handle.legend( + [a0, a1, a2], + ["amp", "neu", "del"], + loc="lower center", + bbox_to_anchor=(0.5, -0.25), + ncol=3, + ) return ax_handle -def plot_amp_del(cn_file, ax_handle, clone_ids=None, clone_names=None, add_chrbar=True, chrbar_thickness=0.1, add_legend=True, remove_xticks=True): +def plot_amp_del( + cn_file, + ax_handle, + clone_ids=None, + clone_names=None, + add_chrbar=True, + chrbar_thickness=0.1, + add_legend=True, + remove_xticks=True, +): # define color palette that maps 0 to lightgrey, -2 and -1 to blues with increasing intensity, and 1 and 2 to reds with increasing intensity - palette_map = {-2+i:x for i,x in enumerate(seaborn.color_palette("coolwarm", 5).as_hex())} - + palette_map = { + -2 + i: x for i, x in enumerate(seaborn.color_palette("coolwarm", 5).as_hex()) + } + # read CN profiles df_cnv = pd.read_csv(cn_file, header=0, sep="\t") - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) # compute the relative copy number with respect to the median copy number per clone df_cnv_rel = [] for cid in final_clone_ids: - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) median_copy = np.median(major + minor) # clamp the relative copy number major + minor - median_copy to [-2,2] - df_cnv_rel.append( np.minimum(2, np.maximum(-2, major + minor - median_copy)) ) - df_cnv_rel = pd.DataFrame( np.array(df_cnv_rel), index=[f"clone {cid}" for cid in final_clone_ids]) + df_cnv_rel.append(np.minimum(2, np.maximum(-2, major + minor - median_copy))) + df_cnv_rel = pd.DataFrame( + np.array(df_cnv_rel), index=[f"clone {cid}" for cid in final_clone_ids] + ) # plot heatmap if clone_ids is None: - rename_cnv_mapped = pd.DataFrame(df_cnv_rel.values, index=[f"clone {cid}" for c,cid in enumerate(final_clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + df_cnv_rel.values, + index=[f"clone {cid}" for c, cid in enumerate(final_clone_ids)], + ) unique_cnv_values = np.unique(rename_cnv_mapped.values) - seaborn.heatmap(rename_cnv_mapped, cmap=ListedColormap([palette_map[x] for x in unique_cnv_values]), linewidths=0, cbar=False, rasterized=True, ax=ax_handle) + seaborn.heatmap( + rename_cnv_mapped, + 
cmap=ListedColormap([palette_map[x] for x in unique_cnv_values]), + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) else: if clone_names is None: - rename_cnv_mapped = pd.DataFrame(df_cnv_rel.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"clone {cid}" for c,cid in enumerate(clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + df_cnv_rel.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[f"clone {cid}" for c, cid in enumerate(clone_ids)], + ) else: - rename_cnv_mapped = pd.DataFrame(df_cnv_rel.loc[[f"clone {cid}" for cid in clone_ids]].values, index=[f"{clone_names[c]}" for c,cid in enumerate(clone_ids)]) + rename_cnv_mapped = pd.DataFrame( + df_cnv_rel.loc[[f"clone {cid}" for cid in clone_ids]].values, + index=[f"{clone_names[c]}" for c, cid in enumerate(clone_ids)], + ) unique_cnv_values = np.unique(rename_cnv_mapped.values) - seaborn.heatmap(rename_cnv_mapped, cmap=ListedColormap([palette_map[x] for x in unique_cnv_values]), linewidths=0, cbar=False, rasterized=True, ax=ax_handle) - + seaborn.heatmap( + rename_cnv_mapped, + cmap=ListedColormap([palette_map[x] for x in unique_cnv_values]), + linewidths=0, + cbar=False, + rasterized=True, + ax=ax_handle, + ) + if add_chrbar: chr_ids = df_cnv.CHR # add chr color - chr_palette = cycle(['#525252', '#969696', '#cccccc']) - lut = {c:next(chr_palette) for c in np.unique(chr_ids.values)} + chr_palette = cycle(["#525252", "#969696", "#cccccc"]) + lut = {c: next(chr_palette) for c in np.unique(chr_ids.values)} col_colors = chr_ids.map(lut) for i, color in enumerate(col_colors): - ax_handle.add_patch(plt.Rectangle(xy=(i, 1.01), width=1, height=chrbar_thickness, color=color, lw=0, transform=ax_handle.get_xaxis_transform(), clip_on=False, rasterized=True)) + ax_handle.add_patch( + plt.Rectangle( + xy=(i, 1.01), + width=1, + height=chrbar_thickness, + color=color, + lw=0, + transform=ax_handle.get_xaxis_transform(), + clip_on=False, + rasterized=True, + ) + ) for c in np.unique(chr_ids.values): interval = np.where(chr_ids.values == c)[0] mid = np.percentile(interval, 45) - ax_handle.text(mid-10, 1.04, str(c), transform=ax_handle.get_xaxis_transform()) + ax_handle.text( + mid - 10, 1.04, str(c), transform=ax_handle.get_xaxis_transform() + ) ax_handle.set_yticklabels(ax_handle.get_yticklabels(), rotation=0) if remove_xticks: @@ -798,34 +1912,54 @@ def plot_amp_del(cn_file, ax_handle, clone_ids=None, clone_names=None, add_chrba # add legend corresponding to palette if add_legend: - a0 = plt.arrow(0,0, 0,0, color=palette_map[-2]) - a1 = plt.arrow(0,0, 0,0, color=palette_map[-1]) - a2 = plt.arrow(0,0, 0,0, color=palette_map[0]) - a3 = plt.arrow(0,0, 0,0, color=palette_map[1]) - a4 = plt.arrow(0,0, 0,0, color=palette_map[2]) - ax_handle.legend([a0, a1, a2, a3, a4], ['-2 and below','-1','0','1', '2 and above'], ncol=1, loc='upper left', bbox_to_anchor=(1,1)) + a0 = plt.arrow(0, 0, 0, 0, color=palette_map[-2]) + a1 = plt.arrow(0, 0, 0, 0, color=palette_map[-1]) + a2 = plt.arrow(0, 0, 0, 0, color=palette_map[0]) + a3 = plt.arrow(0, 0, 0, 0, color=palette_map[1]) + a4 = plt.arrow(0, 0, 0, 0, color=palette_map[2]) + ax_handle.legend( + [a0, a1, a2, a3, a4], + ["-2 and below", "-1", "0", "1", "2 and above"], + ncol=1, + loc="upper left", + bbox_to_anchor=(1, 1), + ) return ax_handle - -def plot_rdr_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, clone_names=None, remove_xticks=True, rdr_ylim=5, chrtext_shift=-0.3, base_height=3.2, pointsize=15, linewidth=1, palette="chisel"): +def plot_rdr_baf( + 
configuration_file, + r_hmrf_initialization, + cn_file, + clone_ids=None, + clone_names=None, + remove_xticks=True, + rdr_ylim=5, + chrtext_shift=-0.3, + base_height=3.2, + pointsize=15, + linewidth=1, + palette="chisel", +): # full palette chisel_palette, ordered_acn = get_full_palette() - map_cn = {x:i for i,x in enumerate(ordered_acn)} + map_cn = {x: i for i, x in enumerate(ordered_acn)} colors = [chisel_palette[c] for c in ordered_acn] try: config = read_configuration_file(configuration_file) except: config = read_joint_configuration_file(configuration_file) - + # load allele specific integer copy numbers df_cnv = pd.read_csv(cn_file, header=0, sep="\t") - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) - if not '0' in final_clone_ids: - final_clone_ids = np.array(['0'] + list(final_clone_ids)) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) + if not "0" in final_clone_ids: + final_clone_ids = np.array(["0"] + list(final_clone_ids)) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) unique_chrs = np.unique(df_cnv.CHR.values) # load data @@ -836,144 +1970,362 @@ def plot_rdr_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=N single_base_nb_mean = dat["single_base_nb_mean"] single_total_bb_RD = dat["single_total_bb_RD"] single_tumor_prop = dat["single_tumor_prop"] - res_combine = dict( np.load(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True) ) + res_combine = dict( + np.load( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) n_states = res_combine["new_p_binom"].shape[0] assert single_X.shape[0] == df_cnv.shape[0] - clone_index = [np.where(res_combine["new_assignment"] == c)[0] for c,cid in enumerate(final_clone_ids)] + clone_index = [ + np.where(res_combine["new_assignment"] == c)[0] + for c, cid in enumerate(final_clone_ids) + ] if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, clone_index, single_tumor_prop) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + ) n_obs = X.shape[0] nonempty_clones = np.where(np.sum(total_bb_RD, axis=0) > 0)[0] # plotting all clones if clone_ids is None: - fig, axes = plt.subplots(2*len(nonempty_clones), 1, figsize=(20, base_height*len(nonempty_clones)), dpi=200, facecolor="white") - for s,c in enumerate(nonempty_clones): + fig, axes = plt.subplots( + 2 * len(nonempty_clones), + 1, + figsize=(20, base_height * len(nonempty_clones)), + dpi=200, + facecolor="white", + ) + for s, c in enumerate(nonempty_clones): cid = final_clone_ids[c] # major and minor allele copies give the hue - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + 
df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) # plot points - segments, labs = get_intervals(res_combine["pred_cnv"][:,c]) + segments, labs = get_intervals(res_combine["pred_cnv"][:, c]) if palette == "chisel": - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,0,c]/base_nb_mean[:,c], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", linewidth=linewidth, alpha=1, legend=False, ax=axes[2*s]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 0, c] / base_nb_mean[:, c], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=1, + legend=False, + ax=axes[2 * s], + ) else: - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,0,c]/base_nb_mean[:,c], \ - hue=pd.Categorical(res_combine["pred_cnv"][:,c], categories=np.arange(n_states), ordered=True), \ - palette=palette, s=pointsize, edgecolor="black", linewidth=linewidth, alpha=1, legend=False, ax=axes[2*s]) - axes[2*s].set_ylabel(f"clone {cid}\nRDR") - axes[2*s].set_yticks(np.arange(1, rdr_ylim, 1)) - axes[2*s].set_ylim([0,rdr_ylim]) - axes[2*s].set_xlim([0, n_obs]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 0, c] / base_nb_mean[:, c], + hue=pd.Categorical( + res_combine["pred_cnv"][:, c], + categories=np.arange(n_states), + ordered=True, + ), + palette=palette, + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=1, + legend=False, + ax=axes[2 * s], + ) + axes[2 * s].set_ylabel(f"clone {cid}\nRDR") + axes[2 * s].set_yticks(np.arange(1, rdr_ylim, 1)) + axes[2 * s].set_ylim([0, rdr_ylim]) + axes[2 * s].set_xlim([0, n_obs]) if remove_xticks: - axes[2*s].set_xticks([]) + axes[2 * s].set_xticks([]) if palette == "chisel": - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[2*s+1]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[2 * s + 1], + ) else: - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical(res_combine["pred_cnv"][:,c], categories=np.arange(n_states), ordered=True), \ - palette=palette, s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[2*s+1]) - axes[2*s+1].set_ylabel(f"clone {cid}\nphased AF") - axes[2*s+1].set_ylim([-0.1, 1.1]) - axes[2*s+1].set_yticks([0, 0.5, 1]) - axes[2*s+1].set_xlim([0, n_obs]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + res_combine["pred_cnv"][:, c], + categories=np.arange(n_states), + ordered=True, + ), + palette=palette, + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[2 * s + 1], + ) + axes[2 * s + 1].set_ylabel(f"clone 
{cid}\nphased AF") + axes[2 * s + 1].set_ylim([-0.1, 1.1]) + axes[2 * s + 1].set_yticks([0, 0.5, 1]) + axes[2 * s + 1].set_xlim([0, n_obs]) if remove_xticks: - axes[2*s+1].set_xticks([]) + axes[2 * s + 1].set_xticks([]) for i, seg in enumerate(segments): - axes[2*s].plot(seg, [np.exp(res_combine["new_log_mu"][labs[i],c]), np.exp(res_combine["new_log_mu"][labs[i],c])], c="black", linewidth=2) - axes[2*s+1].plot(seg, [res_combine["new_p_binom"][labs[i],c], res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) - axes[2*s+1].plot(seg, [1-res_combine["new_p_binom"][labs[i],c], 1-res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) + axes[2 * s].plot( + seg, + [ + np.exp(res_combine["new_log_mu"][labs[i], c]), + np.exp(res_combine["new_log_mu"][labs[i], c]), + ], + c="black", + linewidth=2, + ) + axes[2 * s + 1].plot( + seg, + [ + res_combine["new_p_binom"][labs[i], c], + res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) + axes[2 * s + 1].plot( + seg, + [ + 1 - res_combine["new_p_binom"][labs[i], c], + 1 - res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) for i in range(len(lengths)): - median_len = np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[:(i+1)]) * 0.45 - axes[-1].text(median_len-5, chrtext_shift, unique_chrs[i], transform=axes[-1].get_xaxis_transform()) - for k in range(2*len(nonempty_clones)): + median_len = ( + np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[: (i + 1)]) * 0.45 + ) + axes[-1].text( + median_len - 5, + chrtext_shift, + unique_chrs[i], + transform=axes[-1].get_xaxis_transform(), + ) + for k in range(2 * len(nonempty_clones)): axes[k].axvline(x=np.sum(lengths[:(i)]), c="grey", linewidth=1) fig.tight_layout() # plot a given clone else: - fig, axes = plt.subplots(2*len(clone_ids), 1, figsize=(20, base_height*len(clone_ids)), dpi=200, facecolor="white") - for s,cid in enumerate(clone_ids): + fig, axes = plt.subplots( + 2 * len(clone_ids), + 1, + figsize=(20, base_height * len(clone_ids)), + dpi=200, + facecolor="white", + ) + for s, cid in enumerate(clone_ids): c = np.where(final_clone_ids == cid)[0][0] # major and minor allele copies give the hue - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) # plot points - segments, labs = get_intervals(res_combine["pred_cnv"][:,c]) + segments, labs = get_intervals(res_combine["pred_cnv"][:, c]) if palette == "chisel": - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,0,c]/base_nb_mean[:,c], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[2*s]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 0, c] / base_nb_mean[:, c], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[2 * s], + ) else: - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,0,c]/base_nb_mean[:,c], \ - hue=pd.Categorical(res_combine["pred_cnv"][:,c], 
categories=np.arange(n_states), ordered=True), \ - palette=palette, s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[2*s]) - axes[2*s].set_ylabel(f"clone {cid}\nRDR" if clone_names is None else f"clone {clone_names[s]}\nRDR") - axes[2*s].set_yticks(np.arange(1, rdr_ylim, 1)) - axes[2*s].set_ylim([0,5]) - axes[2*s].set_xlim([0, n_obs]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 0, c] / base_nb_mean[:, c], + hue=pd.Categorical( + res_combine["pred_cnv"][:, c], + categories=np.arange(n_states), + ordered=True, + ), + palette=palette, + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[2 * s], + ) + axes[2 * s].set_ylabel( + f"clone {cid}\nRDR" + if clone_names is None + else f"clone {clone_names[s]}\nRDR" + ) + axes[2 * s].set_yticks(np.arange(1, rdr_ylim, 1)) + axes[2 * s].set_ylim([0, 5]) + axes[2 * s].set_xlim([0, n_obs]) if remove_xticks: - axes[2*s].set_xticks([]) + axes[2 * s].set_xticks([]) if palette == "chisel": - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[2*s+1]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[2 * s + 1], + ) else: - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical(res_combine["pred_cnv"][:,c], categories=np.arange(n_states), ordered=True), \ - palette=palette, s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[2*s+1]) - axes[2*s+1].set_ylabel(f"clone {cid}\nphased AF" if clone_names is None else f"clone {clone_names[s]}\nphased AF") - axes[2*s+1].set_ylim([-0.1, 1.1]) - axes[2*s+1].set_yticks([0, 0.5, 1]) - axes[2*s+1].set_xlim([0, n_obs]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + res_combine["pred_cnv"][:, c], + categories=np.arange(n_states), + ordered=True, + ), + palette=palette, + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[2 * s + 1], + ) + axes[2 * s + 1].set_ylabel( + f"clone {cid}\nphased AF" + if clone_names is None + else f"clone {clone_names[s]}\nphased AF" + ) + axes[2 * s + 1].set_ylim([-0.1, 1.1]) + axes[2 * s + 1].set_yticks([0, 0.5, 1]) + axes[2 * s + 1].set_xlim([0, n_obs]) if remove_xticks: - axes[2*s+1].set_xticks([]) + axes[2 * s + 1].set_xticks([]) for i, seg in enumerate(segments): - axes[2*s].plot(seg, [np.exp(res_combine["new_log_mu"][labs[i],c]), np.exp(res_combine["new_log_mu"][labs[i],c])], c="black", linewidth=2) - axes[2*s+1].plot(seg, [res_combine["new_p_binom"][labs[i],c], res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) - axes[2*s+1].plot(seg, [1-res_combine["new_p_binom"][labs[i],c], 1-res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) - + axes[2 * s].plot( + seg, + [ + np.exp(res_combine["new_log_mu"][labs[i], c]), + np.exp(res_combine["new_log_mu"][labs[i], c]), + ], + c="black", + linewidth=2, + ) + axes[2 * s + 1].plot( + seg, + [ + res_combine["new_p_binom"][labs[i], c], + 
res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) + axes[2 * s + 1].plot( + seg, + [ + 1 - res_combine["new_p_binom"][labs[i], c], + 1 - res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) + for i in range(len(lengths)): - median_len = np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[:(i+1)]) * 0.45 - axes[-1].text(median_len-5, chrtext_shift, unique_chrs[i], transform=axes[-1].get_xaxis_transform()) - for k in range(2*len(clone_ids)): + median_len = ( + np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[: (i + 1)]) * 0.45 + ) + axes[-1].text( + median_len - 5, + chrtext_shift, + unique_chrs[i], + transform=axes[-1].get_xaxis_transform(), + ) + for k in range(2 * len(clone_ids)): axes[k].axvline(x=np.sum(lengths[:(i)]), c="grey", linewidth=1) fig.tight_layout() return fig - -def plot_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, clone_names=None, remove_xticks=True, rdr_ylim=5, chrtext_shift=-0.3, base_height=3.2, pointsize=15, linewidth=1, palette="chisel"): +def plot_baf( + configuration_file, + r_hmrf_initialization, + cn_file, + clone_ids=None, + clone_names=None, + remove_xticks=True, + rdr_ylim=5, + chrtext_shift=-0.3, + base_height=3.2, + pointsize=15, + linewidth=1, + palette="chisel", +): # full palette chisel_palette, ordered_acn = get_full_palette() - map_cn = {x:i for i,x in enumerate(ordered_acn)} + map_cn = {x: i for i, x in enumerate(ordered_acn)} colors = [chisel_palette[c] for c in ordered_acn] try: config = read_configuration_file(configuration_file) except: config = read_joint_configuration_file(configuration_file) - + # load allele specific integer copy numbers df_cnv = pd.read_csv(cn_file, header=0, sep="\t") - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df_cnv.columns[3:] ]) - if not '0' in final_clone_ids: - final_clone_ids = np.array(['0'] + list(final_clone_ids)) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df_cnv.columns[3:]]) + if not "0" in final_clone_ids: + final_clone_ids = np.array(["0"] + list(final_clone_ids)) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) unique_chrs = np.unique(df_cnv.CHR.values) # load data @@ -984,40 +2336,90 @@ def plot_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, single_base_nb_mean = dat["single_base_nb_mean"] single_total_bb_RD = dat["single_total_bb_RD"] single_tumor_prop = dat["single_tumor_prop"] - res_combine = dict( np.load(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True) ) + res_combine = dict( + np.load( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) n_states = res_combine["new_p_binom"].shape[0] assert single_X.shape[0] == df_cnv.shape[0] - clone_index = [np.where(res_combine["new_assignment"] == c)[0] for c,cid in enumerate(final_clone_ids)] + clone_index = [ + np.where(res_combine["new_assignment"] == c)[0] + for c, cid in enumerate(final_clone_ids) + ] if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, 
clone_index, single_tumor_prop) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + ) n_obs = X.shape[0] nonempty_clones = np.where(np.sum(total_bb_RD, axis=0) > 0)[0] # plotting all clones if clone_ids is None: - fig, axes = plt.subplots(len(nonempty_clones), 1, figsize=(20, base_height*len(nonempty_clones)), dpi=200, facecolor="white") - for s,c in enumerate(nonempty_clones): + fig, axes = plt.subplots( + len(nonempty_clones), + 1, + figsize=(20, base_height * len(nonempty_clones)), + dpi=200, + facecolor="white", + ) + for s, c in enumerate(nonempty_clones): cid = final_clone_ids[c] # major and minor allele copies give the hue - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) # plot points - segments, labs = get_intervals(res_combine["pred_cnv"][:,c]) + segments, labs = get_intervals(res_combine["pred_cnv"][:, c]) if palette == "chisel": - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[s]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[s], + ) else: - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical(res_combine["pred_cnv"][:,c], categories=np.arange(n_states), ordered=True), \ - palette=palette, s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[s]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + res_combine["pred_cnv"][:, c], + categories=np.arange(n_states), + ordered=True, + ), + palette=palette, + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[s], + ) axes[s].set_ylabel(f"clone {cid}\nphased AF") axes[s].set_ylim([-0.1, 1.1]) axes[s].set_yticks([0, 0.5, 1]) @@ -1025,56 +2427,153 @@ def plot_baf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, if remove_xticks: axes[s].set_xticks([]) for i, seg in enumerate(segments): - axes[s].plot(seg, [res_combine["new_p_binom"][labs[i],c], res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) - axes[s].plot(seg, [1-res_combine["new_p_binom"][labs[i],c], 1-res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) + axes[s].plot( + seg, + [ + res_combine["new_p_binom"][labs[i], c], + res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) + axes[s].plot( + seg, + [ + 1 - res_combine["new_p_binom"][labs[i], c], + 1 - res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) for i in range(len(lengths)): - median_len = np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[:(i+1)]) * 0.45 - axes[-1].text(median_len-5, chrtext_shift, 
unique_chrs[i], transform=axes[-1].get_xaxis_transform()) + median_len = ( + np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[: (i + 1)]) * 0.45 + ) + axes[-1].text( + median_len - 5, + chrtext_shift, + unique_chrs[i], + transform=axes[-1].get_xaxis_transform(), + ) for k in range(len(nonempty_clones)): axes[k].axvline(x=np.sum(lengths[:(i)]), c="grey", linewidth=1) fig.tight_layout() # plot a given clone else: - fig, axes = plt.subplots(2*len(clone_ids), 1, figsize=(20, base_height*len(clone_ids)), dpi=200, facecolor="white") - for s,cid in enumerate(clone_ids): + fig, axes = plt.subplots( + 2 * len(clone_ids), + 1, + figsize=(20, base_height * len(clone_ids)), + dpi=200, + facecolor="white", + ) + for s, cid in enumerate(clone_ids): c = np.where(final_clone_ids == cid)[0][0] # major and minor allele copies give the hue - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) # plot points - segments, labs = get_intervals(res_combine["pred_cnv"][:,c]) + segments, labs = get_intervals(res_combine["pred_cnv"][:, c]) if palette == "chisel": - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[s]) + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[s], + ) else: - seaborn.scatterplot(x=np.arange(X[:,1,c].shape[0]), y=X[:,1,c]/total_bb_RD[:,c], \ - hue=pd.Categorical(res_combine["pred_cnv"][:,c], categories=np.arange(n_states), ordered=True), \ - palette=palette, s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[s]) - axes[s].set_ylabel(f"clone {cid}\nphased AF" if clone_names is None else f"clone {clone_names[s]}\nphased AF") + seaborn.scatterplot( + x=np.arange(X[:, 1, c].shape[0]), + y=X[:, 1, c] / total_bb_RD[:, c], + hue=pd.Categorical( + res_combine["pred_cnv"][:, c], + categories=np.arange(n_states), + ordered=True, + ), + palette=palette, + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[s], + ) + axes[s].set_ylabel( + f"clone {cid}\nphased AF" + if clone_names is None + else f"clone {clone_names[s]}\nphased AF" + ) axes[s].set_ylim([-0.1, 1.1]) axes[s].set_yticks([0, 0.5, 1]) axes[s].set_xlim([0, n_obs]) if remove_xticks: axes[s].set_xticks([]) for i, seg in enumerate(segments): - axes[s].plot(seg, [res_combine["new_p_binom"][labs[i],c], res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) - axes[s].plot(seg, [1-res_combine["new_p_binom"][labs[i],c], 1-res_combine["new_p_binom"][labs[i],c]], c="black", linewidth=2) - + axes[s].plot( + seg, + [ + res_combine["new_p_binom"][labs[i], c], + res_combine["new_p_binom"][labs[i], c], + ], + c="black", + linewidth=2, + ) + axes[s].plot( + seg, + [ + 1 - res_combine["new_p_binom"][labs[i], c], + 1 - res_combine["new_p_binom"][labs[i], c], 
+ ], + c="black", + linewidth=2, + ) + for i in range(len(lengths)): - median_len = np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[:(i+1)]) * 0.45 - axes[-1].text(median_len-5, chrtext_shift, unique_chrs[i], transform=axes[-1].get_xaxis_transform()) - for k in range(2*len(clone_ids)): + median_len = ( + np.sum(lengths[:(i)]) * 0.55 + np.sum(lengths[: (i + 1)]) * 0.45 + ) + axes[-1].text( + median_len - 5, + chrtext_shift, + unique_chrs[i], + transform=axes[-1].get_xaxis_transform(), + ) + for k in range(2 * len(clone_ids)): axes[k].axvline(x=np.sum(lengths[:(i)]), c="grey", linewidth=1) fig.tight_layout() return fig -def plot_rdr_baf_from_df(df, clone_ids=None, clone_names=None, base_height=3.2, rdr_ylim=3, baf_ylim=0.5, baf_yticks=None, linewidth=0, pointsize=30, chrtext_shift=-0.3, add_legend=False, remove_xticks=True): +def plot_rdr_baf_from_df( + df, + clone_ids=None, + clone_names=None, + base_height=3.2, + rdr_ylim=3, + baf_ylim=0.5, + baf_yticks=None, + linewidth=0, + pointsize=30, + chrtext_shift=-0.3, + add_legend=False, + remove_xticks=True, +): """ Attributes ---------- @@ -1083,127 +2582,256 @@ def plot_rdr_baf_from_df(df, clone_ids=None, clone_names=None, base_height=3.2, """ # full palette chisel_palette, ordered_acn = get_full_palette() - map_cn = {x:i for i,x in enumerate(ordered_acn)} + map_cn = {x: i for i, x in enumerate(ordered_acn)} colors = [chisel_palette[c] for c in ordered_acn] - + # load allele specific integer copy numbers - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df.columns if "RD" in x ]) - assert (clone_ids is None) or np.all([ (cid in final_clone_ids) for cid in clone_ids]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df.columns if "RD" in x]) + assert (clone_ids is None) or np.all( + [(cid in final_clone_ids) for cid in clone_ids] + ) unique_chrs = np.unique(df.CHR.values) if clone_ids is None: - fig, axes = plt.subplots(2*len(final_clone_ids), 1, figsize=(20, base_height*len(final_clone_ids)), dpi=200, facecolor="white") - for s,cid in enumerate(final_clone_ids): + fig, axes = plt.subplots( + 2 * len(final_clone_ids), + 1, + figsize=(20, base_height * len(final_clone_ids)), + dpi=200, + facecolor="white", + ) + for s, cid in enumerate(final_clone_ids): # major and minor allele copies give the hue major = np.maximum(df[f"clone{cid} A"].values, df[f"clone{cid} B"].values) minor = np.minimum(df[f"clone{cid} A"].values, df[f"clone{cid} B"].values) - - seaborn.scatterplot(x=np.arange(df.shape[0]), y=df[f'clone{cid} RD'].values, \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", linewidth=linewidth, alpha=0.8, legend=False, ax=axes[2*s]) - axes[2*s].set_ylabel(f"clone {cid}\nRDR") - axes[2*s].set_yticks(np.arange(1, rdr_ylim, 1)) - axes[2*s].set_ylim([0,rdr_ylim]) - axes[2*s].set_xlim([0, df.shape[0]]) + + seaborn.scatterplot( + x=np.arange(df.shape[0]), + y=df[f"clone{cid} RD"].values, + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=0.8, + legend=False, + ax=axes[2 * s], + ) + axes[2 * s].set_ylabel(f"clone {cid}\nRDR") + axes[2 * s].set_yticks(np.arange(1, rdr_ylim, 1)) + axes[2 * s].set_ylim([0, rdr_ylim]) + axes[2 * s].set_xlim([0, df.shape[0]]) if remove_xticks: - 
axes[2*s].set_xticks([]) - seaborn.scatterplot(x=np.arange(df.shape[0]), y=df[f"clone{cid} BAF"].values, \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", linewidth=linewidth, alpha=0.8, legend=False, ax=axes[2*s+1]) - axes[2*s+1].set_ylabel(f"clone {cid}\nphased AF") - axes[2*s+1].set_ylim([-0.1, baf_ylim]) + axes[2 * s].set_xticks([]) + seaborn.scatterplot( + x=np.arange(df.shape[0]), + y=df[f"clone{cid} BAF"].values, + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=0.8, + legend=False, + ax=axes[2 * s + 1], + ) + axes[2 * s + 1].set_ylabel(f"clone {cid}\nphased AF") + axes[2 * s + 1].set_ylim([-0.1, baf_ylim]) if baf_yticks is None: - axes[2*s+1].set_yticks(np.arange(0, baf_ylim, 0.1)) + axes[2 * s + 1].set_yticks(np.arange(0, baf_ylim, 0.1)) else: - axes[2*s+1].set_yticks(baf_yticks) - axes[2*s+1].set_xlim([0, df.shape[0]]) + axes[2 * s + 1].set_yticks(baf_yticks) + axes[2 * s + 1].set_xlim([0, df.shape[0]]) if remove_xticks: - axes[2*s+1].set_xticks([]) + axes[2 * s + 1].set_xticks([]) for i in unique_chrs: median_len = np.percentile(np.where(df.CHR.values == i)[0], 45) max_len = np.max(np.where(df.CHR.values == i)[0]) - axes[-1].text(median_len-5, chrtext_shift, i, transform=axes[-1].get_xaxis_transform()) + axes[-1].text( + median_len - 5, + chrtext_shift, + i, + transform=axes[-1].get_xaxis_transform(), + ) if max_len + 1 < df.shape[0]: - for k in range(2*len(final_clone_ids)): + for k in range(2 * len(final_clone_ids)): axes[k].axvline(x=max_len, c="grey", linewidth=1) # plot a given clone else: - fig, axes = plt.subplots(2*len(clone_ids), 1, figsize=(20, base_height*len(clone_ids)), dpi=200, facecolor="white") - for s,cid in enumerate(clone_ids): + fig, axes = plt.subplots( + 2 * len(clone_ids), + 1, + figsize=(20, base_height * len(clone_ids)), + dpi=200, + facecolor="white", + ) + for s, cid in enumerate(clone_ids): # major and minor allele copies give the hue major = np.maximum(df[f"clone{cid} A"].values, df[f"clone{cid} B"].values) minor = np.minimum(df[f"clone{cid} A"].values, df[f"clone{cid} B"].values) # plot points - seaborn.scatterplot(x=np.arange(df.shape[0]), y=df[f'clone{cid} RD'].values, \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", linewidth=linewidth, alpha=0.8, legend=False, ax=axes[2*s]) - axes[2*s].set_ylabel(f"clone {cid}\nRDR" if clone_names is None else f"clone {clone_names[s]}\nRDR") - axes[2*s].set_yticks(np.arange(1, rdr_ylim, 1)) - axes[2*s].set_ylim([0,rdr_ylim]) - axes[2*s].set_xlim([0, df.shape[0]]) + seaborn.scatterplot( + x=np.arange(df.shape[0]), + y=df[f"clone{cid} RD"].values, + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=0.8, + legend=False, + ax=axes[2 * s], + ) + axes[2 * s].set_ylabel( + f"clone {cid}\nRDR" + if clone_names is None + else f"clone {clone_names[s]}\nRDR" + ) + axes[2 * s].set_yticks(np.arange(1, 
rdr_ylim, 1)) + axes[2 * s].set_ylim([0, rdr_ylim]) + axes[2 * s].set_xlim([0, df.shape[0]]) if remove_xticks: - axes[2*s].set_xticks([]) - seaborn.scatterplot(x=np.arange(df.shape[0]), y=df[f'clone{cid} BAF'].values, \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", linewidth=linewidth, alpha=0.8, legend=False, ax=axes[2*s+1]) - axes[2*s+1].set_ylabel(f"clone {cid}\nphased AF" if clone_names is None else f"clone {clone_names[s]}\nphased AF") - axes[2*s+1].set_ylim([-0.1, baf_ylim]) + axes[2 * s].set_xticks([]) + seaborn.scatterplot( + x=np.arange(df.shape[0]), + y=df[f"clone{cid} BAF"].values, + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=0.8, + legend=False, + ax=axes[2 * s + 1], + ) + axes[2 * s + 1].set_ylabel( + f"clone {cid}\nphased AF" + if clone_names is None + else f"clone {clone_names[s]}\nphased AF" + ) + axes[2 * s + 1].set_ylim([-0.1, baf_ylim]) if baf_yticks is None: - axes[2*s+1].set_yticks(np.arange(0, baf_ylim, 0.1)) + axes[2 * s + 1].set_yticks(np.arange(0, baf_ylim, 0.1)) else: - axes[2*s+1].set_yticks(baf_yticks) - axes[2*s+1].set_xlim([0, df.shape[0]]) + axes[2 * s + 1].set_yticks(baf_yticks) + axes[2 * s + 1].set_xlim([0, df.shape[0]]) if remove_xticks: - axes[2*s+1].set_xticks([]) - + axes[2 * s + 1].set_xticks([]) + for i in unique_chrs: median_len = np.percentile(np.where(df.CHR.values == i)[0], 45) max_len = np.max(np.where(df.CHR.values == i)[0]) - axes[-1].text(median_len-5, chrtext_shift, i, transform=axes[-1].get_xaxis_transform()) + axes[-1].text( + median_len - 5, + chrtext_shift, + i, + transform=axes[-1].get_xaxis_transform(), + ) if max_len + 1 < df.shape[0]: - for k in range(2*len(clone_ids)): + for k in range(2 * len(clone_ids)): axes[k].axvline(x=max_len, c="grey", linewidth=1) if add_legend: - a00 = plt.arrow(0,0, 0,0, - color='darkblue') - a10 = plt.arrow(0,0, 0,0, color='lightblue') - a11 = plt.arrow(0,0, 0,0, color='lightgray') - a20 = plt.arrow(0,0, 0,0, color='dimgray') - a21 = plt.arrow(0,0, 0,0, color='lightgoldenrodyellow') - a30 = plt.arrow(0,0, 0,0, color='gold') - a22 = plt.arrow(0,0, 0,0, color='navajowhite') - a31 = plt.arrow(0,0, 0,0, color='orange') - a40 = plt.arrow(0,0, 0,0, color='darkorange') - a32 = plt.arrow(0,0, 0,0, color='salmon') - a41 = plt.arrow(0,0, 0,0, color='red') - a50 = plt.arrow(0,0, 0,0, color='darkred') - a33 = plt.arrow(0,0, 0,0, color='plum') - a42 = plt.arrow(0,0, 0,0, color='orchid') - a51 = plt.arrow(0,0, 0,0, color='purple') - a60 = plt.arrow(0,0, 0,0, color='indigo') - axes[0].legend([a00, a10, a11, a20, a21, a30, a22, a31, a40, a32, a41, a50, a33, a42, a51, a60], \ - ['(0, 0)','(1, 0)','(1, 1)','(2, 0)', '(2, 1)','(3, 0)', '(2, 2)','(3, 1)','(4, 0)','(3, 2)', \ - '(4, 1)','(5, 0)', '(3, 3)','(4, 2)','(5, 1)','(6, 0)'], ncol=2, loc='upper left', bbox_to_anchor=(1,1)) + a00 = plt.arrow(0, 0, 0, 0, color="darkblue") + a10 = plt.arrow(0, 0, 0, 0, color="lightblue") + a11 = plt.arrow(0, 0, 0, 0, color="lightgray") + a20 = plt.arrow(0, 0, 0, 0, color="dimgray") + a21 = plt.arrow(0, 0, 0, 0, color="lightgoldenrodyellow") + a30 = plt.arrow(0, 0, 0, 0, color="gold") + a22 = plt.arrow(0, 0, 0, 0, color="navajowhite") + a31 = plt.arrow(0, 0, 0, 0, 
color="orange") + a40 = plt.arrow(0, 0, 0, 0, color="darkorange") + a32 = plt.arrow(0, 0, 0, 0, color="salmon") + a41 = plt.arrow(0, 0, 0, 0, color="red") + a50 = plt.arrow(0, 0, 0, 0, color="darkred") + a33 = plt.arrow(0, 0, 0, 0, color="plum") + a42 = plt.arrow(0, 0, 0, 0, color="orchid") + a51 = plt.arrow(0, 0, 0, 0, color="purple") + a60 = plt.arrow(0, 0, 0, 0, color="indigo") + axes[0].legend( + [ + a00, + a10, + a11, + a20, + a21, + a30, + a22, + a31, + a40, + a32, + a41, + a50, + a33, + a42, + a51, + a60, + ], + [ + "(0, 0)", + "(1, 0)", + "(1, 1)", + "(2, 0)", + "(2, 1)", + "(3, 0)", + "(2, 2)", + "(3, 1)", + "(4, 0)", + "(3, 2)", + "(4, 1)", + "(5, 0)", + "(3, 3)", + "(4, 2)", + "(5, 1)", + "(6, 0)", + ], + ncol=2, + loc="upper left", + bbox_to_anchor=(1, 1), + ) fig.tight_layout() fig.subplots_adjust(hspace=0.1) return fig, axes -def plot_2dscatter_rdrbaf(configuration_file, r_hmrf_initialization, cn_file, clone_ids=None, rdr_ylim=5, base_width=3.2, pointsize=15): +def plot_2dscatter_rdrbaf( + configuration_file, + r_hmrf_initialization, + cn_file, + clone_ids=None, + rdr_ylim=5, + base_width=3.2, + pointsize=15, +): # full palette palette, ordered_acn = get_full_palette() - map_cn = {x:i for i,x in enumerate(ordered_acn)} + map_cn = {x: i for i, x in enumerate(ordered_acn)} colors = [palette[c] for c in ordered_acn] try: config = read_configuration_file(configuration_file) except: config = read_joint_configuration_file(configuration_file) - + # load allele specific integer copy numbers df_cnv = pd.read_csv(cn_file, header=0, sep="\t") n_final_clones = int(df_cnv.columns[-1].split(" ")[0][5:]) + 1 @@ -1218,61 +2846,131 @@ def plot_2dscatter_rdrbaf(configuration_file, r_hmrf_initialization, cn_file, cl single_base_nb_mean = dat["single_base_nb_mean"] single_total_bb_RD = dat["single_total_bb_RD"] single_tumor_prop = dat["single_tumor_prop"] - res_combine = dict( np.load(f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", allow_pickle=True) ) + res_combine = dict( + np.load( + f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", + allow_pickle=True, + ) + ) assert single_X.shape[0] == df_cnv.shape[0] - clone_index = [np.where(res_combine["new_assignment"] == c)[0] for c in range(len( np.unique(res_combine["new_assignment"]) ))] + clone_index = [ + np.where(res_combine["new_assignment"] == c)[0] + for c in range(len(np.unique(res_combine["new_assignment"]))) + ] if config["tumorprop_file"] is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, clone_index) + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( + single_X, single_base_nb_mean, single_total_bb_RD, clone_index + ) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, clone_index, single_tumor_prop) + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( + single_X, + single_base_nb_mean, + single_total_bb_RD, + clone_index, + single_tumor_prop, + ) n_obs = X.shape[0] # plotting all clones if clone_ids is None: - fig, axes = plt.subplots(1, X.shape[2], figsize=(base_width*X.shape[2], base_width), dpi=200, facecolor="white") + fig, axes = plt.subplots( + 1, + X.shape[2], + figsize=(base_width * X.shape[2], base_width), + dpi=200, + facecolor="white", + ) for s in range(X.shape[2]): # major and minor allele copies give the hue - major = np.maximum(df_cnv[f"clone{s} A"].values, df_cnv[f"clone{s} B"].values) - minor = 
np.minimum(df_cnv[f"clone{s} A"].values, df_cnv[f"clone{s} B"].values) + major = np.maximum( + df_cnv[f"clone{s} A"].values, df_cnv[f"clone{s} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{s} A"].values, df_cnv[f"clone{s} B"].values + ) # plot points - seaborn.scatterplot(x=X[:,1,s]/total_bb_RD[:,s], y=X[:,0,s]/base_nb_mean[:,s], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[s]) + seaborn.scatterplot( + x=X[:, 1, s] / total_bb_RD[:, s], + y=X[:, 0, s] / base_nb_mean[:, s], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[s], + ) axes[s].set_xlabel(f"clone {s}\nphased AF") axes[s].set_xlim([-0.1, 1.1]) axes[s].set_xticks([0, 0.5, 1]) axes[s].set_ylabel(f"clone {s}\nRDR") axes[s].set_yticks(np.arange(1, rdr_ylim, 1)) - axes[s].set_ylim([0,5]) + axes[s].set_ylim([0, 5]) fig.tight_layout() # plot a given clone else: - fig, axes = plt.subplots(1, len(clone_ids), figsize=(base_width*len(clone_ids), base_width), dpi=200, facecolor="white") - for s,cid in enumerate(clone_ids): + fig, axes = plt.subplots( + 1, + len(clone_ids), + figsize=(base_width * len(clone_ids), base_width), + dpi=200, + facecolor="white", + ) + for s, cid in enumerate(clone_ids): # major and minor allele copies give the hue - major = np.maximum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) - minor = np.minimum(df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values) + major = np.maximum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) + minor = np.minimum( + df_cnv[f"clone{cid} A"].values, df_cnv[f"clone{cid} B"].values + ) # plot points - seaborn.scatterplot(x=X[:,1,cid]/total_bb_RD[:,cid], y=X[:,0,cid]/base_nb_mean[:,cid], \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", alpha=0.8, legend=False, ax=axes[s]) + seaborn.scatterplot( + x=X[:, 1, cid] / total_bb_RD[:, cid], + y=X[:, 0, cid] / base_nb_mean[:, cid], + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + alpha=0.8, + legend=False, + ax=axes[s], + ) axes[s].set_xlabel(f"clone {cid}\nphased AF") axes[s].set_xlim([-0.1, 1.1]) axes[s].set_xticks([0, 0.5, 1]) axes[s].set_ylabel(f"clone {cid}\nRDR") axes[s].set_yticks(np.arange(1, rdr_ylim, 1)) - axes[s].set_ylim([0,5]) + axes[s].set_ylim([0, 5]) fig.tight_layout() return fig -def plot_2dscatter_rdrbaf_from_df(df, axes, cid, cname=None, baf_xlim=0.51, rdr_ylim=3, pointsize=15, linewidth=1, add_legend=False): +def plot_2dscatter_rdrbaf_from_df( + df, + axes, + cid, + cname=None, + baf_xlim=0.51, + rdr_ylim=3, + pointsize=15, + linewidth=1, + add_legend=False, +): """ Attributes ---------- @@ -1281,10 +2979,10 @@ def plot_2dscatter_rdrbaf_from_df(df, axes, cid, cname=None, baf_xlim=0.51, rdr_ """ # full palette palette, ordered_acn = get_full_palette() - map_cn = {x:i for i,x in enumerate(ordered_acn)} + map_cn = {x: i for i, x in 
enumerate(ordered_acn)} colors = [palette[c] for c in ordered_acn] - final_clone_ids = np.unique([ x.split(" ")[0][5:] for x in df.columns if "RD" in x ]) + final_clone_ids = np.unique([x.split(" ")[0][5:] for x in df.columns if "RD" in x]) assert cid in final_clone_ids unique_chrs = np.unique(df.CHR.values) @@ -1293,90 +2991,211 @@ def plot_2dscatter_rdrbaf_from_df(df, axes, cid, cname=None, baf_xlim=0.51, rdr_ minor = np.minimum(df[f"clone{cid} A"].values, df[f"clone{cid} B"].values) # plot points - seaborn.scatterplot(x=df[f'clone{cid} BAF'].values, y=df[f'clone{cid} RD'].values, \ - hue=pd.Categorical([map_cn[(major[i], minor[i])] for i in range(len(major))], categories=np.arange(len(ordered_acn)), ordered=True), \ - palette=seaborn.color_palette(colors), s=pointsize, edgecolor="black", linewidth=linewidth, alpha=0.8, legend=False, ax=axes) - axes.set_xlabel(f"clone {cid}\nphased AF" if cname is None else f"{cname}\nphased AF") + seaborn.scatterplot( + x=df[f"clone{cid} BAF"].values, + y=df[f"clone{cid} RD"].values, + hue=pd.Categorical( + [map_cn[(major[i], minor[i])] for i in range(len(major))], + categories=np.arange(len(ordered_acn)), + ordered=True, + ), + palette=seaborn.color_palette(colors), + s=pointsize, + edgecolor="black", + linewidth=linewidth, + alpha=0.8, + legend=False, + ax=axes, + ) + axes.set_xlabel( + f"clone {cid}\nphased AF" if cname is None else f"{cname}\nphased AF" + ) axes.set_xlim([-0.02, baf_xlim]) axes.set_xticks(np.arange(0, baf_xlim, 0.1)) axes.set_ylabel(f"clone {cid}\nRDR" if cname is None else f"{cname}\nRDR") axes.set_yticks(np.arange(1, rdr_ylim, 1)) - axes.set_ylim([0,rdr_ylim]) + axes.set_ylim([0, rdr_ylim]) if add_legend: - a00 = plt.arrow(0,0, 0,0, - color='darkblue') - a10 = plt.arrow(0,0, 0,0, color='lightblue') - a11 = plt.arrow(0,0, 0,0, color='lightgray') - a20 = plt.arrow(0,0, 0,0, color='dimgray') - a21 = plt.arrow(0,0, 0,0, color='lightgoldenrodyellow') - a30 = plt.arrow(0,0, 0,0, color='gold') - a22 = plt.arrow(0,0, 0,0, color='navajowhite') - a31 = plt.arrow(0,0, 0,0, color='orange') - a40 = plt.arrow(0,0, 0,0, color='darkorange') - a32 = plt.arrow(0,0, 0,0, color='salmon') - a41 = plt.arrow(0,0, 0,0, color='red') - a50 = plt.arrow(0,0, 0,0, color='darkred') - a33 = plt.arrow(0,0, 0,0, color='plum') - a42 = plt.arrow(0,0, 0,0, color='orchid') - a51 = plt.arrow(0,0, 0,0, color='purple') - a60 = plt.arrow(0,0, 0,0, color='indigo') - axes.legend([a00, a10, a11, a20, a21, a30, a22, a31, a40, a32, a41, a50, a33, a42, a51, a60], \ - ['(0, 0)','(1, 0)','(1, 1)','(2, 0)', '(2, 1)','(3, 0)', '(2, 2)','(3, 1)','(4, 0)','(3, 2)', \ - '(4, 1)','(5, 0)', '(3, 3)','(4, 2)','(5, 1)','(6, 0)'], ncol=2, loc='upper left', bbox_to_anchor=(1,1)) - - - -def plot_clones_in_space(coords, assignment, sample_list=None, sample_ids=None, palette="Set2", labels=None, label_coords=None, label_sample_ids=None): + a00 = plt.arrow(0, 0, 0, 0, color="darkblue") + a10 = plt.arrow(0, 0, 0, 0, color="lightblue") + a11 = plt.arrow(0, 0, 0, 0, color="lightgray") + a20 = plt.arrow(0, 0, 0, 0, color="dimgray") + a21 = plt.arrow(0, 0, 0, 0, color="lightgoldenrodyellow") + a30 = plt.arrow(0, 0, 0, 0, color="gold") + a22 = plt.arrow(0, 0, 0, 0, color="navajowhite") + a31 = plt.arrow(0, 0, 0, 0, color="orange") + a40 = plt.arrow(0, 0, 0, 0, color="darkorange") + a32 = plt.arrow(0, 0, 0, 0, color="salmon") + a41 = plt.arrow(0, 0, 0, 0, color="red") + a50 = plt.arrow(0, 0, 0, 0, color="darkred") + a33 = plt.arrow(0, 0, 0, 0, color="plum") + a42 = plt.arrow(0, 0, 0, 0, 
color="orchid") + a51 = plt.arrow(0, 0, 0, 0, color="purple") + a60 = plt.arrow(0, 0, 0, 0, color="indigo") + axes.legend( + [ + a00, + a10, + a11, + a20, + a21, + a30, + a22, + a31, + a40, + a32, + a41, + a50, + a33, + a42, + a51, + a60, + ], + [ + "(0, 0)", + "(1, 0)", + "(1, 1)", + "(2, 0)", + "(2, 1)", + "(3, 0)", + "(2, 2)", + "(3, 1)", + "(4, 0)", + "(3, 2)", + "(4, 1)", + "(5, 0)", + "(3, 3)", + "(4, 2)", + "(5, 1)", + "(6, 0)", + ], + ncol=2, + loc="upper left", + bbox_to_anchor=(1, 1), + ) + + +def plot_clones_in_space( + coords, + assignment, + sample_list=None, + sample_ids=None, + palette="Set2", + labels=None, + label_coords=None, + label_sample_ids=None, +): if (sample_list is None) or (len(sample_list) == 1): - fig, axes = plt.subplots(1, 1, figsize=(5.5,4), dpi=200, facecolor="white") - seaborn.scatterplot(x=coords[:,0], y=-coords[:,1], color="lightgrey", alpha=0.5, linewidth=0, s=15, ax=axes) - seaborn.scatterplot(x=coords[~assignment.isnull(),0], y=-coords[~assignment.isnull(),1], \ - hue=assignment[~assignment.isnull()], palette=palette, linewidth=0, s=15, ax=axes) - h,l = axes.get_legend_handles_labels() - axes.legend(h, l, loc="upper left", bbox_to_anchor=(1,1)) + fig, axes = plt.subplots(1, 1, figsize=(5.5, 4), dpi=200, facecolor="white") + seaborn.scatterplot( + x=coords[:, 0], + y=-coords[:, 1], + color="lightgrey", + alpha=0.5, + linewidth=0, + s=15, + ax=axes, + ) + seaborn.scatterplot( + x=coords[~assignment.isnull(), 0], + y=-coords[~assignment.isnull(), 1], + hue=assignment[~assignment.isnull()], + palette=palette, + linewidth=0, + s=15, + ax=axes, + ) + h, l = axes.get_legend_handles_labels() + axes.legend(h, l, loc="upper left", bbox_to_anchor=(1, 1)) if not labels is None: assert len(labels) == len(label_coords) - for i,c in enumerate(labels): - axes.text(label_coords[i][0]-4, -label_coords[i][1], c) + for i, c in enumerate(labels): + axes.text(label_coords[i][0] - 4, -label_coords[i][1], c) else: unique_assignments = np.sort(np.unique(assignment[~assignment.isnull()].values)) - fig, axes = plt.subplots(1, len(sample_list), figsize=(5*len(sample_list)+0.5,4), dpi=200, facecolor="white") + fig, axes = plt.subplots( + 1, + len(sample_list), + figsize=(5 * len(sample_list) + 0.5, 4), + dpi=200, + facecolor="white", + ) for s, sname in enumerate(sample_list): indexes = np.where(sample_ids == s)[0] - seaborn.scatterplot(x=coords[indexes,0], y=-coords[indexes,1], color="lightgrey", alpha=0.5, linewidth=0, s=15, ax=axes[s]) + seaborn.scatterplot( + x=coords[indexes, 0], + y=-coords[indexes, 1], + color="lightgrey", + alpha=0.5, + linewidth=0, + s=15, + ax=axes[s], + ) if s + 1 != len(sample_list): - seaborn.scatterplot(x=coords[indexes,0][~assignment.iloc[indexes].isnull()], y=-coords[indexes,1][~assignment.iloc[indexes].isnull()], \ - hue=pd.Categorical(assignment.iloc[indexes][~assignment.iloc[indexes].isnull()], categories=unique_assignments, ordered=True), \ - palette=palette, linewidth=0, s=15, legend=False, ax=axes[s]) + seaborn.scatterplot( + x=coords[indexes, 0][~assignment.iloc[indexes].isnull()], + y=-coords[indexes, 1][~assignment.iloc[indexes].isnull()], + hue=pd.Categorical( + assignment.iloc[indexes][~assignment.iloc[indexes].isnull()], + categories=unique_assignments, + ordered=True, + ), + palette=palette, + linewidth=0, + s=15, + legend=False, + ax=axes[s], + ) else: - seaborn.scatterplot(x=coords[indexes,0][~assignment.iloc[indexes].isnull()], y=-coords[indexes,1][~assignment.iloc[indexes].isnull()], \ - 
hue=pd.Categorical(assignment.iloc[indexes][~assignment.iloc[indexes].isnull()], categories=unique_assignments, ordered=True), \ - palette=palette, linewidth=0, s=15, ax=axes[s]) - h,l = axes[s].get_legend_handles_labels() - axes[s].legend(h, l, loc="upper left", bbox_to_anchor=(1,1)) + seaborn.scatterplot( + x=coords[indexes, 0][~assignment.iloc[indexes].isnull()], + y=-coords[indexes, 1][~assignment.iloc[indexes].isnull()], + hue=pd.Categorical( + assignment.iloc[indexes][~assignment.iloc[indexes].isnull()], + categories=unique_assignments, + ordered=True, + ), + palette=palette, + linewidth=0, + s=15, + ax=axes[s], + ) + h, l = axes[s].get_legend_handles_labels() + axes[s].legend(h, l, loc="upper left", bbox_to_anchor=(1, 1)) if not labels is None: - assert len(labels) == len(label_coords) and len(labels) == len(label_sample_ids) - for i,c in enumerate(labels): + assert len(labels) == len(label_coords) and len(labels) == len( + label_sample_ids + ) + for i, c in enumerate(labels): s = label_sample_ids[i] - axes[s].text(label_coords[i][0]-4, -label_coords[i][1], c) + axes[s].text(label_coords[i][0] - 4, -label_coords[i][1], c) fig.tight_layout() return fig -def plot_individual_spots_in_space(coords, assignment, single_tumor_prop=None, sample_list=None, sample_ids=None, base_width=4, base_height=3, palette="Set2"): +def plot_individual_spots_in_space( + coords, + assignment, + single_tumor_prop=None, + sample_list=None, + sample_ids=None, + base_width=4, + base_height=3, + palette="Set2", +): # combine coordinates across samples shifted_coords = copy.copy(coords) if not (sample_ids is None): x_offset = 0 - for s,sname in enumerate(sample_list): + for s, sname in enumerate(sample_list): index = np.where(sample_ids == s)[0] - shifted_coords[index,0] = shifted_coords[index,0] + x_offset - x_offset += np.max(coords[index,0]) + 10 + shifted_coords[index, 0] = shifted_coords[index, 0] + x_offset + x_offset += np.max(coords[index, 0]) + 10 # number of clones and samples final_clone_ids = np.unique(assignment[~assignment.isnull()].values) @@ -1387,27 +3206,80 @@ def plot_individual_spots_in_space(coords, assignment, single_tumor_prop=None, s if not single_tumor_prop is None: copy_single_tumor_prop = copy.copy(single_tumor_prop) copy_single_tumor_prop[np.isnan(copy_single_tumor_prop)] = 0.5 - - fig, axes = plt.subplots(1, 1, figsize=(base_width*n_samples, base_height), dpi=200, facecolor="white") + + fig, axes = plt.subplots( + 1, 1, figsize=(base_width * n_samples, base_height), dpi=200, facecolor="white" + ) if "clone 0" in final_clone_ids: - colorlist = ['lightgrey'] + seaborn.color_palette("Set2", n_final_clones-1).as_hex() + colorlist = ["lightgrey"] + seaborn.color_palette( + "Set2", n_final_clones - 1 + ).as_hex() else: colorlist = seaborn.color_palette("Set2", n_final_clones).as_hex() - for c,cid in enumerate(final_clone_ids): - idx = np.where( (assignment.values==cid) )[0] + for c, cid in enumerate(final_clone_ids): + idx = np.where((assignment.values == cid))[0] if single_tumor_prop is None: - seaborn.scatterplot(x=shifted_coords[idx,0], y=-shifted_coords[idx,1], s=10, color=colorlist[c], linewidth=0, legend=None, ax=axes) + seaborn.scatterplot( + x=shifted_coords[idx, 0], + y=-shifted_coords[idx, 1], + s=10, + color=colorlist[c], + linewidth=0, + legend=None, + ax=axes, + ) else: # cmap - this_full_cmap = seaborn.color_palette(f"blend:lightgrey,{colorlist[c]}", as_cmap=True) - quantile_colors = this_full_cmap(np.array([0, np.min(copy_single_tumor_prop[idx]), 
np.max(copy_single_tumor_prop[idx]), 1])) - quantile_colors = [matplotlib.colors.rgb2hex(x) for x in quantile_colors[1:-1]] - this_cmap = seaborn.color_palette(f"blend:{quantile_colors[0]},{quantile_colors[-1]}", as_cmap=True) - seaborn.scatterplot(x=shifted_coords[idx,0], y=-shifted_coords[idx,1], s=10, hue=copy_single_tumor_prop[idx], palette=this_cmap, linewidth=0, legend=None, ax=axes) - - legend_elements = [Line2D([0], [0], marker='o', color="w", markerfacecolor=colorlist[c], label=cid, markersize=10) for c,cid in enumerate(final_clone_ids)] - axes.legend(legend_elements, final_clone_ids, handlelength=0.1, loc="upper left", bbox_to_anchor=(1,1)) + this_full_cmap = seaborn.color_palette( + f"blend:lightgrey,{colorlist[c]}", as_cmap=True + ) + quantile_colors = this_full_cmap( + np.array( + [ + 0, + np.min(copy_single_tumor_prop[idx]), + np.max(copy_single_tumor_prop[idx]), + 1, + ] + ) + ) + quantile_colors = [ + matplotlib.colors.rgb2hex(x) for x in quantile_colors[1:-1] + ] + this_cmap = seaborn.color_palette( + f"blend:{quantile_colors[0]},{quantile_colors[-1]}", as_cmap=True + ) + seaborn.scatterplot( + x=shifted_coords[idx, 0], + y=-shifted_coords[idx, 1], + s=10, + hue=copy_single_tumor_prop[idx], + palette=this_cmap, + linewidth=0, + legend=None, + ax=axes, + ) + + legend_elements = [ + Line2D( + [0], + [0], + marker="o", + color="w", + markerfacecolor=colorlist[c], + label=cid, + markersize=10, + ) + for c, cid in enumerate(final_clone_ids) + ] + axes.legend( + legend_elements, + final_clone_ids, + handlelength=0.1, + loc="upper left", + bbox_to_anchor=(1, 1), + ) axes.axis("off") fig.tight_layout() From 4901b562d1290ddbfa3f380922d1593d56a87270 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 11:04:09 -0400 Subject: [PATCH 005/125] add logging and tidy for calicost_main --- src/calicost/calicost_main.py | 383 +++++++++++++++++++++++----------- 1 file changed, 264 insertions(+), 119 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index d64c102..b6985f4 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -1,50 +1,91 @@ +import copy +import functools +import logging +import subprocess import sys +import datetime +from pathlib import Path + +import anndata import numpy as np -import scipy import pandas as pd -from pathlib import Path -from sklearn.metrics import adjusted_rand_score -from sklearn.cluster import KMeans import scanpy as sc -import anndata -import logging +import scipy +from sklearn.cluster import KMeans +from sklearn.metrics import adjusted_rand_score -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", -) -logger = logging.getLogger() -import copy -from pathlib import Path -import functools -import subprocess from calicost.arg_parse import * +from calicost.find_integer_copynumber import * from calicost.hmm_NB_BB_phaseswitch import * -from calicost.utils_distribution_fitting import * -from calicost.utils_hmrf import * from calicost.hmrf import * +from calicost.parse_input import * from calicost.phasing import * +from calicost.utils_distribution_fitting import * +from calicost.utils_hmrf import * from calicost.utils_IO import * -from calicost.find_integer_copynumber import * -from calicost.parse_input import * from calicost.utils_plotting import * +""" +from calicost.hmm_NB_BB_nophasing_v2 import hmm_nophasing_v2 +from calicost.arg_parse import run_parse_n_load, genesnp_to_bininfo +from 
calicost.find_integer_copynumber import (hill_climbing_integer_copynumber_fixdiploid, + hill_climbing_integer_copynumber_oneclone) +from calicost.hmm_NB_BB_phaseswitch import (combine_similar_states_across_clones, + similarity_components_rdrbaf_neymanpearson) +from calicost.hmrf import (aggr_hmrf_reassignment, aggr_hmrfmix_reassignment, + hmrf_concatenate_pipeline, hmrf_reassignment_posterior, + hmrfmix_concatenate_pipeline, hmrfmix_reassignment_posterior, + merge_by_minspots) +from calicost.phasing import pipeline_baum_welch +from calicost.utils_hmrf import (load_hmrf_last_iteration, rectangle_initialize_initial_clone, + rectangle_initialize_initial_clone_mix, reorder_results) +from calicost.utils_IO import bin_selection_basedon_normal, expand_df_cnv, filter_de_genes_tri +from calicost.utils_plotting import (argparse, merge_pseudobulk_by_index, + merge_pseudobulk_by_index_mix, plot_acn_from_df, + plot_acn_from_df_anotherscheme, plot_clones_in_space, + plot_individual_spots_in_space, plot_rdr_baf, plt, + read_configuration_file, read_joint_configuration_file) +""" + +logger = logging.getLogger("calicost") +logger.setLevel(logging.INFO) + +handler = logging.StreamHandler(sys.stdout) +fhandler = logging.FileHandler('calicost.log', mode="w") + +formatter = logging.Formatter("%(asctime)s - %(process)d - %(levelname)s - %(name)s:%(lineno)d - %(message)s") + +handler.setFormatter(formatter) +fhandler.setFormatter(formatter) + +logger.addHandler(handler) +logger.addHandler(fhandler) def main(configuration_file): + start = datetime.datetime.now() + try: config = read_configuration_file(configuration_file) except: config = read_joint_configuration_file(configuration_file) - print("Configurations:") + + logger.info("Configuration settings:") + for k in sorted(list(config.keys())): - print(f"\t{k} : {config[k]}") + logger.info(f"\t{k} : {config[k]}") + + # NB assuming the B-allele counts are calculated by the cellsnp-lite & Eagle pipeline. If assuming each spot contains + # a mixture of normal/tumor cells, the tumor proportion path should be provided in the config file. + # + # NB load data: + # - If the data is loaded for the first time: infer phasing using phase-switch HMM + # (hmm_NB_BB_phaseswitch.py & phasing.py) with output initial_phase.npz, matrices + # in /parsed_inputs + # + # - If the data is already loaded: load the matrices from parsed_inputs folder + + logger.info(f"Running parse and load.") - # Assuming the B counts are calculated by the cellsnp-lite and Eagle pipeline - # If assuming each spot contains a mixture of normal/tumor cells, the tumor proportion should be provided in the config file. - # load data - ## If the data is loaded for the first time: infer phasing using phase-switch HMM (hmm_NB_BB_phaseswitch.py and phasing.py) -> output initial_phase.npz, matrices in parsed_inputs folder - ## If the data is already loaded: load the matrices from parsed_inputs folder ( lengths, single_X, @@ -63,25 +104,32 @@ def main(configuration_file): exp_counts, ) = run_parse_n_load(config) - """ - Initial clustering spots using only BAF values. - """ - # setting transcript count to 0, and baseline so that emission probability calculation will ignore them. + logger.info(f"**** Estimating initial clones using BAF only ****") + + # NB setting transcript & baseline count to 0 so the emission probability will be ignored. 
copy_single_X_rdr = copy.copy(single_X[:, 0, :]) copy_single_base_nb_mean = copy.copy(single_base_nb_mean) + single_X[:, 0, :] = 0 single_base_nb_mean[:, :] = 0 - # run HMRF for r_hmrf_initialization in range( config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"] ): + logger.info(f"Processing HMRF random realization {num_hmrf_initialization_start:d}") + outdir = f"{config['output_dir']}/clone{config['n_clones']}_rectangle{r_hmrf_initialization}_w{config['spatial_weight']:.1f}" + outdir = Path(outdir) + if config["tumorprop_file"] is None: + logger.info(f"Initializing clones ignoring tumor proportion.") + initial_clone_index = rectangle_initialize_initial_clone( coords, config["n_clones"], random_state=r_hmrf_initialization ) else: + logger.info(f"Initializing clones based on tumor proportion: {config["tumorprop_file"]}") + initial_clone_index = rectangle_initialize_initial_clone_mix( coords, config["n_clones"], @@ -90,27 +138,30 @@ def main(configuration_file): random_state=r_hmrf_initialization, ) - # create directory - p = subprocess.Popen( - f"mkdir -p {outdir}", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True, - ) - out, err = p.communicate() - # save clone initialization into npz file - prefix = "allspots" - if not Path(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz").exists(): + # NB save clone initialization to npz file + file_name = Path(f"allspots_nstates{config['n_states']}_sp.npz") + file_path = outdir / file_name + + if not file_path.exists(): + logger.info(f"Creating output dir: {str(outdir)}") + + # TODO exist_ok + outdir.mkdir(parents=True, exist_ok=True) + initial_assignment = np.zeros(single_X.shape[2], dtype=int) + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = {"num_iterations": 0, "round-1_assignment": initial_assignment} - np.savez(f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz", **allres) - # run HMRF + HMM - # store the results of each iteration of HMRF in a npz file outdir/prefix_nstates{config['n_states']}_sp.npz - # if a specific iteration is computed, hmrf will directly load the results from the file + np.savez(str(file_path), **{"num_iterations": 0, "round-1_assignment": initial_assignment}) + + # ---- HMRF + HMM ---- + # + # NB stores the results of each HMRF iteration in a .npz @ ./outdir/prefix_nstates{config['n_states']}_sp.npz + # if a specific iteration is already computed, hmrf will load the results directly from the file. 
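+        # NB the branch below chooses between the two HMRF pipelines: without a
+        #    tumorprop_file every spot is treated as belonging to a single clone
+        #    (hmrf_concatenate_pipeline); with one, hmrfmix_concatenate_pipeline also
+        #    receives single_tumor_prop and config["tumorprop_threshold"] so that each
+        #    spot is modelled as a tumor/normal mixture.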
if config["tumorprop_file"] is None: + logger.info("Solving HMRF concatenate pipeline without tumor proportion.") + hmrf_concatenate_pipeline( outdir, prefix, @@ -140,6 +191,8 @@ def main(configuration_file): spatial_weight=config["spatial_weight"], ) else: + logger.info("Solving HMRF concatenate pipeline with tumor proportion.") + hmrfmix_concatenate_pipeline( outdir, prefix, @@ -171,11 +224,13 @@ def main(configuration_file): tumorprop_threshold=config["tumorprop_threshold"], ) - # merge by thresholding BAF profile similarity + logger.info("Loading last HMRF iteration & merging clones based on BAF profile similarity threshold.") + + n_obs = single_X.shape[0] res = load_hmrf_last_iteration( f"{outdir}/{prefix}_nstates{config['n_states']}_sp.npz" ) - n_obs = single_X.shape[0] + if config["tumorprop_file"] is None: X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, @@ -200,8 +255,12 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], ) tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) - # merge "similar" clones from the initial number of clones. - # "similar" defined by Neyman Pearson statistics/ Likelihood ratios P(clone A counts | BAF parameters for clone A) / P(clone A counts | BAF parameters for clone B) + + logger.info("Merged pseudo-bulk based on clone index.") + + # NB ratio == P(clone A counts | BAF parameters for clone A) / P(clone A counts | BAF parameters for clone B) + logger.info("Merging similar initial clones based on Neyman-Pearson Likelihood ratio.") + merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson( X, base_nb_mean, @@ -213,8 +272,10 @@ def main(configuration_file): tumor_prop=tumor_prop, hmmclass=hmm_nophasing_v2, ) - print(f"BAF clone merging after comparing similarity: {merging_groups}") - # + + logger.info(f"BAF clone merging after comparing similarity: {merging_groups}") + logger.info(f"Merging similar initial clones based on min. spot threshold of {config["min_spots_per_clone"]}.") + if config["tumorprop_file"] is None: merging_groups, merged_res = merge_by_minspots( merged_res["new_assignment"], @@ -233,13 +294,20 @@ def main(configuration_file): single_tumor_prop=single_tumor_prop, threshold=config["tumorprop_threshold"], ) - print(f"BAF clone merging after requiring minimum # spots: {merging_groups}") + + logger.info(f"BAF clone merging after requiring minimum # spots: {merging_groups}") + n_baf_clones = len(merging_groups) + + file_path = f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz" + + logger.info(f"Writing merged initial clones to {file_path}") + np.savez( - f"{outdir}/mergedallspots_nstates{config['n_states']}_sp.npz", **merged_res + file_path, **merged_res ) - # load merged results + # NB load merged results n_obs = single_X.shape[0] merged_res = dict( np.load( @@ -247,12 +315,16 @@ def main(configuration_file): allow_pickle=True, ) ) + merged_baf_assignment = copy.copy(merged_res["new_assignment"]) n_baf_clones = len(np.unique(merged_baf_assignment)) + + # TODO comment. pred = np.argmax(merged_res["log_gamma"], axis=0) pred = np.array( [pred[(c * n_obs) : (c * n_obs + n_obs)] for c in range(n_baf_clones)] ) + merged_baf_profiles = np.array( [ np.where( @@ -264,18 +336,23 @@ def main(configuration_file): ] ) - """ - Refined clustering using BAF and RDR values. 
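The Neyman-Pearson merging above asks whether clone A's allele counts are explained about as well by clone B's BAF parameters as by its own. A toy illustration of that ratio, using a plain binomial in place of the beta-binomial emissions the pipeline actually fits:

    import numpy as np
    from scipy.stats import binom

    def np_log_ratio(b_counts, totals, baf_a, baf_b):
        # log P(clone A counts | BAF_A) - log P(clone A counts | BAF_B)
        return binom.logpmf(b_counts, totals, baf_a).sum() - binom.logpmf(b_counts, totals, baf_b).sum()

    rng = np.random.default_rng(0)
    totals = rng.integers(20, 60, size=500)
    b_counts = rng.binomial(totals, 0.32)                 # clone A generated with BAF ~ 0.32
    print(np_log_ratio(b_counts, totals, 0.32, 0.35))     # small ratio: candidates to merge
    print(np_log_ratio(b_counts, totals, 0.32, 0.50))     # large ratio: keep clones separate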
- """ - # adding RDR information + logger.info("Preparing refinement of initial, merged clones using BAF & RDR ****") + if not config["bafonly"]: - # Only used when assuming each spot is pure normal or tumor and if we don't know which spots are normal spots. - # select normal spots + # NB this block only used when assuming each spot is pure normal or pure tumor, + # and if we don't know which spots are normal spots. + # + # NB select normal spots + + logger.info("Identifying normal spots.") + if (config["normalidx_file"] is None) and ( config["tumorprop_file"] is None ): + # TODO hardcode EPS_BAF = 0.05 PERCENT_NORMAL = 40 + vec_stds = np.std(np.log1p(copy_single_X_rdr @ smooth_mat), axis=0) id_nearnormal_clone = np.argmin( np.sum( @@ -283,6 +360,7 @@ def main(configuration_file): axis=1, ) ) + while True: stdthreshold = np.percentile( vec_stds[merged_res["new_assignment"] == id_nearnormal_clone], @@ -298,18 +376,19 @@ def main(configuration_file): ): break PERCENT_NORMAL += 10 + pd.Series(barcodes[normal_candidate == True].index).to_csv( f"{outdir}/normal_candidate_barcodes.txt", header=False, index=False ) elif not config["normalidx_file"] is None: - # single_base_nb_mean has already been added in loading data step. + # NB single_base_nb_mean has been initialized in loading data step (run_parse_n_load - TBC). if not config["tumorprop_file"] is None: logger.warning( - f"Mixed sources of information for normal spots! Using {config['normalidx_file']}" + f"Mixed sources of information for normal spots! Using {config['normalidx_file']}" ) - # If tumor purity is provided, we can use it to select normal spots. + # NB if tumor purity is provided, we can use it to select normal spots. else: for prop_threshold in np.arange(0.05, 0.6, 0.05): normal_candidate = single_tumor_prop < prop_threshold @@ -318,8 +397,13 @@ def main(configuration_file): > single_X.shape[0] * 200 ): break - # To avoid allele-specific expression that are not relevant to CNA, filter bins where normal pseudobulk has large |BAF - 0.5| + + # NB avoid allele-specific expression that is not relevant to CNA by filtering bins where normal + # pseudobulk has large |BAF - 0.5| index_normal = np.where(normal_candidate)[0] + + logger.info("Filtering genomic bins for allele-specific expression based on normal spots.") + ( lengths, single_X, @@ -337,13 +421,18 @@ def main(configuration_file): index_normal, config["geneticmap_file"], ) + assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0] + df_bininfo = genesnp_to_bininfo(df_gene_snp) copy_single_X_rdr = copy.copy(single_X[:, 0, :]) - # If a gene has way higher expression than adjacent genes, its transcript count will dominate RDR values - # To avoid the domination, filter out high-UMI DE genes, which may bias RDR estimates - # Assume the remaining genes will still carry the CNA info. + # NB if a gene has much higher expression than adjacent genes, its transcripts will dominate RDR. + # To avoid this, filter out (high-UMI) DE genes, which may bias estimates, assuming the remaining + # genes will still carry the CNA info. 
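A compact sketch of the adaptive cutoff scan in the tumor-proportion branch above (all values here are made up): the proportion threshold is raised in 0.05 steps until the candidate normal spots contribute enough counts, roughly 200 per genomic bin.

    import numpy as np

    rng = np.random.default_rng(1)
    single_tumor_prop = rng.uniform(0.0, 1.0, size=1000)  # per-spot tumor proportion
    umi_per_spot = rng.poisson(3000, size=1000)           # per-spot transcript counts
    n_bins = 500
    for prop_threshold in np.arange(0.05, 0.6, 0.05):
        normal_candidate = single_tumor_prop < prop_threshold
        if umi_per_spot[normal_candidate].sum() > n_bins * 200:
            break
    print(prop_threshold, int(normal_candidate.sum()))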
+ + logger.info("Filtering genes with expression outliers.") + copy_single_X_rdr, _ = filter_de_genes_tri( exp_counts, df_bininfo, @@ -351,7 +440,10 @@ def main(configuration_file): sample_list=sample_list, sample_ids=sample_ids, ) + + # TODO hardcode MIN_NORMAL_COUNT_PERBIN = 20 + bidx_inconfident = np.where( np.sum(copy_single_X_rdr[:, (normal_candidate == True)], axis=1) < MIN_NORMAL_COUNT_PERBIN @@ -361,19 +453,21 @@ def main(configuration_file): ) rdr_normal[bidx_inconfident] = 0 rdr_normal = rdr_normal / np.sum(rdr_normal) - copy_single_X_rdr[bidx_inconfident, :] = ( - 0 # avoid ill-defined distributions if normal has 0 count in that bin. - ) + + # NB avoid ill-defined distributions if normal has 0 counts in bin. + copy_single_X_rdr[bidx_inconfident, :] = 0 + copy_single_base_nb_mean = rdr_normal.reshape(-1, 1) @ np.sum( copy_single_X_rdr, axis=0 ).reshape(1, -1) - # adding back RDR signal + # NB restore RDR data. single_X[:, 0, :] = copy_single_X_rdr single_base_nb_mean = copy_single_base_nb_mean n_obs = single_X.shape[0] - # save binned data + logger.info(f"Writing {outdir}/binned_data.npz") + np.savez( f"{outdir}/binned_data.npz", lengths=lengths, @@ -386,16 +480,19 @@ def main(configuration_file): ), ) - # run HMRF on each clone individually to further split BAF clone by RDR+BAF signal + logger.info(f"**** Refining initial, merged clones (N={n_baf_clones}) using BAF & RDR ****") + for bafc in range(n_baf_clones): + logger.info(f"Refining BAF clone {bafc}.") + prefix = f"clone{bafc}" idx_spots = np.where(merged_baf_assignment == bafc)[0] - if ( - np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20 - ): # put a minimum B allele read count on pseudobulk to split clones + + # NB put a minimum B allele read count on pseudobulk to split clones + if np.sum(single_total_bb_RD[:, idx_spots]) < single_X.shape[0] * 20: continue - # initialize clone - # write the initialization in a npz file outdir/prefix_nstates{config['n_states']}_smp.npz + + # NB initialize sub-clones within initial, merged BAF clone. 
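The diploid baseline built above (copy_single_base_nb_mean) is an outer product: the normal pseudobulk's per-bin fraction times each spot's total count, i.e. the counts expected if the spot carried no copy-number change. A tiny numeric sketch with toy values:

    import numpy as np

    rdr_normal = np.array([0.2, 0.5, 0.3])                # normal per-bin fractions, sum to 1
    spot_totals = np.array([1000.0, 2000.0])              # total RDR counts of two spots
    base_nb_mean = rdr_normal.reshape(-1, 1) @ spot_totals.reshape(1, -1)
    print(base_nb_mean)                                   # expected bin counts per spot
    print(base_nb_mean.sum(axis=0))                       # each column sums to the spot total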
if config["tumorprop_file"] is None: initial_clone_index = rectangle_initialize_initial_clone( coords[idx_spots], @@ -410,24 +507,27 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], random_state=r_hmrf_initialization, ) - if not Path( - f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz" - ).exists(): + + # NB write the initialization to .npz @ ./outdir/prefix_nstates{config['n_states']}_smp.npz + file_path = Path(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz") + + if not file_path.exists(): initial_assignment = np.zeros(len(idx_spots), dtype=int) + for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c + allres = { "barcodes": barcodes[idx_spots], "num_iterations": 0, "round-1_assignment": initial_assignment, } - np.savez( - f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz", - **allres, - ) + + np.savez(str(file_path), **allres) - # HMRF + HMM using RDR information + # HMRF + HMM with RDR copy_slice_sample_ids = copy.copy(sample_ids[idx_spots]) + if config["tumorprop_file"] is None: hmrf_concatenate_pipeline( outdir, @@ -489,9 +589,11 @@ def main(configuration_file): tumorprop_threshold=config["tumorprop_threshold"], ) - ##### combine results across clones ##### + logger.info(f"Combining results across clones.") + res_combine = {"prev_assignment": np.zeros(single_X.shape[2], dtype=int)} offset_clone = 0 + for bafc in range(n_baf_clones): prefix = f"clone{bafc}" allres = dict( @@ -515,7 +617,9 @@ def main(configuration_file): "prev_assignment": allres[f"round{r-1}_assignment"], "new_assignment": allres[f"round{r}_assignment"], } + idx_spots = np.where(barcodes.isin(allres["barcodes"]))[0] + if len(np.unique(res["new_assignment"])) == 1: n_merged_clones = 1 c = res["new_assignment"][0] @@ -559,6 +663,9 @@ def main(configuration_file): ) ) tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) + + logger.info(f"Merging BAF+RDR clones based on Neyman-Pearson Likelihood ratio.") + merging_groups, merged_res = ( similarity_components_rdrbaf_neymanpearson( X, @@ -572,8 +679,9 @@ def main(configuration_file): hmmclass=hmm_nophasing_v2, ) ) - print(f"part {bafc} merging_groups: {merging_groups}") - # + + logger.info(f"BAF+RDR clone {bafc}: merging_groups={merging_groups}") + if config["tumorprop_file"] is None: merging_groups, merged_res = merge_by_minspots( merged_res["new_assignment"], @@ -594,12 +702,16 @@ def main(configuration_file): single_tumor_prop=single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"], ) - print( - f"part {bafc} merging after requiring minimum # spots: {merging_groups}" + + # TODO what is merging_groups + logger.info( + f"BAF+RDR clone {bafc} merging after requiring minimum # spots: {merging_groups}" ) - # compute posterior using the newly merged pseudobulk + + # NB compute posterior using the newly merged pseudobulk n_merged_clones = len(merging_groups) tmp = copy.copy(merged_res["new_assignment"]) + if config["tumorprop_file"] is None: X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X[:, :, idx_spots], @@ -625,7 +737,9 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], ) ) - # + + logger.info(f"Running Baum-Welch with refined & merged BAF+RDR clones.") + merged_res = pipeline_baum_welch( None, np.vstack( @@ -660,6 +774,9 @@ def main(configuration_file): sample_length=np.ones(X.shape[2], dtype=int) * X.shape[0], ) merged_res["new_assignment"] = copy.copy(tmp) + + logger.info("Combining similar states across clones.") + merged_res = 
combine_similar_states_across_clones( X, base_nb_mean, @@ -689,8 +806,8 @@ def main(configuration_file): for c in range(n_merged_clones) ] ).T - # - # add to res_combine + + # NB add to res_combine if len(res_combine) == 1: res_combine.update( { @@ -739,13 +856,17 @@ def main(configuration_file): merged_res["new_assignment"] + offset_clone ) offset_clone += n_merged_clones - # temp: make dispersions the same across all clones + + # NB temp: make dispersions the same across all clones res_combine["new_alphas"][:, :] = np.max(res_combine["new_alphas"]) res_combine["new_taus"][:, :] = np.min(res_combine["new_taus"]) - # end temp + # NB end temp + n_final_clones = len(np.unique(res_combine["prev_assignment"])) - # per-sample weights across clones + + # NB per-sample weights across clones log_persample_weights = np.zeros((n_final_clones, len(sample_list))) + for sidx in range(len(sample_list)): index = np.where(sample_ids == sidx)[0] this_persample_weight = np.bincount( @@ -757,8 +878,10 @@ def main(configuration_file): log_persample_weights[:, sidx] = log_persample_weights[ :, sidx ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) - # final re-assignment across all clones using estimated RDR + BAF - # The following step may not be needed because of other improvements. And it may cause mistakes in some cases. + + # NB final re-assignment across all clones using estimated RDR + BAF + # The following step may not be needed because of other improvements + # and it may cause errors in some cases. if config["tumorprop_file"] is None: if config["nodepotential"] == "max": pred = np.vstack( @@ -767,6 +890,9 @@ def main(configuration_file): for c in range(res_combine["log_gamma"].shape[2]) ] ).T + + logger.info("Aggregating HMRF reassignment with Viterbi.") + new_assignment, single_llf, total_llf, posterior = ( aggr_hmrf_reassignment( single_X, @@ -785,6 +911,8 @@ def main(configuration_file): ) ) elif config["nodepotential"] == "weighted_sum": + logger.info("Reassigning HMRF posterior.") + new_assignment, single_llf, total_llf, posterior = ( hmrf_reassignment_posterior( single_X, @@ -809,6 +937,9 @@ def main(configuration_file): for c in range(res_combine["log_gamma"].shape[2]) ] ).T + + logger.info("Aggregating HMRF mix reassignment with Viterbi.") + new_assignment, single_llf, total_llf, posterior = ( aggr_hmrfmix_reassignment( single_X, @@ -828,6 +959,8 @@ def main(configuration_file): ) ) elif config["nodepotential"] == "weighted_sum": + logger.info("Reassigning HMRF mix posterior.") + new_assignment, single_llf, total_llf, posterior = ( hmrfmix_reassignment_posterior( single_X, @@ -847,18 +980,25 @@ def main(configuration_file): ) res_combine["total_llf"] = total_llf res_combine["new_assignment"] = new_assignment - # re-order clones such that normal clones are always clone 0 + + # NB re-order clones such that normal clones are always clone 0 res_combine, posterior = reorder_results( res_combine, posterior, single_tumor_prop ) - # save results + + logger.info(f"Writing {outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz") + np.savez( f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", **res_combine, ) + + logger.info(f"Writing {outdir}/posterior_clone_probability.npy") + np.save(f"{outdir}/posterior_clone_probability.npy", posterior) - ##### infer integer copy ##### + logger.info("Inferring integer copy numbers") + res_combine = dict( np.load( f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", @@ -867,11 +1007,13 @@ def main(configuration_file): ) 
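A short sketch of the per-sample clone weights computed above: clone frequencies within one sample, normalised in log space with logsumexp. The 1e-6 floor below only keeps empty clones finite and is an assumption of this toy, not necessarily the pipeline's exact choice.

    import numpy as np
    import scipy.special

    assignments = np.array([0, 0, 1, 2, 2, 2])            # clone label of each spot in one sample
    n_clones = 3
    counts = np.bincount(assignments, minlength=n_clones)
    logw = np.log(np.maximum(counts / counts.sum(), 1e-6))
    logw -= scipy.special.logsumexp(logw)                 # exp(logw) now sums to 1
    print(np.exp(logw))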
final_clone_ids = np.sort(np.unique(res_combine["new_assignment"])) nonempty_clone_ids = copy.copy(final_clone_ids) + # add clone 0 as normal clone if it doesn't appear in final_clone_ids if not (0 in final_clone_ids): final_clone_ids = np.append(0, final_clone_ids) # chr position medfix = ["", "_diploid", "_triploid", "_tetraploid"] + for o, max_medploidy in enumerate([None, 2, 3, 4]): # A/B copy number per bin allele_specific_copy = [] @@ -955,10 +1097,10 @@ def main(configuration_file): finding_distate_failed = True continue - print( + logger.info( f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}" ) - # + allele_specific_copy.append( pd.DataFrame( best_integer_copies[ @@ -977,7 +1119,7 @@ def main(configuration_file): columns=np.arange(n_obs), ) ) - # + state_cnv.append( pd.DataFrame( res_combine["new_log_mu"][:, s].reshape(-1, 1), @@ -985,6 +1127,7 @@ def main(configuration_file): index=np.arange(config["n_states"]), ) ) + state_cnv.append( pd.DataFrame( res_combine["new_p_binom"][:, s].reshape(-1, 1), @@ -1006,7 +1149,7 @@ def main(configuration_file): index=np.arange(config["n_states"]), ) ) - # + # DEPRECATE # tmpdf = get_genelevel_cnv_oneclone(best_integer_copies[res_combine["pred_cnv"][:,s], 0], best_integer_copies[res_combine["pred_cnv"][:,s], 1], x_gene_list) # tmpdf.columns = [f"clone{s} A", f"clone{s} B"] bin_Acopy_mappers = { @@ -1042,13 +1185,15 @@ def main(configuration_file): ) if len(state_cnv) == 0: continue - # output gene-level copy number + + # NB output gene-level copy number df_genelevel_cnv.to_csv( f"{outdir}/cnv{medfix[o]}_genelevel.tsv", header=True, index=True, sep="\t", ) + # output segment-level copy number allele_specific_copy = pd.concat(allele_specific_copy) df_seglevel_cnv = pd.DataFrame( @@ -1091,22 +1236,21 @@ def main(configuration_file): # smooth_mat, adjacency_mat, res_combine["new_assignment"], sample_ids, base_nb_mean, log_persample_weights, config["spatial_weight"], hmmclass=hmm_nophasing_v2) # df_posterior.to_pickle(f"{outdir}/posterior{medfix[o]}.pkl") - ##### output clone label ##### df_clone_label = pd.DataFrame( {"clone_label": res_combine["new_assignment"]}, index=barcodes ) if not config["tumorprop_file"] is None: df_clone_label["tumor_proportion"] = single_tumor_prop + + logger.info(f"Writing clone labels to {outdir}/clone_labels.tsv") + df_clone_label.to_csv( f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t" ) - ##### plotting ##### - # make a directory for plots - p = subprocess.Popen(f"mkdir -p {outdir}/plots", shell=True) - out, err = p.communicate() + Path(f"{outdir}/plots").mkdir(parents=True, exist_ok=True) - # plot RDR and BAF + # NB plot RDR and BAF. cn_file = f"{outdir}/cnv_diploid_seglevel.tsv" fig = plot_rdr_baf( configuration_file, @@ -1125,7 +1269,8 @@ def main(configuration_file): transparent=True, bbox_inches="tight", ) - # plot allele-specific copy number + + # NB plot allele-specific copy number for o, max_medploidy in enumerate([None, 2, 3, 4]): cn_file = f"{outdir}/cnv{medfix[o]}_seglevel.tsv" if not Path(cn_file).exists(): @@ -1262,4 +1407,4 @@ def main(configuration_file): ) args = parser.parse_args() - main(args.configfile) + main(args.configfile) \ No newline at end of file From 00813b3104545df1123010ebfa26791fb1a6eb14 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 11:26:34 -0400 Subject: [PATCH 006/125] add logging info for hmrf_concatenate_pipeline. 
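Regarding the hill-climbing integer copy-number step in the preceding hunk: the idea is to pick integer allele copies whose implied RDR and BAF match the HMM state estimates. The brute-force enumeration below is only a toy stand-in for CalicoST's hill-climbing routines; the function name and the squared-error loss are made up for illustration.

    import itertools
    import numpy as np

    def best_integer_pair(rdr, baf, base_ploidy=2, max_copy=6):
        best, best_loss = None, np.inf
        for a, b in itertools.product(range(max_copy + 1), repeat=2):
            if a + b == 0:
                continue
            loss = (rdr - (a + b) / base_ploidy) ** 2 + (baf - b / (a + b)) ** 2
            if loss < best_loss:
                best, best_loss = (a, b), loss
        return best, best_loss

    print(best_integer_pair(rdr=1.5, baf=1.0 / 3))        # a 2+1 state fits these values exactly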
--- src/calicost/hmrf.py | 94 ++++++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 29 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index ccc8f0c..e61f13e 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1,32 +1,33 @@ +import copy import logging -from turtle import reset +import warnings +from pathlib import Path +# from turtle import reset + +# import networkx as nx import numpy as np import pandas as pd -from numba import njit -import scipy.special import scipy.sparse -from sklearn.mixture import GaussianMixture +import scipy.special +from numba import njit from sklearn.cluster import KMeans from sklearn.metrics import adjusted_rand_score, silhouette_score +from sklearn.mixture import GaussianMixture from sklearn.neighbors import kneighbors_graph -import networkx as nx +from statsmodels.tools.sm_exceptions import ValueWarning from tqdm import trange -import copy -from pathlib import Path + from calicost.hmm_NB_BB_phaseswitch import * from calicost.utils_distribution_fitting import * -from calicost.utils_IO import * from calicost.utils_hmrf import * +from calicost.utils_IO import * -import warnings -from statsmodels.tools.sm_exceptions import ValueWarning - +logger = logging.getLogger(__name__) ############################################################ # Pure clone ############################################################ - def hmrf_reassignment_posterior( single_X, single_base_nb_mean, @@ -813,9 +814,12 @@ def hmrf_concatenate_pipeline( unit_ysquared=3, spatial_weight=1.0, ): + logger.info("Solving hmrf_concatenate_pipeline.") + n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) - # checking input + + # NB checking input assert not (coords is None and adjacency_mat is None) if adjacency_mat is None: adjacency_mat = compute_adjacency_mat(coords, unit_xsquared, unit_ysquared) @@ -827,13 +831,18 @@ def hmrf_concatenate_pipeline( n_samples = len(unique_sample_ids) tmp_map_index = {unique_sample_ids[i]: i for i in range(len(unique_sample_ids))} sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) + log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) - # pseudobulk + + logger.info("Merging pseudobulk by clone index") + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index ) - # initialize HMM parameters by GMM + if (init_log_mu is None) or (init_p_binom is None): + logger.info("Initializing HMM parameters by GMM") + init_log_mu, init_p_binom = initialization_by_gmm( n_states, np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( @@ -846,7 +855,10 @@ def hmrf_concatenate_pipeline( in_log_space=False, only_minor=False, ) - # initialization parameters for HMM + else: + logger.info("Using provided HMM initialization parameters") + + # NB initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu last_p_binom = init_p_binom @@ -862,14 +874,23 @@ def hmrf_concatenate_pipeline( for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c - # HMM + logger.info(f"Computing HMM for {max_iter_outer} iterations.") + for r in range(max_iter_outer): - # assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. 
When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should contain two keys: "num_iterations" and f"round_-1_assignment" for clone initialization + # NB assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. + # When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should + # contain two keys: "num_iterations" and f"round_-1_assignment" for clone initialization + logger.info(f"Loading {outdir}/{prefix}_nstates{n_states}_{params}.npz") + allres = np.load( f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True ) allres = dict(allres) + + # TODO reads in existing iteration results if required. if allres["num_iterations"] > r: + logger.info(f"Loading pre-computed HMM results for iteration {r}.") + res = { "new_log_mu": allres[f"round{r}_new_log_mu"], "new_alphas": allres[f"round{r}_new_alphas"], @@ -885,6 +906,8 @@ def hmrf_concatenate_pipeline( "new_assignment": allres[f"round{r}_assignment"], } else: + logger.info(f"Computing HMM iteration {r}.") + res = pipeline_baum_welch( None, np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( @@ -912,8 +935,11 @@ def hmrf_concatenate_pipeline( tol=tol, ) pred = np.argmax(res["log_gamma"], axis=0) - # HMRF clone assignmment + + # NB HMRF clone assignmment if nodepotential == "max": + logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrf_reassignment_concatenate.") + new_assignment, single_llf, total_llf = ( aggr_hmrf_reassignment_concatenate( single_X, @@ -931,6 +957,8 @@ def hmrf_concatenate_pipeline( ) ) elif nodepotential == "weighted_sum": + logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrf_reassignment_posterior_concatenate.") + new_assignment, single_llf, total_llf = ( hmrf_reassignment_posterior_concatenate( single_X, @@ -947,8 +975,9 @@ def hmrf_concatenate_pipeline( ) ) else: - raise Exception("Unknown mode for nodepotential!") - # handle the case when one clone has zero spots + raise ValueError("Unknown mode for nodepotential!") + + # NB handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) @@ -959,10 +988,11 @@ def hmrf_concatenate_pipeline( ) res["log_gamma"] = res["log_gamma"][:, concat_idx] res["pred_cnv"] = res["pred_cnv"][concat_idx] - # + res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf + # append to allres for k, v in res.items(): if k == "prev_assignment": @@ -971,10 +1001,15 @@ def hmrf_concatenate_pipeline( allres[f"round{r}_assignment"] = v else: allres[f"round{r}_{k}"] = v + allres["num_iterations"] = r + 1 + + logger.info(f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz") + np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) - # - # regroup to pseudobulk + + logger.info(f"Regrouping to pseudobulk for iteration {r}.") + clone_index = [ np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) @@ -982,9 +1017,9 @@ def hmrf_concatenate_pipeline( X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, clone_index ) - # + if "mp" in params: - print( + logger.info( "outer iteration {}: difference between parameters = {}, {}".format( r, np.mean(np.abs(last_log_mu - res["new_log_mu"])), @@ -992,18 +1027,19 @@ def hmrf_concatenate_pipeline( ) ) elif "m" in params: - print( + logger.info( 
"outer iteration {}: difference between NB parameters = {}".format( r, np.mean(np.abs(last_log_mu - res["new_log_mu"])) ) ) elif "p" in params: - print( + logger.info( "outer iteration {}: difference between BetaBinom parameters = {}".format( r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) ) ) - print( + + logger.info( "outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) From 0ecfca4f9d3cc8cbaa5b59a27daafd6f7cd8cb45 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 11:37:33 -0400 Subject: [PATCH 007/125] add logging for hmrfmix_concatenate_pipeline --- src/calicost/calicost_main.py | 4 ++ src/calicost/hmrf.py | 73 +++++++++++++++++++++++++++-------- 2 files changed, 60 insertions(+), 17 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index b6985f4..10aee62 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -1395,6 +1395,10 @@ def main(configuration_file): bbox_inches="tight", ) + end = datetime.datetime.now() + runtime = end - start + + logging.info(f"Complete in {runtime} [seconds].") if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index e61f13e..6630068 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1862,9 +1862,12 @@ def hmrfmix_concatenate_pipeline( spatial_weight=1.0 / 6, tumorprop_threshold=0.5, ): + logger.info("Solving hmrfix_concatenate_pipeline.") + n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) - # spot adjacency matric + + # NB checking inputs assert not (coords is None and adjacency_mat is None) if adjacency_mat is None: adjacency_mat = compute_adjacency_mat(coords, unit_xsquared, unit_ysquared) @@ -1877,7 +1880,9 @@ def hmrfmix_concatenate_pipeline( tmp_map_index = {unique_sample_ids[i]: i for i in range(len(unique_sample_ids))} sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * (-np.log(n_clones)) - # pseudobulk + + logger.info("Merging pseudobulk by clone index") + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( single_X, single_base_nb_mean, @@ -1886,10 +1891,13 @@ def hmrfmix_concatenate_pipeline( single_tumor_prop, threshold=tumorprop_threshold, ) - # baseline proportion of UMI counts + + # NB baseline proportion of UMI counts lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) - # initialize HMM parameters by GMM + if (init_log_mu is None) or (init_p_binom is None): + logger.info("Initializing HMM parameters by GMM") + init_log_mu, init_p_binom = initialization_by_gmm( n_states, np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( @@ -1902,7 +1910,10 @@ def hmrfmix_concatenate_pipeline( in_log_space=False, only_minor=False, ) - # initialization parameters for HMM + else: + logger.info("Using provided HMM initialization parameters") + + # NB initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu last_p_binom = init_p_binom @@ -1918,14 +1929,26 @@ def hmrfmix_concatenate_pipeline( for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c - # HMM + logger.info(f"Computing HMM for {max_iter_outer} iterations.") + for r in range(max_iter_outer): - # assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. 
When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should contain two keys: "num_iterations" and f"round_-1_assignment" for clone initialization + """ + NB assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. + When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should + contain two keys: "num_iterations" and f"round_-1_assignment" for clone + initialization + """ + logger.info(f"Loading {outdir}/{prefix}_nstates{n_states}_{params}.npz") + allres = np.load( f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", allow_pickle=True ) allres = dict(allres) + + # TODO reads in existing iteration results if required. if allres["num_iterations"] > r: + logger.info(f"Loading pre-computed HMM results for iteration {r}.") + res = { "new_log_mu": allres[f"round{r}_new_log_mu"], "new_alphas": allres[f"round{r}_new_alphas"], @@ -1943,8 +1966,12 @@ def hmrfmix_concatenate_pipeline( else: sample_length = np.ones(X.shape[2], dtype=int) * X.shape[0] remain_kwargs = {"sample_length": sample_length, "lambd": lambd} + if f"round{r-1}_log_gamma" in allres: remain_kwargs["log_gamma"] = allres[f"round{r-1}_log_gamma"] + + logger.info(f"Computing HMM iteration {r}.") + res = pipeline_baum_welch( None, np.vstack([X[:, 0, :].flatten("F"), X[:, 1, :].flatten("F")]).T.reshape( @@ -1973,9 +2000,13 @@ def hmrfmix_concatenate_pipeline( tol=tol, **remain_kwargs, ) + pred = np.argmax(res["log_gamma"], axis=0) - # clone assignmment + + # NB HMRF clone assignmment if nodepotential == "max": + logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfix_reassignment_concatenate.") + new_assignment, single_llf, total_llf = ( aggr_hmrfmix_reassignment_concatenate( single_X, @@ -1994,6 +2025,8 @@ def hmrfmix_concatenate_pipeline( ) ) elif nodepotential == "weighted_sum": + logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfix_reassignment_posterior_concatenate.") + new_assignment, single_llf, total_llf = ( hmrfmix_reassignment_posterior_concatenate( single_X, @@ -2011,8 +2044,9 @@ def hmrfmix_concatenate_pipeline( ) ) else: - raise Exception("Unknown mode for nodepotential!") - # handle the case when one clone has zero spots + raise ValueError("Unknown mode for nodepotential!") + + # NB handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) @@ -2036,9 +2070,13 @@ def hmrfmix_concatenate_pipeline( else: allres[f"round{r}_{k}"] = v allres["num_iterations"] = r + 1 + + logger.info(f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz") + np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) - # - # regroup to pseudobulk + + logger.info(f"Regrouping to pseudobulk for iteration {r}.") + clone_index = [ np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) @@ -2051,9 +2089,9 @@ def hmrfmix_concatenate_pipeline( single_tumor_prop, threshold=tumorprop_threshold, ) - # + if "mp" in params: - print( + logger.info( "outer iteration {}: difference between parameters = {}, {}".format( r, np.mean(np.abs(last_log_mu - res["new_log_mu"])), @@ -2061,18 +2099,19 @@ def hmrfmix_concatenate_pipeline( ) ) elif "m" in params: - print( + logger.info( "outer iteration {}: difference between NB parameters = {}".format( r, np.mean(np.abs(last_log_mu - res["new_log_mu"])) ) ) elif "p" in params: - print( + logger.info( "outer iteration {}: difference between 
BetaBinom parameters = {}".format( r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) ) ) - print( + + logger.info( "outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) From 7eb9ba1fd5bb2ad174170601620f533d38fcd26c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 11:50:33 -0400 Subject: [PATCH 008/125] add logging for hmm_NB_BB_nophasing --- src/calicost/hmm_NB_BB_nophasing.py | 91 ++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 29 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index 2a262aa..2450d2f 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -1,25 +1,26 @@ +import copy import logging + +import networkx as nx import numpy as np -from numba import njit -from scipy.stats import norm, multivariate_normal, poisson import scipy.special -from scipy.optimize import minimize -from scipy.optimize import Bounds -from sklearn.mixture import GaussianMixture -from tqdm import trange import statsmodels.api as sm +from numba import njit +from scipy.optimize import Bounds, minimize +from scipy.stats import multivariate_normal, norm, poisson +from sklearn.mixture import GaussianMixture from statsmodels.base.model import GenericLikelihoodModel -import copy +from tqdm import trange + from calicost.utils_distribution_fitting import * from calicost.utils_hmm import * -import networkx as nx +logger = logging.getLogger(__name__) ############################################################ # whole inference ############################################################ - class hmm_nophasing(object): def __init__(self, params="stmp", t=1 - 1e-4): """ @@ -34,7 +35,6 @@ def __init__(self, params="stmp", t=1 - 1e-4): self.params = params self.t = t - # @staticmethod def compute_emission_probability_nb_betabinom( X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus @@ -68,16 +68,20 @@ def compute_emission_probability_nb_betabinom( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ + logger.info("Computing emission probability for negative binomial & beta binomial.") + n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] n_states = log_mu.shape[0] - # initialize log_emission + + # NB initialize log_emission log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) log_emission_baf = np.zeros((n_states, n_obs, n_spots)) + for i in np.arange(n_states): for s in np.arange(n_spots): - # expression from NB distribution + # NB expression from NB distribution. Mask is used explicity to separate BAF and BAF+RDR. 
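A tiny standalone sketch of the masking convention mentioned in the comment above (toy arrays, not the real data layout): bins whose baseline is zero are simply skipped, which is why zeroing single_base_nb_mean in calicost_main turns the RDR term off while leaving the BAF term untouched.

    import numpy as np

    base_nb_mean = np.array([5.0, 0.0, 8.0])              # bin 1 deliberately masked out
    log_emission_rdr = np.zeros(3)
    idx_nonzero = np.where(base_nb_mean > 0)[0]
    log_emission_rdr[idx_nonzero] = -1.23                  # stand-in for the NB log-pmf of kept bins
    print(log_emission_rdr)                                # masked bin contributes log-probability 0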
idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: nb_mean = base_nb_mean[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) @@ -86,7 +90,7 @@ def compute_emission_probability_nb_betabinom( log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( X[idx_nonzero_rdr, 0, s], n, p ) - # AF from BetaBinom distribution + # NB AF from BetaBinom distribution idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: log_emission_baf[i, idx_nonzero_baf, s] = ( @@ -97,9 +101,11 @@ def compute_emission_probability_nb_betabinom( (1 - p_binom[i, s]) * taus[i, s], ) ) + + logger.info("Computed emission probability for negative binomial & beta binomial.") + return log_emission_rdr, log_emission_baf - # @staticmethod def compute_emission_probability_nb_betabinom_mix( X, @@ -141,10 +147,13 @@ def compute_emission_probability_nb_betabinom_mix( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ + logger.info("Computing emission probability for *mixed* negative binomial & beta binommial.") + n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] n_states = log_mu.shape[0] + # initialize log_emission log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) log_emission_baf = np.zeros((n_states, n_obs, n_spots)) @@ -183,9 +192,11 @@ def compute_emission_probability_nb_betabinom_mix( mix_p_A * taus[i, s], mix_p_B * taus[i, s], ) + + logger.info("Computed emission probability for *mixed* negative binomial & beta binommial.") + return log_emission_rdr, log_emission_baf - # @staticmethod @njit def forward_lattice( @@ -230,7 +241,6 @@ def forward_lattice( cumlen += le return log_alpha - # @staticmethod @njit def backward_lattice( @@ -276,7 +286,6 @@ def backward_lattice( cumlen += le return log_beta - # def run_baum_welch_nb_bb( self, X, @@ -314,7 +323,9 @@ def run_baum_welch_nb_bb( n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 - # initialize NB logmean shift and BetaBinom prob + + logger.info("Initialize NB logmean shift, BetaBinom prob and dispersion param inverse.") + log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None @@ -325,29 +336,37 @@ def run_baum_welch_nb_bb( if init_p_binom is None else init_p_binom ) - # initialize (inverse of) dispersion param in NB and BetaBinom + + # NB initialize (inverse of) dispersion param in NB and BetaBinom alphas = ( 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus - # initialize start probability and emission probability + + # NB initialize start probability and emission prob. 
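One numeric check of the negative-binomial parameterisation used in the emission code above, assuming the usual variance form mean + alpha * mean**2 for dispersion alpha; the intermediate nb_std, n, p names mirror that convention rather than the exact elided lines.

    import numpy as np
    import scipy.stats

    nb_mean, alpha = 10.0, 0.1
    nb_std = np.sqrt(nb_mean + alpha * nb_mean ** 2)
    n = nb_mean ** 2 / (nb_std ** 2 - nb_mean)             # equals 1 / alpha
    p = nb_mean / nb_std ** 2                               # equals n / (n + nb_mean)
    dist = scipy.stats.nbinom(n, p)
    print(n, p, dist.mean(), dist.var())                    # mean back to 10, variance to 20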
log_startprob = np.log(np.ones(n_states) / n_states) + if n_states > 1: transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: log_transmat = np.zeros((1, 1)) - # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) + + # NB trick to speed up BetaBinom optimization: taking only unique values of + # (B allele count, total SNP covering read count) + logger.info("Constructing unique values matrix for NB and BB.") + unique_values_nb, mapping_matrices_nb = construct_unique_matrix( X[:, 0, :], base_nb_mean ) unique_values_bb, mapping_matrices_bb = construct_unique_matrix( X[:, 1, :], total_bb_RD ) - # EM algorithm - for r in trange(max_iter): - # E step + + for r in trange(max_iter, desc="EM algorithm"): + logger.info(f"Calculating E-step for iteration {r} of {max_iter}.") + if tumor_prop is None: log_emission_rdr, log_emission_baf = ( hmm_nophasing.compute_emission_probability_nb_betabinom( @@ -369,6 +388,7 @@ def run_baum_welch_nb_bb( ) ) log_emission = log_emission_rdr + log_emission_baf + log_alpha = hmm_nophasing.forward_lattice( lengths, log_transmat, @@ -376,6 +396,7 @@ def run_baum_welch_nb_bb( log_emission, log_sitewise_transmat, ) + log_beta = hmm_nophasing.backward_lattice( lengths, log_transmat, @@ -383,20 +404,26 @@ def run_baum_welch_nb_bb( log_emission, log_sitewise_transmat, ) + log_gamma = compute_posterior_obs(log_alpha, log_beta) + log_xi = compute_posterior_transition_nophasing( log_alpha, log_beta, log_transmat, log_emission ) - # M step + + logger.info(f"Calculating M-step for iteration {r} of {max_iter}.") + if "s" in self.params: new_log_startprob = update_startprob_nophasing(lengths, log_gamma) new_log_startprob = new_log_startprob.flatten() else: new_log_startprob = log_startprob + if "t" in self.params: new_log_transmat = update_transition_nophasing(log_xi, is_diag=is_diag) else: new_log_transmat = log_transmat + if "m" in self.params: if tumor_prop is None: new_log_mu, new_alphas = ( @@ -426,6 +453,7 @@ def run_baum_welch_nb_bb( else: new_log_mu = log_mu new_alphas = alphas + if "p" in self.params: if tumor_prop is None: new_p_binom, new_taus = ( @@ -455,26 +483,31 @@ def run_baum_welch_nb_bb( else: new_p_binom = p_binom new_taus = taus - # check convergence - print( + + logger.info( + "EM convergence metrics", np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), np.mean(np.abs(new_log_mu - log_mu)), np.mean(np.abs(new_p_binom - p_binom)), ) - print(np.hstack([new_log_mu, new_p_binom])) + + logger.info(np.hstack([new_log_mu, new_p_binom])) + if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol ): break + log_startprob = new_log_startprob log_transmat = new_log_transmat log_mu = new_log_mu alphas = new_alphas p_binom = new_p_binom taus = new_taus + return ( new_log_mu, new_alphas, @@ -483,4 +516,4 @@ def run_baum_welch_nb_bb( new_log_startprob, new_log_transmat, log_gamma, - ) + ) \ No newline at end of file From 3b10e3f01f95f04c89ff199e1e78ecad2ab192e9 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 11:58:11 -0400 Subject: [PATCH 009/125] adding logging for update params in utils_hmm --- src/calicost/utils_hmm.py | 55 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 2a22f4d..9145ae5 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -1,11 +1,15 @@ -import numpy as np -from numba import njit import copy +import logging + +import numpy as np import scipy.special -from tqdm import trange +from numba import njit from sklearn.mixture import GaussianMixture +from tqdm import trange + from calicost.utils_distribution_fitting import * +logger = logging.getLogger(__name__) @njit def np_max_ax_squeeze(arr, axis=0): @@ -462,9 +466,12 @@ def update_emission_params_nb_sitewise_uniqvalues( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ + logger.info("Computing emission params for Negative Binomial (sitewise, unique).") + n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) + # initialization new_log_mu = ( copy.copy(start_log_mu) @@ -472,6 +479,7 @@ def update_emission_params_nb_sitewise_uniqvalues( else np.zeros((n_states, n_spots)) ) new_alphas = copy.copy(alphas) + # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) @@ -617,6 +625,9 @@ def update_emission_params_nb_sitewise_uniqvalues( new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr + + logger.info("Computed emission params for Negative Binomial (sitewise, unique).") + return new_log_mu, new_alphas @@ -645,6 +656,8 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ + logger.info("Computing emission params for Negative Binomial Mix (sitewise, unique).") + n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) @@ -822,6 +835,9 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr + + logger.info("Computed emission params for Negative Binomial Mix (sitewise, unique).") + return new_log_mu, new_alphas @@ -850,6 +866,8 @@ def update_emission_params_bb_sitewise_uniqvalues( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ + logger.info("Computing emission params for Beta Binomial (sitewise, unique).") + n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) @@ -1042,6 +1060,9 @@ def update_emission_params_bb_sitewise_uniqvalues( new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob + + logger.info("Computed emission params for Beta Binomial (sitewise, unique).") + return new_p_binom, new_taus @@ -1071,6 +1092,8 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. 
""" + logger.info("Computing emission params for Beta Binomial Mix (sitewise, unique).") + n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) @@ -1293,6 +1316,9 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob + + logger.info("Computed emission params for Beta Binomial Mix (sitewise, unique).") + return new_p_binom, new_taus @@ -1381,6 +1407,9 @@ def update_emission_params_nb_nophasing_uniqvalues( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ + + logger.info("Computing emission params for Negative Binomial (no phasing, unique).") + n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) @@ -1532,6 +1561,9 @@ def update_emission_params_nb_nophasing_uniqvalues( new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr + + logger.info("Computed emission params for Negative Binomial (no phasing, unique).") + return new_log_mu, new_alphas @@ -1559,6 +1591,8 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ + logger.info("Computing emission params for Negative Binomial Mix (no phasing, unique).") + n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) @@ -1733,6 +1767,9 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( new_alphas[:, :] = res2.params[-1] new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr + + logger.info("Computed emission params for Negative Binomial Mix (no phasing, unique).") + return new_log_mu, new_alphas @@ -1760,6 +1797,8 @@ def update_emission_params_bb_nophasing_uniqvalues( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ + logger.info("Computing emission params for Beta Binomial (no phasing, unique).") + n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) @@ -1912,6 +1951,9 @@ def update_emission_params_bb_nophasing_uniqvalues( new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob + + logger.info("Computed emission params for Beta Binomial (no phasing, unique).") + return new_p_binom, new_taus @@ -1940,6 +1982,8 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ + logger.info("Computing emission params for Beta Binomial Mix (no phasing, unique).") + n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) @@ -2121,4 +2165,7 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( new_taus[:, :] = res2.params[-1] new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob - return new_p_binom, new_taus + + logger.info("Computed emission params for Beta Binomial Mix (no phasing, unique).") + + return new_p_binom, new_taus \ No newline at end of file From a5ddc3992c37dedabc3fc2dc90d4f329e2aeef73 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 12:09:12 -0400 Subject: [PATCH 010/125] add logging of hmm_NB_BB_nophasing_v2 --- src/calicost/hmm_NB_BB_nophasing.py | 4 +- src/calicost/hmm_NB_BB_nophasing_v2.py | 75 +++++++++++++++++--------- 2 files changed, 53 insertions(+), 26 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index 2450d2f..32d94f3 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -324,7 +324,7 @@ def run_baum_welch_nb_bb( n_spots = X.shape[2] assert n_comp == 2 - logger.info("Initialize NB logmean shift, BetaBinom prob and dispersion param inverse.") + logger.info("Initialize Baum-Welch NB logmean shift, BetaBinom prob and dispersion param inverse.") log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T @@ -508,6 +508,8 @@ def run_baum_welch_nb_bb( p_binom = new_p_binom taus = new_taus + logger.info("Computed Baum-Welch (v2).") + return ( new_log_mu, new_alphas, diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 2563834..a4408f6 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -1,18 +1,21 @@ +import copy import logging + +import networkx as nx import numpy as np -from numba import njit -from scipy.stats import norm, multivariate_normal, poisson import scipy.special -from scipy.optimize import minimize -from scipy.optimize import Bounds -from sklearn.mixture import GaussianMixture -from tqdm import trange import statsmodels.api as sm +from numba import njit +from scipy.optimize import Bounds, minimize +from scipy.stats import multivariate_normal, norm, poisson +from sklearn.mixture import GaussianMixture from statsmodels.base.model import GenericLikelihoodModel -import copy +from tqdm import trange + from calicost.utils_distribution_fitting import * from calicost.utils_hmm import * -import networkx as nx + +logger = logging.getLogger(__name__) """ Joint NB-BB HMM that accounts for tumor/normal genome proportions. Tumor genome proportion is weighted by mu in BB distribution. @@ -37,7 +40,6 @@ def __init__(self, params="stmp", t=1 - 1e-4): self.params = params self.t = t - # @staticmethod def compute_emission_probability_nb_betabinom( X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus @@ -71,6 +73,8 @@ def compute_emission_probability_nb_betabinom( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ + logger.info("Computing emission probability for negative binomial & beta binomial.") + n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] @@ -80,7 +84,7 @@ def compute_emission_probability_nb_betabinom( log_emission_baf = np.zeros((n_states, n_obs, n_spots)) for i in np.arange(n_states): for s in np.arange(n_spots): - # expression from NB distribution + # NB expression from NB distribution. Mask is used explicity to separate BAF and BAF+RDR. 
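The module docstring above states that the tumor proportion weights the beta-binomial mean; the sketch below assumes that weighting is a simple linear mixture between the clone BAF and the balanced value 0.5 (an assumption of this illustration), which shows how partial tumor content dilutes allelic imbalance toward 0.5.

    p_clone = 0.2                                           # clone BAF under some CNA state
    for tumor_prop in (1.0, 0.6, 0.2):
        mix_p = tumor_prop * p_clone + (1.0 - tumor_prop) * 0.5
        print(tumor_prop, round(mix_p, 3))                  # 0.2, 0.32, 0.44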
idx_nonzero_rdr = np.where(base_nb_mean[:, s] > 0)[0] if len(idx_nonzero_rdr) > 0: nb_mean = base_nb_mean[idx_nonzero_rdr, s] * np.exp(log_mu[i, s]) @@ -89,7 +93,7 @@ def compute_emission_probability_nb_betabinom( log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf( X[idx_nonzero_rdr, 0, s], n, p ) - # AF from BetaBinom distribution + # NB AF from BetaBinom distribution idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] if len(idx_nonzero_baf) > 0: log_emission_baf[i, idx_nonzero_baf, s] = ( @@ -100,9 +104,11 @@ def compute_emission_probability_nb_betabinom( (1 - p_binom[i, s]) * taus[i, s], ) ) + + logger.info("Computed emission probability for negative binomial & beta binomial.") + return log_emission_rdr, log_emission_baf - # @staticmethod def compute_emission_probability_nb_betabinom_mix( X, @@ -144,6 +150,8 @@ def compute_emission_probability_nb_betabinom_mix( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ + logger.info("Computing emission probability for *mixed* negative binomial & beta binommial.") + n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] @@ -202,9 +210,11 @@ def compute_emission_probability_nb_betabinom_mix( mix_p_A * taus[i, s], mix_p_B * taus[i, s], ) + + logger.info("Computed emission probability for *mixed* negative binomial & beta binommial.") + return log_emission_rdr, log_emission_baf - # @staticmethod @njit def forward_lattice( @@ -249,7 +259,6 @@ def forward_lattice( cumlen += le return log_alpha - # @staticmethod @njit def backward_lattice( @@ -295,7 +304,6 @@ def backward_lattice( cumlen += le return log_beta - # def run_baum_welch_nb_bb( self, X, @@ -332,7 +340,9 @@ def run_baum_welch_nb_bb( n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 - # initialize NB logmean shift and BetaBinom prob + + logger.info("Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse.") + log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None @@ -343,12 +353,13 @@ def run_baum_welch_nb_bb( if init_p_binom is None else init_p_binom ) - # initialize (inverse of) dispersion param in NB and BetaBinom + # NB initialize (inverse of) dispersion param in NB and BetaBinom alphas = ( 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus - # initialize start probability and emission probability + + # NB initialize start probability and emission probability log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) @@ -358,16 +369,19 @@ def run_baum_welch_nb_bb( log_transmat = np.zeros((1, 1)) # initialize log_gamma log_gamma = kwargs["log_gamma"] if "log_gamma" in kwargs else None - # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) + + # NB a trick to speed up BetaBinom optimization: taking only unique + # values of (B allele count, total SNP covering read count) unique_values_nb, mapping_matrices_nb = construct_unique_matrix( X[:, 0, :], base_nb_mean ) unique_values_bb, mapping_matrices_bb = construct_unique_matrix( X[:, 1, :], total_bb_RD ) - # EM algorithm - for r in trange(max_iter): - # E step + + for r in trange(max_iter, desc="EM algorithm"): + logger.info(f"Calculating E-step 
(v2) for iteration {r} of {max_iter}.") + if tumor_prop is None: log_emission_rdr, log_emission_baf = ( hmm_nophasing_v2.compute_emission_probability_nb_betabinom( @@ -428,6 +442,7 @@ def run_baum_welch_nb_bb( ) ) log_emission = log_emission_rdr + log_emission_baf + log_alpha = hmm_nophasing_v2.forward_lattice( lengths, log_transmat, @@ -435,6 +450,7 @@ def run_baum_welch_nb_bb( log_emission, log_sitewise_transmat, ) + log_beta = hmm_nophasing_v2.backward_lattice( lengths, log_transmat, @@ -442,11 +458,15 @@ def run_baum_welch_nb_bb( log_emission, log_sitewise_transmat, ) + log_gamma = compute_posterior_obs(log_alpha, log_beta) + log_xi = compute_posterior_transition_nophasing( log_alpha, log_beta, log_transmat, log_emission ) - # M step + + logger.info(f"Calculating M-step (v2) for iteration {r} of {max_iter}.") + if "s" in self.params: new_log_startprob = update_startprob_nophasing(lengths, log_gamma) new_log_startprob = new_log_startprob.flatten() @@ -545,14 +565,16 @@ def run_baum_welch_nb_bb( else: new_p_binom = p_binom new_taus = taus + # check convergence - print( + logger.info( + "EM convergence metrics (v2)", np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), np.mean(np.abs(new_log_mu - log_mu)), np.mean(np.abs(new_p_binom - p_binom)), ) - print(np.hstack([new_log_mu, new_p_binom])) + logger.info(np.hstack([new_log_mu, new_p_binom])) if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol @@ -565,6 +587,9 @@ def run_baum_welch_nb_bb( alphas = new_alphas p_binom = new_p_binom taus = new_taus + + logger.info("Computed Baum-Welch (v2).") + return ( new_log_mu, new_alphas, From 4d23df4d31984fce246d4d9a4169629d2130cf8d Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 12:15:31 -0400 Subject: [PATCH 011/125] add logging to hmm_NB_BB_phaseswitch --- src/calicost/hmm_NB_BB_phaseswitch.py | 76 ++++++++++++++++++--------- 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 0d26b70..07995ed 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -1,21 +1,23 @@ +import copy import logging + +import networkx as nx import numpy as np -from numba import njit -from scipy.stats import norm, multivariate_normal, poisson import scipy.special -from scipy.optimize import minimize -from scipy.optimize import Bounds -from sklearn.mixture import GaussianMixture -from tqdm import trange import statsmodels.api as sm +from numba import njit +from scipy.optimize import Bounds, minimize +from scipy.stats import multivariate_normal, norm, poisson +from sklearn.mixture import GaussianMixture from statsmodels.base.model import GenericLikelihoodModel -import copy -from calicost.utils_hmm import * -from calicost.utils_distribution_fitting import * +from tqdm import trange + from calicost.hmm_NB_BB_nophasing import * from calicost.hmm_NB_BB_nophasing_v2 import * -import networkx as nx +from calicost.utils_distribution_fitting import * +from calicost.utils_hmm import * +logger = logging.getLogger(__name__) ############################################################ # whole inference @@ -36,7 +38,6 @@ def __init__(self, params="stmp", t=1 - 1e-4): self.params = params self.t = t - # @staticmethod def compute_emission_probability_nb_betabinom( X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus @@ -70,6 +71,8 @@ def compute_emission_probability_nb_betabinom( log_emission : array, shape (2*n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ + logger.info("Computing emission probability for negative binomial & beta binomial (sitewise).") + n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] @@ -110,9 +113,11 @@ def compute_emission_probability_nb_betabinom( p_binom[i, s] * taus[i, s], ) ) + + logger.info("Computed emission probability for negative binomial & beta binomial (sitewise).") + return log_emission_rdr, log_emission_baf - # @staticmethod def compute_emission_probability_nb_betabinom_mix( X, @@ -154,6 +159,8 @@ def compute_emission_probability_nb_betabinom_mix( log_emission : array, shape (2*n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. 
""" + logger.info("Computing emission probability for *mixed* negative binomial & beta binomial (sitewise).") + n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] @@ -204,9 +211,11 @@ def compute_emission_probability_nb_betabinom_mix( mix_p_B * taus[i, s], mix_p_A * taus[i, s], ) + + logger.info("Computed emission probability for *mixed* negative binomial & beta binomial (sitewise).") + return log_emission_rdr, log_emission_baf - # @staticmethod @njit def forward_lattice( @@ -274,7 +283,6 @@ def forward_lattice( cumlen += le return log_alpha - # @staticmethod @njit def backward_lattice( @@ -338,7 +346,6 @@ def backward_lattice( cumlen += le return log_beta - # def run_baum_welch_nb_bb( self, X, @@ -374,7 +381,9 @@ def run_baum_welch_nb_bb( n_comp = X.shape[1] n_spots = X.shape[2] assert n_comp == 2 - # initialize NB logmean shift and BetaBinom prob + + logger.info("Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse (sitewise).") + log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None @@ -385,12 +394,14 @@ def run_baum_welch_nb_bb( if init_p_binom is None else init_p_binom ) - # initialize (inverse of) dispersion param in NB and BetaBinom + + # NB initialize (inverse of) dispersion param in NB and BetaBinom alphas = ( 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus - # initialize start probability and emission probability + + # NB initialize start probability and emission probability log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) @@ -398,16 +409,19 @@ def run_baum_welch_nb_bb( log_transmat = np.log(transmat) else: log_transmat = np.zeros((1, 1)) - # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) + + # NB a trick to speed up BetaBinom optimization: taking only unique values of + # (B allele count, total SNP covering read count) unique_values_nb, mapping_matrices_nb = construct_unique_matrix( X[:, 0, :], base_nb_mean ) unique_values_bb, mapping_matrices_bb = construct_unique_matrix( X[:, 1, :], total_bb_RD ) - # EM algorithm - for r in trange(max_iter): - # E step + + for r in trange(max_iter, desc="EM algorithm (sitewise)"): + logger.info(f"Calculating E-step (sitewise) for iteration {r} of {max_iter}.") + if tumor_prop is None: log_emission_rdr, log_emission_baf = ( hmm_sitewise.compute_emission_probability_nb_betabinom( @@ -429,6 +443,7 @@ def run_baum_welch_nb_bb( ) ) log_emission = log_emission_rdr + log_emission_baf + log_alpha = hmm_sitewise.forward_lattice( lengths, log_transmat, @@ -436,6 +451,7 @@ def run_baum_welch_nb_bb( log_emission, log_sitewise_transmat, ) + log_beta = hmm_sitewise.backward_lattice( lengths, log_transmat, @@ -443,11 +459,15 @@ def run_baum_welch_nb_bb( log_emission, log_sitewise_transmat, ) + log_gamma = compute_posterior_obs(log_alpha, log_beta) + log_xi = compute_posterior_transition_sitewise( log_alpha, log_beta, log_transmat, log_emission ) - # M step + + logger.info(f"Calculating M-step (sitewise) for iteration {r} of {max_iter}.") + if "s" in self.params: new_log_startprob = update_startprob_sitewise(lengths, log_gamma) new_log_startprob = new_log_startprob.flatten() @@ -522,13 +542,14 @@ def run_baum_welch_nb_bb( new_p_binom = p_binom new_taus = taus # check convergence - print( + 
logger.info( + "EM convergence metrics (sitewise)", np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), np.mean(np.abs(new_log_mu - log_mu)), np.mean(np.abs(new_p_binom - p_binom)), ) - print(np.hstack([new_log_mu, new_p_binom])) + logger.info((np.hstack([new_log_mu, new_p_binom])) if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol @@ -541,6 +562,9 @@ def run_baum_welch_nb_bb( alphas = new_alphas p_binom = new_p_binom taus = new_taus + + logger.info("Computed Baum-Welch (sitewise).") + return ( new_log_mu, new_alphas, @@ -1554,4 +1578,4 @@ def combine_similar_states_across_clones( # merged_res["total_llf"] = np.NAN # merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) # merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) -# return merging_groups, merged_res +# return merging_groups, merged_res \ No newline at end of file From 29235a2127049dd7efdb240aa9a32bf744588eda Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 12:52:28 -0400 Subject: [PATCH 012/125] add logging to hmrf and utils_IO. --- src/calicost/hmrf.py | 27 ++++++++++++++++++--------- src/calicost/utils_IO.py | 30 ++++++++++++++++++------------ 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 6630068..c6dd459 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -57,10 +57,12 @@ def hmrf_reassignment_posterior( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) # node potential new_assignment = copy.copy(prev_assignment) - # + posterior = np.zeros((N, n_clones)) - for i in trange(N): + logger.info("Computing hmrf_reassignment_posterior") + + for i in trange(N, desc="hmrf_reassignment_posterior"): idx = smooth_mat[i, :].nonzero()[1] for c in range(n_clones): tmp_log_emission_rdr, tmp_log_emission_baf = ( @@ -164,10 +166,10 @@ def aggr_hmrf_reassignment( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + posterior = np.zeros((N, n_clones)) - for i in trange(N): + for i in trange(N, desc="aggr_hmrf_reassignment"): idx = smooth_mat[i, :].nonzero()[1] # idx = np.append(idx, np.array([i])) for c in range(n_clones): @@ -252,10 +254,10 @@ def hmrf_reassignment_posterior_concatenate( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + posterior = np.zeros((N, n_clones)) - for i in trange(N): + for i in trange(N, desc="hmrf_reassignment_posterior_concatenate"): idx = smooth_mat[i, :].nonzero()[1] tmp_log_emission_rdr, tmp_log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( @@ -398,10 +400,10 @@ def aggr_hmrf_reassignment_concatenate( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + posterior = np.zeros((N, n_clones)) - for i in trange(N): + for i in trange(N, desc="aggr_hmrf_reassignment_concatenate"): idx = smooth_mat[i, :].nonzero()[1] # idx = np.append(idx, np.array([i])) tmp_log_emission_rdr, tmp_log_emission_baf = ( @@ -472,6 +474,8 @@ def merge_by_minspots( single_tumor_prop=None, threshold=0.5, ): + logger.info("Merging by min. 
spots.") + n_clones = len(np.unique(assignment)) if n_clones == 1: merged_groups = [[assignment[0]]] @@ -554,6 +558,9 @@ def merge_by_minspots( for c in merging_groups ] ) + + logger.info("Merged by min. spots.") + return merging_groups, merged_res @@ -591,6 +598,8 @@ def hmrf_pipeline( unit_ysquared=3, spatial_weight=1.0, ): + logger.info("Solving hmrf_pipeline.") + n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) # spot adjacency matric @@ -637,7 +646,7 @@ def hmrf_pipeline( last_assignment = np.zeros(single_X.shape[2], dtype=int) for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c - # HMM + for r in range(max_iter_outer): if not Path(f"{outdir}/round{r}_nstates{n_states}_{params}.npz").exists(): ##### initialize with the parameters of last iteration ##### diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index 82138a2..bda22d8 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -1,28 +1,31 @@ +import copy +import logging import sys +from pathlib import Path + +import anndata import numpy as np -import scipy -import copy import pandas as pd -from pathlib import Path +import scanpy as sc +import scipy +from sklearn.cluster import KMeans +from sklearn.kernel_ridge import KernelRidge from sklearn.metrics import adjusted_rand_score from sklearn.neighbors import LocalOutlierFactor -from sklearn.kernel_ridge import KernelRidge -from sklearn.cluster import KMeans -import scanpy as sc -import anndata -import logging - +""" logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) -logger = logging.getLogger() +""" +logger = logging.getLogger(__name__) -from calicost.utils_phase_switch import * -from calicost.utils_distribution_fitting import * import subprocess +from calicost.utils_distribution_fitting import * +from calicost.utils_phase_switch import * + def load_data( spaceranger_dir, @@ -43,6 +46,8 @@ def load_data( f"{spaceranger_dir} directory doesn't have a filtered_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5ad file!" ) + raise RuntimeError() + adata.layers["count"] = adata.X.A.astype(int) cell_snp_Aallele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Aallele.npz") cell_snp_Ballele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Ballele.npz") @@ -247,6 +252,7 @@ def load_joint_data( logging.error( f"{df_meta['spaceranger_dir'].iloc[i]} directory doesn't have a filtered_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5ad file!" ) + raise RuntimeError() adatatmp.layers["count"] = adatatmp.X.A # reorder anndata spots to have the same order as df_this_barcode From 96869395c0754621abf5b7cb0e9c3a7cab2db3cf Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 12:59:14 -0400 Subject: [PATCH 013/125] add logging to hmrf_pipeline --- src/calicost/hmrf.py | 59 +++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index c6dd459..322b61e 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -602,8 +602,10 @@ def hmrf_pipeline( n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) - # spot adjacency matric + + # NB checking input assert not (coords is None and adjacency_mat is None) + if adjacency_mat is None: adjacency_mat = compute_adjacency_mat(coords, unit_xsquared, unit_ysquared) if sample_ids is None: @@ -615,12 +617,16 @@ def hmrf_pipeline( tmp_map_index = {unique_sample_ids[i]: i for i in range(len(unique_sample_ids))} sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) - # pseudobulk + + logger.info("Merging pseudobulk by clone index") + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index ) - # initialize HMM parameters by GMM + if (init_log_mu is None) or (init_p_binom is None): + logger.info("Initializing HMM parameters by GMM") + init_log_mu, init_p_binom = initialization_by_gmm( n_states, X, @@ -631,7 +637,10 @@ def hmrf_pipeline( in_log_space=False, only_minor=False, ) - # initialization parameters for HMM + else: + logger.info("Using provided HMM initialization parameters") + + # NB initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu last_p_binom = init_p_binom @@ -647,9 +656,13 @@ def hmrf_pipeline( for c, idx in enumerate(initial_clone_index): last_assignment[idx] = c + logger.info(f"Computing HMM for {max_iter_outer} iterations.") + for r in range(max_iter_outer): + # NB initialize with the parameters of last iteration if not Path(f"{outdir}/round{r}_nstates{n_states}_{params}.npz").exists(): - ##### initialize with the parameters of last iteration ##### + logger.info(f"Computing HMM iteration {r}.") + res = pipeline_baum_welch( None, X, @@ -674,9 +687,13 @@ def hmrf_pipeline( max_iter=max_iter, tol=tol, ) + pred = np.argmax(res["log_gamma"], axis=0) + # clone assignmment if nodepotential == "max": + logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfix_reassignment.") + new_assignment, single_llf, total_llf = aggr_hmrf_reassignment( single_X, single_base_nb_mean, @@ -692,6 +709,8 @@ def hmrf_pipeline( hmmclass=hmmclass, ) elif nodepotential == "weighted_sum": + logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfix_reassignment_posterior.") + new_assignment, single_llf, total_llf = hmrf_reassignment_posterior( single_X, single_base_nb_mean, @@ -706,36 +725,42 @@ def hmrf_pipeline( hmmclass=hmmclass, ) else: - raise Exception("Unknown mode for nodepotential!") - # handle the case when one clone has zero spots + raise ValueError("Unknown mode for nodepotential!") + + # NB handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) - # + res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf - # save results + 
logger.info(f"Writing HMM iteration {r} to {outdir}/round{r}_nstates{n_states}_{params}.npz") + np.savez(f"{outdir}/round{r}_nstates{n_states}_{params}.npz", **res) else: + logger.info(f"Loading pre-computed HMM results for iteration {r}.") + logger.info(f"Loading {outdir}/round{r}_nstates{n_states}_{params}.npz") + res = np.load(f"{outdir}/round{r}_nstates{n_states}_{params}.npz") - # regroup to pseudobulk + logger.info(f"Regrouping to pseudobulk for iteration {r}.") + clone_index = [ np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) ] + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, clone_index ) - # update last parameter if "mp" in params: - print( + logger.info( "outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( r, res["total_llf"], @@ -744,7 +769,7 @@ def hmrf_pipeline( ) ) elif "m" in params: - print( + logger.info( "outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( r, res["total_llf"], @@ -752,29 +777,33 @@ def hmrf_pipeline( ) ) elif "p" in params: - print( + logger.info( "outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom - res["new_p_binom"])), ) ) - print( + + logger.info( "outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) + if ( adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1 ): break + last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] last_alphas = res["new_alphas"] last_taus = res["new_taus"] last_assignment = res["new_assignment"] log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) + for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] this_persample_weight = np.bincount( From 1da81bf501827899ae3db25421ffa942986ef164 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 13:12:09 -0400 Subject: [PATCH 014/125] add logging on Weighted_NegativeBinomial --- src/calicost/utils_distribution_fitting.py | 60 ++++++++++++++++------ 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 6f6ec02..a710a2e 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -1,25 +1,31 @@ import functools import inspect import logging +import os +import time import numpy as np import scipy -from scipy import linalg, special -from scipy.special import logsumexp, loggamma import scipy.integrate import scipy.stats +import statsmodels +import statsmodels.api as sm from numba import jit, njit +from scipy import linalg, special +from scipy.special import loggamma, logsumexp from sklearn import cluster from sklearn.utils import check_random_state -import statsmodels -import statsmodels.api as sm from statsmodels.base.model import GenericLikelihoodModel -import os -os.environ["MKL_NUM_THREADS"] = "1" -os.environ["OPENBLAS_NUM_THREADS"] = "1" -os.environ["OMP_NUM_THREADS"] = "1" +logger = logging.getLogger(__name__) + +num_threads = "2" + +logger.info(f"Setting number of threads for MKL/BLAS/LAPACK/OMP to {num_threads}.") +os.environ["MKL_NUM_THREADS"] = num_threads +os.environ["OPENBLAS_NUM_THREADS"] = num_threads +os.environ["OMP_NUM_THREADS"] = num_threads def convert_params(mean, std): """ @@ -29,6 +35,7 @@ def convert_params(mean, std): """ p = mean / std**2 n = mean * p / (1.0 - p) + return n, p @@ -51,35 +58,56 @@ class Weighted_NegativeBinomial(GenericLikelihoodModel): exposure : array, (n_samples,) Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ - def __init__(self, endog, exog, weights, exposure, seed=0, **kwds): super(Weighted_NegativeBinomial, self).__init__(endog, exog, **kwds) + + logger.info(f"Initializing Weighted_NegativeBinomial model for endog.shape = {endog.shape}.") + self.weights = weights self.exposure = exposure self.seed = seed - # def nloglikeobs(self, params): nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) + n, p = convert_params(nb_mean, nb_std) - llf = scipy.stats.nbinom.logpmf(self.endog, n, p) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # + return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) + def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): self.exog_names.append("alpha") + if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - return super(Weighted_NegativeBinomial, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + logger.info(f"Starting Weighted_NegativeBinomial optimization with start_params = {start_params}.") + + start = time.time() + + # NB see https://www.statsmodels.org/dev/dev/generated/statsmodels.base.model.LikelihoodModelResults.html + result = super(Weighted_NegativeBinomial, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + disp=False, + skip_hessian=True, + callback=None, + full_output=True, + retall=False, + **kwds ) + # NB specific to nm (Nelder-Mead) optimization. 
+ niter = result.mle_retvals["iterations"] + + logger.info(f"Finished Weighted_NegativeBinomial optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + + return result + class Weighted_NegativeBinomial_mix(GenericLikelihoodModel): def __init__(self, endog, exog, weights, exposure, tumor_prop, seed=0, **kwds): From 46f692bba7bd307f28bdf0c9c20c09662f367fdf Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 13:20:57 -0400 Subject: [PATCH 015/125] add logging of emission fitting. --- src/calicost/utils_distribution_fitting.py | 115 ++++++++++++++++----- 1 file changed, 89 insertions(+), 26 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index a710a2e..81b9e9b 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -75,7 +75,7 @@ def nloglikeobs(self, params): return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): self.exog_names.append("alpha") if start_params is None: @@ -112,34 +112,56 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): class Weighted_NegativeBinomial_mix(GenericLikelihoodModel): def __init__(self, endog, exog, weights, exposure, tumor_prop, seed=0, **kwds): super(Weighted_NegativeBinomial_mix, self).__init__(endog, exog, **kwds) + + logger.info(f"Initializing Weighted_NegativeBinomial_mix model for endog.shape = {endog.shape}.") + self.weights = weights self.exposure = exposure self.seed = seed self.tumor_prop = tumor_prop - # def nloglikeobs(self, params): nb_mean = self.exposure * ( self.tumor_prop * np.exp(self.exog @ params[:-1]) + 1 - self.tumor_prop ) nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) + n, p = convert_params(nb_mean, nb_std) - llf = scipy.stats.nbinom.logpmf(self.endog, n, p) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): + return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) + + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): self.exog_names.append("alpha") + if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - return super(Weighted_NegativeBinomial_mix, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + + logger.info(f"Starting Weighted_NegativeBinomial_mix optimization with start_params = {start_params}.") + + start = time.time() + + result = super(Weighted_NegativeBinomial_mix, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + disp=False, + skip_hessian=True, + callback=None, + full_output=True, + retall=False, + **kwds ) + # NB specific to nm (Nelder-Mead) optimization. + niter = result.mle_retvals["iterations"] + + logger.info(f"Finished Weighted_NegativeBinomial_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + + return result + class Weighted_BetaBinom(GenericLikelihoodModel): """ @@ -160,23 +182,23 @@ class Weighted_BetaBinom(GenericLikelihoodModel): exposure : array, (n_samples,) Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
""" - def __init__(self, endog, exog, weights, exposure, **kwds): super(Weighted_BetaBinom, self).__init__(endog, exog, **kwds) + + logger.info(f"Initializing Weighted_BetaBinomial model for endog.shape = {endog.shape}.") + self.weights = weights self.exposure = exposure - # def nloglikeobs(self, params): a = (self.exog @ params[:-1]) * params[-1] b = (1 - self.exog @ params[:-1]) * params[-1] - llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): self.exog_names.append("tau") + if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params @@ -184,34 +206,55 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): start_params = np.append( 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) - return super(Weighted_BetaBinom, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + + logger.info(f"Starting Weighted_BetaBinomial optimization with start_params = {start_params}.") + + start = time.time() + + result = super(Weighted_BetaBinom, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + disp=False, + skip_hessian=True, + callback=None, + full_output=True, + retall=False, + **kwds ) + # NB specific to nm (Nelder-Mead) optimization. + niter = result.mle_retvals["iterations"] + + logger.info(f"Finished Weighted_BetaBinomial optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + + return result class Weighted_BetaBinom_mix(GenericLikelihoodModel): def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds): super(Weighted_BetaBinom_mix, self).__init__(endog, exog, **kwds) + + logger.info(f"Initializing Weighted_BetaBinom_mix model for endog.shape = {endog.shape}.") + self.weights = weights self.exposure = exposure self.tumor_prop = tumor_prop - # def nloglikeobs(self, params): a = ( self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop) ) * params[-1] + b = ( (1 - self.exog @ params[:-1]) * self.tumor_prop + 0.5 * (1 - self.tumor_prop) ) * params[-1] - llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): self.exog_names.append("tau") + if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params @@ -219,10 +262,30 @@ def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): start_params = np.append( 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) - return super(Weighted_BetaBinom_mix, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + + logger.info(f"Starting Weighted_BetaBinom_mix optimization with start_params = {start_params}.") + + start = time.time() + + result = super(Weighted_BetaBinom_mix, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + disp=False, + skip_hessian=True, + callback=None, + full_output=True, + retall=False, + **kwds ) + # NB specific to nm (Nelder-Mead) optimization. 
+ niter = result.mle_retvals["iterations"] + + logger.info(f"Finished Weighted_BetaBinom_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + + return result + class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): def __init__(self, endog, exog, tau, weights, exposure, **kwds): From b44b9994e4eb5fad543e4bf02b8edf02aa0d250d Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 13:25:33 -0400 Subject: [PATCH 016/125] finish logging utils distribution fitting --- src/calicost/utils_distribution_fitting.py | 71 +++++++++++++++++----- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 81b9e9b..858957c 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -290,64 +290,103 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): def __init__(self, endog, exog, tau, weights, exposure, **kwds): super(Weighted_BetaBinom_fixdispersion, self).__init__(endog, exog, **kwds) + + logger.info(f"Initializing Weighted_BetaBinom_fixdispersion model for endog.shape = {endog.shape}.") + self.tau = tau self.weights = weights self.exposure = exposure - # def nloglikeobs(self, params): a = (self.exog @ params) * self.tau b = (1 - self.exog @ params) * self.tau - llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params else: start_params = 0.1 * np.ones(self.nparams) - return super(Weighted_BetaBinom_fixdispersion, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + logger.info(f"Starting Weighted_BetaBinom_fixdispersion optimization with start_params = {start_params}.") + + start = time.time() + + result = super(Weighted_BetaBinom_fixdispersion, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + disp=False, + skip_hessian=True, + callback=None, + full_output=True, + retall=False, + **kwds ) + # NB specific to nm (Nelder-Mead) optimization. 
+ niter = result.mle_retvals["iterations"] + + logger.info(f"Finished Weighted_BetaBinom_fixdispersion optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + + return result + class Weighted_BetaBinom_fixdispersion_mix(GenericLikelihoodModel): def __init__(self, endog, exog, tau, weights, exposure, tumor_prop, **kwds): super(Weighted_BetaBinom_fixdispersion_mix, self).__init__(endog, exog, **kwds) + + logger.info(f"Initializing Weighted_BetaBinom_fixdispersion_mix model for endog.shape = {endog.shape}.") + self.tau = tau self.weights = weights self.exposure = exposure self.tumor_prop = tumor_prop - # def nloglikeobs(self, params): a = ( self.exog @ params * self.tumor_prop + 0.5 * (1 - self.tumor_prop) ) * self.tau + b = ( (1 - self.exog @ params) * self.tumor_prop + 0.5 * (1 - self.tumor_prop) ) * self.tau - llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params else: start_params = 0.1 * np.ones(self.nparams) - return super(Weighted_BetaBinom_fixdispersion_mix, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds + logger.info(f"Starting Weighted_BetaBinom_fixdispersion_mix optimization with start_params = {start_params}.") + + start = time.time() + + result = super(Weighted_BetaBinom_fixdispersion_mix, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + disp=False, + skip_hessian=True, + callback=None, + full_output=True, + retall=False, + **kwds ) + # NB specific to nm (Nelder-Mead) optimization. + niter = result.mle_retvals["iterations"] + + logger.info(f"Finished Weighted_BetaBinom_fixdispersion_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + + return result +# DEPRECATE class BAF_Binom(GenericLikelihoodModel): """ Binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. From a3a569dea7e0a3d91d6a641225b4af5f3bebc592 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 13:28:03 -0400 Subject: [PATCH 017/125] set error on MKL thread setting in calicost_supervised. 
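Context for this change: the mkl.set_num_threads(1) call is retired in favour of the MKL/OpenBLAS/OMP environment variables introduced in PATCH 014, with a logger.error(...) left to mark the spot. Those variables are generally only honoured if they are exported before the native BLAS/OpenMP runtimes initialise, i.e. before numpy/scipy are first imported in the process. A minimal sketch of that ordering, assuming the caps are applied in the entry point (the module layout below is illustrative, not part of this patch):

    import os

    # NB thread-pool sizes are fixed when the native libraries initialise,
    #    so export the caps before importing numpy/scipy anywhere.
    for var in ("MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS", "OMP_NUM_THREADS"):
        os.environ.setdefault(var, "1")

    import numpy as np      # noqa: E402
    import scipy.stats      # noqa: E402

Setting the same variables after numpy has already been imported (for example from a module that is itself only imported late) typically has no effect.
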
--- src/calicost/calicost_supervised.py | 52 +++++++++++++---------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/src/calicost/calicost_supervised.py b/src/calicost/calicost_supervised.py index a881fff..6029d31 100644 --- a/src/calicost/calicost_supervised.py +++ b/src/calicost/calicost_supervised.py @@ -1,46 +1,42 @@ +import copy +import functools +import logging +import subprocess import sys +from pathlib import Path + +import anndata +import matplotlib.patches as mpatches +import mkl import numpy as np -import scipy import pandas as pd -from pathlib import Path -from sklearn.metrics import adjusted_rand_score -from sklearn.cluster import KMeans import scanpy as sc -import anndata -import logging +import scipy +import seaborn +from matplotlib import pyplot as plt +from matplotlib.lines import Line2D +from sklearn.cluster import KMeans +from sklearn.metrics import adjusted_rand_score -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", -) -logger = logging.getLogger() -import copy -from pathlib import Path -import functools -import subprocess from arg_parse import * +from find_integer_copynumber import * from hmm_NB_BB_phaseswitch import * -from utils_distribution_fitting import * -from utils_hmrf import * from hmrf import * +from parse_input import * from phasing import * +from utils_distribution_fitting import * +from utils_hmrf import * from utils_IO import * -from find_integer_copynumber import * -from parse_input import * from utils_plotting import * -from matplotlib import pyplot as plt -from matplotlib.lines import Line2D -import matplotlib.patches as mpatches -import seaborn - -plt.rcParams.update({"font.size": 14}) +# DEPRECATE +# mkl.set_num_threads(1) -import mkl +logger = logging.getLogger(__name__) -mkl.set_num_threads(1) +logger.error("MKL_NUM_THREADS set to unity here.") +plt.rcParams.update({"font.size": 14}) def main(configuration_file): try: From 7641681d609438decbedfde4a1e0dc20e7cdc250 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 13:39:11 -0400 Subject: [PATCH 018/125] fix == bug in setup.py --- setup.py | 57 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/setup.py b/setup.py index c447610..2ae07a8 100644 --- a/setup.py +++ b/setup.py @@ -1,33 +1,32 @@ import setuptools setuptools.setup( - name='calicost', - version='v1.0.0', - python_requires='>=3.8', - packages=['calicost'], - package_dir={'': 'src'}, - author='Cong Ma', - author_email='congma@princeton.edu', - description='Allele-specific CNAs and spatial cancer clone inference', - long_description='CalicoST infers allele-specific copy number aberrations and cancer clones in spatially resolved transcriptomics data', - url='https://github.com/raphael-group/CalicoST', - install_requires=[ - 'numpy=1.24.4', - 'scipy=1.11.3', - 'pandas=2.1.1', - 'scikit-learn=1.3.2', - 'scanpy=1.9.6', - 'anndata=0.10.3', - 'numba=0.60.0', - 'tqdm=4.66.1', - 'statsmodels=0.14.0', - 'networkx=3.2.1', - 'matplotlib=3.7.3', - 'seaborn=0.12.2', - 'pysam=0.22.1', - 'ete3=3.1.3', - 'ipykernel' - ], - include_package_data=True + name="calicost", + version="v1.0.0", + python_requires=">=3.8", + packages=["calicost"], + package_dir={"": "src"}, + author="Cong Ma", + author_email="congma@princeton.edu", + description="Allele-specific CNAs and spatial cancer clone inference", + long_description="CalicoST infers allele-specific copy number aberrations and cancer clones in spatially resolved transcriptomics data", + url="https://github.com/raphael-group/CalicoST", + install_requires=[ + "numpy==1.24.4", + "scipy==1.11.3", + "pandas==2.1.1", + "scikit-learn==1.3.2", + "scanpy==1.9.6", + "anndata==0.10.3", + "numba==0.60.0", + "tqdm==4.66.1", + "statsmodels==0.14.0", + "networkx==3.2.1", + "matplotlib==3.7.3", + "seaborn==0.12.2", + "pysam==0.22.1", + "ete3==3.1.3", + "ipykernel", + ], + include_package_data=True, ) - From a08f2e7396979986dd38513be7af1ca6ea1cc523 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 13:44:00 -0400 Subject: [PATCH 019/125] fix errors with configuration of inner and outer loops --- src/calicost/arg_parse.py | 5 ++++- src/calicost/calicost_main.py | 10 +++++----- src/calicost/parse_input.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/calicost/arg_parse.py b/src/calicost/arg_parse.py index 8bf796a..9acefb9 100644 --- a/src/calicost/arg_parse.py +++ b/src/calicost/arg_parse.py @@ -41,7 +41,8 @@ def load_default_config(): "min_avgumi_per_clone": 10, "maxspots_pooling": 7, "tumorprop_threshold": 0.5, - "max_iter_outer": 20, + "max_iter_outer_initial" : 20, + "max_iter_outer": 10, "nodepotential": "weighted_sum", # max or weighted_sum "initialization_method": "rectangle", # rectangle or datadrive "num_hmrf_initialization_start": 0, @@ -96,6 +97,7 @@ def load_default_config(): "min_avgumi_per_clone": "int", "maxspots_pooling": "int", "tumorprop_threshold": "float", + "max_iter_outer_initial" : "int", "max_iter_outer": "int", "nodepotential": "str", "initialization_method": "str", @@ -155,6 +157,7 @@ def load_default_config(): "min_avgumi_per_clone", "maxspots_pooling", "tumorprop_threshold", + "max_iter_outer_initial", "max_iter_outer", "nodepotential", "initialization_method", diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 10aee62..22ce18e 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -175,7 +175,7 @@ def main(configuration_file): smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, - max_iter_outer=config["max_iter_outer"], + max_iter_outer=config["max_iter_outer_initial"], nodepotential=config["nodepotential"], hmmclass=hmm_nophasing_v2, params="sp", @@ -207,7 +207,7 @@ def main(configuration_file): smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, - max_iter_outer=config["max_iter_outer"], + max_iter_outer=config["max_iter_outer_initial"], nodepotential=config["nodepotential"], hmmclass=hmm_nophasing_v2, params="sp", @@ -542,7 +542,7 @@ def main(configuration_file): smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], sample_ids=copy_slice_sample_ids, - max_iter_outer=10, + max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], hmmclass=hmm_nophasing_v2, params="smp", @@ -572,7 +572,7 @@ def main(configuration_file): smooth_mat=smooth_mat[np.ix_(idx_spots, idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots, idx_spots)], sample_ids=copy_slice_sample_ids, - max_iter_outer=10, + max_iter_outer=config["max_iter_outer"], nodepotential=config["nodepotential"], hmmclass=hmm_nophasing_v2, params="smp", @@ -1411,4 +1411,4 @@ def main(configuration_file): ) args = parser.parse_args() - main(args.configfile) \ No newline at end of file + main(args.configfile) diff --git a/src/calicost/parse_input.py b/src/calicost/parse_input.py index 2585923..f84ef53 100644 --- a/src/calicost/parse_input.py +++ b/src/calicost/parse_input.py @@ -185,7 +185,7 @@ def parse_visium(config): config["shared_NB_dispersion"], config["fix_BB_dispersion"], config["shared_BB_dispersion"], - 30, + config["max_iter"], 1e-3, threshold=config["tumorprop_threshold"], ) From a51d402ded2e905446dd48b902b7445276e04dcc Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 13:46:18 -0400 Subject: [PATCH 020/125] fix issues around string and bracks from additional logging. 
--- src/calicost/calicost_main.py | 4 ++-- src/calicost/hmm_NB_BB_phaseswitch.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 22ce18e..b8ed786 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -128,7 +128,7 @@ def main(configuration_file): coords, config["n_clones"], random_state=r_hmrf_initialization ) else: - logger.info(f"Initializing clones based on tumor proportion: {config["tumorprop_file"]}") + logger.info(f"Initializing clones based on tumor proportion: {config['tumorprop_file']}") initial_clone_index = rectangle_initialize_initial_clone_mix( coords, @@ -274,7 +274,7 @@ def main(configuration_file): ) logger.info(f"BAF clone merging after comparing similarity: {merging_groups}") - logger.info(f"Merging similar initial clones based on min. spot threshold of {config["min_spots_per_clone"]}.") + logger.info(f"Merging similar initial clones based on min. spot threshold of {config['min_spots_per_clone']}.") if config["tumorprop_file"] is None: merging_groups, merged_res = merge_by_minspots( diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 07995ed..89a0e53 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -549,7 +549,9 @@ def run_baum_welch_nb_bb( np.mean(np.abs(new_log_mu - log_mu)), np.mean(np.abs(new_p_binom - p_binom)), ) - logger.info((np.hstack([new_log_mu, new_p_binom])) + + logger.info(np.hstack([new_log_mu, new_p_binom])) + if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol @@ -1578,4 +1580,4 @@ def combine_similar_states_across_clones( # merged_res["total_llf"] = np.NAN # merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) # merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) -# return merging_groups, merged_res \ No newline at end of file +# return merging_groups, merged_res From 849c20f662235cebe02e1591ae3acae48b0bc73b Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 12 Aug 2024 13:54:52 -0400 Subject: [PATCH 021/125] fix errors around disp=False (statsmodels) and inconsistent conda/pip environment --- environment.yml | 4 +++- src/calicost/utils_distribution_fitting.py | 6 ------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/environment.yml b/environment.yml index 01058c9..522e4be 100644 --- a/environment.yml +++ b/environment.yml @@ -1,10 +1,12 @@ -name: calicost_env +name: calicost channels: - conda-forge - bioconda - defaults dependencies: - python==3.10 + - numpy==1.24.4 + - scipy==1.11.3 - samtools==1.18 - bcftools==1.18 - cellsnp-lite diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 858957c..424cb52 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -93,7 +93,6 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start_params=start_params, maxiter=maxiter, maxfun=maxfun, - disp=False, skip_hessian=True, callback=None, full_output=True, @@ -147,7 +146,6 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start_params=start_params, maxiter=maxiter, maxfun=maxfun, - disp=False, skip_hessian=True, callback=None, full_output=True, @@ -215,7 +213,6 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start_params=start_params, maxiter=maxiter, maxfun=maxfun, - disp=False, skip_hessian=True, callback=None, full_output=True, @@ -271,7 +268,6 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start_params=start_params, maxiter=maxiter, maxfun=maxfun, - disp=False, skip_hessian=True, callback=None, full_output=True, @@ -318,7 +314,6 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start_params=start_params, maxiter=maxiter, maxfun=maxfun, - disp=False, skip_hessian=True, callback=None, full_output=True, @@ -371,7 +366,6 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start_params=start_params, maxiter=maxiter, maxfun=maxfun, - disp=False, skip_hessian=True, callback=None, full_output=True, From ea22b720f501b187a10a350095753100703b7d78 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 14:00:21 -0400 Subject: [PATCH 022/125] fix issue with multiple loggers. 
--- src/calicost/arg_parse.py | 5 +++-- src/calicost/calicost_main.py | 6 +++--- src/calicost/parse_input.py | 5 +++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/calicost/arg_parse.py b/src/calicost/arg_parse.py index 9acefb9..5c9992d 100644 --- a/src/calicost/arg_parse.py +++ b/src/calicost/arg_parse.py @@ -3,13 +3,14 @@ import scipy import pandas as pd import logging - +""" logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) -logger = logging.getLogger() +""" +logger = logging.getLogger(__name__) def load_default_config(): diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index b8ed786..a166c89 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -51,15 +51,15 @@ logger.setLevel(logging.INFO) handler = logging.StreamHandler(sys.stdout) -fhandler = logging.FileHandler('calicost.log', mode="w") +# fhandler = logging.FileHandler('calicost.log', mode="w") formatter = logging.Formatter("%(asctime)s - %(process)d - %(levelname)s - %(name)s:%(lineno)d - %(message)s") handler.setFormatter(formatter) -fhandler.setFormatter(formatter) +# fhandler.setFormatter(formatter) logger.addHandler(handler) -logger.addHandler(fhandler) +# logger.addHandler(fhandler) def main(configuration_file): start = datetime.datetime.now() diff --git a/src/calicost/parse_input.py b/src/calicost/parse_input.py index f84ef53..49221c5 100644 --- a/src/calicost/parse_input.py +++ b/src/calicost/parse_input.py @@ -7,13 +7,14 @@ import scanpy as sc import anndata import logging - +""" logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) -logger = logging.getLogger() +""" +logger = logging.getLogger(__name__) import copy from pathlib import Path import functools From b6f94b10d12d0ef98acaf2dded85f07b114ac8a1 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 12 Aug 2024 14:51:34 -0400 Subject: [PATCH 023/125] fix prefix bug --- src/calicost/calicost_main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index a166c89..17c20ea 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -116,7 +116,7 @@ def main(configuration_file): for r_hmrf_initialization in range( config["num_hmrf_initialization_start"], config["num_hmrf_initialization_end"] ): - logger.info(f"Processing HMRF random realization {num_hmrf_initialization_start:d}") + logger.info(f"Processing HMRF random realization {r_hmrf_initialization}") outdir = f"{config['output_dir']}/clone{config['n_clones']}_rectangle{r_hmrf_initialization}_w{config['spatial_weight']:.1f}" outdir = Path(outdir) @@ -139,7 +139,9 @@ def main(configuration_file): ) # NB save clone initialization to npz file - file_name = Path(f"allspots_nstates{config['n_states']}_sp.npz") + prefix = "allspots" + + file_name = Path(f"{prefix}_nstates{config['n_states']}_sp.npz") file_path = outdir / file_name if not file_path.exists(): From 8eb653c4a09a84033185b7a6130e6902d0324669 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:33:23 -0400 Subject: [PATCH 024/125] revert imports & prefix checks. 
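Alongside the import revert, this patch keeps the checkpoint convention from PATCH 023: each HMRF run reads and writes {outdir}/{prefix}_nstates{n_states}_sp.npz, with prefix "allspots" for the initial BAF-only clustering and f"clone{bafc}" for the later per-clone refinement, and an existing file short-circuits recomputation. A minimal sketch of the seeding step (directory name, state count and spot count are hypothetical):

    from pathlib import Path

    import numpy as np

    outdir = Path("results/clone3_rectangle0_w1.0")   # hypothetical output dir
    prefix, n_states = "allspots", 7                  # hypothetical settings

    file_path = outdir / f"{prefix}_nstates{n_states}_sp.npz"

    if not file_path.exists():
        outdir.mkdir(parents=True, exist_ok=True)

        # NB one clone label per spot; "round-1_assignment" is the assignment
        #    before the first HMRF iteration, and the results of later
        #    iterations live in the same .npz so reruns can resume from
        #    "num_iterations".
        initial_assignment = np.zeros(100, dtype=int)

        np.savez(file_path,
                 **{"num_iterations": 0, "round-1_assignment": initial_assignment})
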
--- src/calicost/calicost_main.py | 175 ++++++++++++++++++---------------- 1 file changed, 92 insertions(+), 83 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 17c20ea..0a0a5f2 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -1,59 +1,38 @@ -import copy -import functools -import logging -import subprocess import sys -import datetime -from pathlib import Path - -import anndata import numpy as np -import pandas as pd -import scanpy as sc import scipy -from sklearn.cluster import KMeans +import pandas as pd +from pathlib import Path from sklearn.metrics import adjusted_rand_score +from sklearn.cluster import KMeans +import scanpy as sc +import anndata +import logging +import copy +from pathlib import Path +import functools +import subprocess from calicost.arg_parse import * -from calicost.find_integer_copynumber import * from calicost.hmm_NB_BB_phaseswitch import * -from calicost.hmrf import * -from calicost.parse_input import * -from calicost.phasing import * from calicost.utils_distribution_fitting import * from calicost.utils_hmrf import * +from calicost.hmrf import * +from calicost.phasing import * from calicost.utils_IO import * +from calicost.find_integer_copynumber import * +from calicost.parse_input import * from calicost.utils_plotting import * -""" -from calicost.hmm_NB_BB_nophasing_v2 import hmm_nophasing_v2 -from calicost.arg_parse import run_parse_n_load, genesnp_to_bininfo -from calicost.find_integer_copynumber import (hill_climbing_integer_copynumber_fixdiploid, - hill_climbing_integer_copynumber_oneclone) -from calicost.hmm_NB_BB_phaseswitch import (combine_similar_states_across_clones, - similarity_components_rdrbaf_neymanpearson) -from calicost.hmrf import (aggr_hmrf_reassignment, aggr_hmrfmix_reassignment, - hmrf_concatenate_pipeline, hmrf_reassignment_posterior, - hmrfmix_concatenate_pipeline, hmrfmix_reassignment_posterior, - merge_by_minspots) -from calicost.phasing import pipeline_baum_welch -from calicost.utils_hmrf import (load_hmrf_last_iteration, rectangle_initialize_initial_clone, - rectangle_initialize_initial_clone_mix, reorder_results) -from calicost.utils_IO import bin_selection_basedon_normal, expand_df_cnv, filter_de_genes_tri -from calicost.utils_plotting import (argparse, merge_pseudobulk_by_index, - merge_pseudobulk_by_index_mix, plot_acn_from_df, - plot_acn_from_df_anotherscheme, plot_clones_in_space, - plot_individual_spots_in_space, plot_rdr_baf, plt, - read_configuration_file, read_joint_configuration_file) -""" - logger = logging.getLogger("calicost") logger.setLevel(logging.INFO) handler = logging.StreamHandler(sys.stdout) # fhandler = logging.FileHandler('calicost.log', mode="w") -formatter = logging.Formatter("%(asctime)s - %(process)d - %(levelname)s - %(name)s:%(lineno)d - %(message)s") +formatter = logging.Formatter( + "%(asctime)s - %(process)d - %(levelname)s - %(name)s:%(lineno)d - %(message)s" +) handler.setFormatter(formatter) # fhandler.setFormatter(formatter) @@ -61,6 +40,7 @@ logger.addHandler(handler) # logger.addHandler(fhandler) + def main(configuration_file): start = datetime.datetime.now() @@ -76,12 +56,12 @@ def main(configuration_file): # NB assuming the B-allele counts are calculated by the cellsnp-lite & Eagle pipeline. If assuming each spot contains # a mixture of normal/tumor cells, the tumor proportion path should be provided in the config file. 
- # - # NB load data: - # - If the data is loaded for the first time: infer phasing using phase-switch HMM + # + # NB load data: + # - If the data is loaded for the first time: infer phasing using phase-switch HMM # (hmm_NB_BB_phaseswitch.py & phasing.py) with output initial_phase.npz, matrices # in /parsed_inputs - # + # # - If the data is already loaded: load the matrices from parsed_inputs folder logger.info(f"Running parse and load.") @@ -128,7 +108,9 @@ def main(configuration_file): coords, config["n_clones"], random_state=r_hmrf_initialization ) else: - logger.info(f"Initializing clones based on tumor proportion: {config['tumorprop_file']}") + logger.info( + f"Initializing clones based on tumor proportion: {config['tumorprop_file']}" + ) initial_clone_index = rectangle_initialize_initial_clone_mix( coords, @@ -140,14 +122,14 @@ def main(configuration_file): # NB save clone initialization to npz file prefix = "allspots" - + file_name = Path(f"{prefix}_nstates{config['n_states']}_sp.npz") file_path = outdir / file_name if not file_path.exists(): logger.info(f"Creating output dir: {str(outdir)}") - # TODO exist_ok + # TODO exist_ok outdir.mkdir(parents=True, exist_ok=True) initial_assignment = np.zeros(single_X.shape[2], dtype=int) @@ -155,10 +137,13 @@ def main(configuration_file): for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - np.savez(str(file_path), **{"num_iterations": 0, "round-1_assignment": initial_assignment}) + np.savez( + str(file_path), + **{"num_iterations": 0, "round-1_assignment": initial_assignment}, + ) # ---- HMRF + HMM ---- - # + # # NB stores the results of each HMRF iteration in a .npz @ ./outdir/prefix_nstates{config['n_states']}_sp.npz # if a specific iteration is already computed, hmrf will load the results directly from the file. if config["tumorprop_file"] is None: @@ -226,7 +211,9 @@ def main(configuration_file): tumorprop_threshold=config["tumorprop_threshold"], ) - logger.info("Loading last HMRF iteration & merging clones based on BAF profile similarity threshold.") + logger.info( + "Loading last HMRF iteration & merging clones based on BAF profile similarity threshold." + ) n_obs = single_X.shape[0] res = load_hmrf_last_iteration( @@ -257,11 +244,13 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], ) tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) - + logger.info("Merged pseudo-bulk based on clone index.") # NB ratio == P(clone A counts | BAF parameters for clone A) / P(clone A counts | BAF parameters for clone B) - logger.info("Merging similar initial clones based on Neyman-Pearson Likelihood ratio.") + logger.info( + "Merging similar initial clones based on Neyman-Pearson Likelihood ratio." + ) merging_groups, merged_res = similarity_components_rdrbaf_neymanpearson( X, @@ -276,7 +265,9 @@ def main(configuration_file): ) logger.info(f"BAF clone merging after comparing similarity: {merging_groups}") - logger.info(f"Merging similar initial clones based on min. spot threshold of {config['min_spots_per_clone']}.") + logger.info( + f"Merging similar initial clones based on min. spot threshold of {config['min_spots_per_clone']}." 
+ ) if config["tumorprop_file"] is None: merging_groups, merged_res = merge_by_minspots( @@ -297,7 +288,9 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], ) - logger.info(f"BAF clone merging after requiring minimum # spots: {merging_groups}") + logger.info( + f"BAF clone merging after requiring minimum # spots: {merging_groups}" + ) n_baf_clones = len(merging_groups) @@ -305,9 +298,7 @@ def main(configuration_file): logger.info(f"Writing merged initial clones to {file_path}") - np.savez( - file_path, **merged_res - ) + np.savez(file_path, **merged_res) # NB load merged results n_obs = single_X.shape[0] @@ -321,7 +312,7 @@ def main(configuration_file): merged_baf_assignment = copy.copy(merged_res["new_assignment"]) n_baf_clones = len(np.unique(merged_baf_assignment)) - # TODO comment. + # TODO comment. pred = np.argmax(merged_res["log_gamma"], axis=0) pred = np.array( [pred[(c * n_obs) : (c * n_obs + n_obs)] for c in range(n_baf_clones)] @@ -338,12 +329,14 @@ def main(configuration_file): ] ) - logger.info("Preparing refinement of initial, merged clones using BAF & RDR ****") - + logger.info( + "Preparing refinement of initial, merged clones using BAF & RDR ****" + ) + if not config["bafonly"]: # NB this block only used when assuming each spot is pure normal or pure tumor, # and if we don't know which spots are normal spots. - # + # # NB select normal spots logger.info("Identifying normal spots.") @@ -404,7 +397,9 @@ def main(configuration_file): # pseudobulk has large |BAF - 0.5| index_normal = np.where(normal_candidate)[0] - logger.info("Filtering genomic bins for allele-specific expression based on normal spots.") + logger.info( + "Filtering genomic bins for allele-specific expression based on normal spots." + ) ( lengths, @@ -423,7 +418,7 @@ def main(configuration_file): index_normal, config["geneticmap_file"], ) - + assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0] df_bininfo = genesnp_to_bininfo(df_gene_snp) @@ -482,10 +477,12 @@ def main(configuration_file): ), ) - logger.info(f"**** Refining initial, merged clones (N={n_baf_clones}) using BAF & RDR ****") + logger.info( + f"**** Refining initial, merged clones (N={n_baf_clones}) using BAF & RDR ****" + ) for bafc in range(n_baf_clones): - logger.info(f"Refining BAF clone {bafc}.") + logger.info(f"Refining BAF clone {bafc}.") prefix = f"clone{bafc}" idx_spots = np.where(merged_baf_assignment == bafc)[0] @@ -511,7 +508,9 @@ def main(configuration_file): ) # NB write the initialization to .npz @ ./outdir/prefix_nstates{config['n_states']}_smp.npz - file_path = Path(f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz") + file_path = Path( + f"{outdir}/{prefix}_nstates{config['n_states']}_smp.npz" + ) if not file_path.exists(): initial_assignment = np.zeros(len(idx_spots), dtype=int) @@ -519,13 +518,14 @@ def main(configuration_file): for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c - allres = { - "barcodes": barcodes[idx_spots], - "num_iterations": 0, - "round-1_assignment": initial_assignment, - } - - np.savez(str(file_path), **allres) + np.savez( + str(file_path), + **{ + "barcodes": barcodes[idx_spots], + "num_iterations": 0, + "round-1_assignment": initial_assignment, + }, + ) # HMRF + HMM with RDR copy_slice_sample_ids = copy.copy(sample_ids[idx_spots]) @@ -666,7 +666,9 @@ def main(configuration_file): ) tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) - logger.info(f"Merging BAF+RDR clones based on Neyman-Pearson Likelihood ratio.") + logger.info( + 
f"Merging BAF+RDR clones based on Neyman-Pearson Likelihood ratio." + ) merging_groups, merged_res = ( similarity_components_rdrbaf_neymanpearson( @@ -682,8 +684,10 @@ def main(configuration_file): ) ) - logger.info(f"BAF+RDR clone {bafc}: merging_groups={merging_groups}") - + logger.info( + f"BAF+RDR clone {bafc}: merging_groups={merging_groups}" + ) + if config["tumorprop_file"] is None: merging_groups, merged_res = merge_by_minspots( merged_res["new_assignment"], @@ -704,7 +708,7 @@ def main(configuration_file): single_tumor_prop=single_tumor_prop[idx_spots], threshold=config["tumorprop_threshold"], ) - + # TODO what is merging_groups logger.info( f"BAF+RDR clone {bafc} merging after requiring minimum # spots: {merging_groups}" @@ -739,8 +743,10 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], ) ) - - logger.info(f"Running Baum-Welch with refined & merged BAF+RDR clones.") + + logger.info( + f"Running Baum-Welch with refined & merged BAF+RDR clones." + ) merged_res = pipeline_baum_welch( None, @@ -808,7 +814,7 @@ def main(configuration_file): for c in range(n_merged_clones) ] ).T - + # NB add to res_combine if len(res_combine) == 1: res_combine.update( @@ -987,14 +993,16 @@ def main(configuration_file): res_combine, posterior = reorder_results( res_combine, posterior, single_tumor_prop ) - - logger.info(f"Writing {outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz") + + logger.info( + f"Writing {outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz" + ) np.savez( f"{outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz", **res_combine, ) - + logger.info(f"Writing {outdir}/posterior_clone_probability.npy") np.save(f"{outdir}/posterior_clone_probability.npy", posterior) @@ -1102,7 +1110,7 @@ def main(configuration_file): logger.info( f"max med ploidy = {max_medploidy}, clone {s}, integer copy inference loss = {_}" ) - + allele_specific_copy.append( pd.DataFrame( best_integer_copies[ @@ -1121,7 +1129,7 @@ def main(configuration_file): columns=np.arange(n_obs), ) ) - + state_cnv.append( pd.DataFrame( res_combine["new_log_mu"][:, s].reshape(-1, 1), @@ -1402,6 +1410,7 @@ def main(configuration_file): logging.info(f"Complete in {runtime} [seconds].") + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( From b657a9efa3c503c4fc66f12ad75c0b2cbaa3dd6b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:40:33 -0400 Subject: [PATCH 025/125] revert imports in calicost supervised. 
--- src/calicost/calicost_supervised.py | 33 +++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/calicost/calicost_supervised.py b/src/calicost/calicost_supervised.py index 6029d31..082b638 100644 --- a/src/calicost/calicost_supervised.py +++ b/src/calicost/calicost_supervised.py @@ -1,3 +1,36 @@ +import sys +import numpy as np +import scipy +import pandas as pd +from pathlib import Path +from sklearn.metrics import adjusted_rand_score +from sklearn.cluster import KMeans +import scanpy as sc +import anndata +import logging + +import copy +from pathlib import Path +import functools +import subprocess +from arg_parse import * +from hmm_NB_BB_phaseswitch import * +from utils_distribution_fitting import * +from utils_hmrf import * +from hmrf import * +from phasing import * +from utils_IO import * +from find_integer_copynumber import * +from parse_input import * +from utils_plotting import * + +from matplotlib import pyplot as plt +from matplotlib.lines import Line2D +import matplotlib.patches as mpatches +import seaborn + +import mkl + import copy import functools import logging From e340a40d6e28a6b94292de99bfa52ebce1b6211e Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:42:50 -0400 Subject: [PATCH 026/125] revert import for hmm_nophasing --- src/calicost/hmm_NB_BB_nophasing.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index 32d94f3..b546989 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -1,19 +1,18 @@ -import copy import logging - -import networkx as nx import numpy as np -import scipy.special -import statsmodels.api as sm from numba import njit -from scipy.optimize import Bounds, minimize -from scipy.stats import multivariate_normal, norm, poisson +from scipy.stats import norm, multivariate_normal, poisson +import scipy.special +from scipy.optimize import minimize +from scipy.optimize import Bounds from sklearn.mixture import GaussianMixture -from statsmodels.base.model import GenericLikelihoodModel from tqdm import trange - +import statsmodels.api as sm +from statsmodels.base.model import GenericLikelihoodModel +import copy from calicost.utils_distribution_fitting import * from calicost.utils_hmm import * +import networkx as nx logger = logging.getLogger(__name__) @@ -518,4 +517,4 @@ def run_baum_welch_nb_bb( new_log_startprob, new_log_transmat, log_gamma, - ) \ No newline at end of file + ) From 25b35d1281a887abb67b9181f7fdcc476a175a39 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Thu, 15 Aug 2024 10:43:20 -0400 Subject: [PATCH 027/125] revert imports for hmm_NB_BB_nophasing_v2 --- src/calicost/hmm_NB_BB_nophasing_v2.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index a4408f6..61bc562 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -1,19 +1,18 @@ -import copy import logging - -import networkx as nx import numpy as np -import scipy.special -import statsmodels.api as sm from numba import njit -from scipy.optimize import Bounds, minimize -from scipy.stats import multivariate_normal, norm, poisson +from scipy.stats import norm, multivariate_normal, poisson +import scipy.special +from scipy.optimize import minimize +from scipy.optimize import Bounds from sklearn.mixture import GaussianMixture -from statsmodels.base.model import GenericLikelihoodModel from tqdm import trange - +import statsmodels.api as sm +from statsmodels.base.model import GenericLikelihoodModel +import copy from calicost.utils_distribution_fitting import * from calicost.utils_hmm import * +import networkx as nx logger = logging.getLogger(__name__) From bdd50568046cb1bf33bce3c2eb65b33c1ba39aa1 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:45:04 -0400 Subject: [PATCH 028/125] revert imports for hmm phaseswitch --- src/calicost/hmm_NB_BB_phaseswitch.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 89a0e53..f1aefc9 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -1,21 +1,20 @@ -import copy import logging - -import networkx as nx import numpy as np -import scipy.special -import statsmodels.api as sm from numba import njit -from scipy.optimize import Bounds, minimize -from scipy.stats import multivariate_normal, norm, poisson +from scipy.stats import norm, multivariate_normal, poisson +import scipy.special +from scipy.optimize import minimize +from scipy.optimize import Bounds from sklearn.mixture import GaussianMixture -from statsmodels.base.model import GenericLikelihoodModel from tqdm import trange - +import statsmodels.api as sm +from statsmodels.base.model import GenericLikelihoodModel +import copy +from calicost.utils_hmm import * +from calicost.utils_distribution_fitting import * from calicost.hmm_NB_BB_nophasing import * from calicost.hmm_NB_BB_nophasing_v2 import * -from calicost.utils_distribution_fitting import * -from calicost.utils_hmm import * +import networkx as nx logger = logging.getLogger(__name__) From 6cc7e6b05059736751a23b6b266d6fe56179c394 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Thu, 15 Aug 2024 10:48:11 -0400 Subject: [PATCH 029/125] revert imports for hmrf --- src/calicost/hmrf.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 322b61e..205e2bd 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1,26 +1,25 @@ -import copy import logging -import warnings -from pathlib import Path -# from turtle import reset - -# import networkx as nx +from turtle import reset import numpy as np import pandas as pd -import scipy.sparse -import scipy.special from numba import njit +import scipy.special +import scipy.sparse +from sklearn.mixture import GaussianMixture from sklearn.cluster import KMeans from sklearn.metrics import adjusted_rand_score, silhouette_score -from sklearn.mixture import GaussianMixture from sklearn.neighbors import kneighbors_graph -from statsmodels.tools.sm_exceptions import ValueWarning +import networkx as nx from tqdm import trange - +import copy +from pathlib import Path from calicost.hmm_NB_BB_phaseswitch import * from calicost.utils_distribution_fitting import * -from calicost.utils_hmrf import * from calicost.utils_IO import * +from calicost.utils_hmrf import * + +import warnings +from statsmodels.tools.sm_exceptions import ValueWarning logger = logging.getLogger(__name__) From 3d7f0b7ab49864055158837b1153636fb949639b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:48:21 -0400 Subject: [PATCH 030/125] revert imports for utils IO. --- src/calicost/utils_IO.py | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index bda22d8..d570a84 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -1,30 +1,23 @@ -import copy -import logging import sys -from pathlib import Path - -import anndata import numpy as np -import pandas as pd -import scanpy as sc import scipy -from sklearn.cluster import KMeans -from sklearn.kernel_ridge import KernelRidge +import copy +import pandas as pd +from pathlib import Path from sklearn.metrics import adjusted_rand_score from sklearn.neighbors import LocalOutlierFactor -""" -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", -) -""" -logger = logging.getLogger(__name__) +from sklearn.kernel_ridge import KernelRidge +from sklearn.cluster import KMeans +import scanpy as sc +import anndata +import logging +from calicost.utils_phase_switch import * +from calicost.utils_distribution_fitting import * import subprocess -from calicost.utils_distribution_fitting import * -from calicost.utils_phase_switch import * + +logger = logging.getLogger(__name__) def load_data( From 3b34e64edeec6df0b6c59f354343e8c5ca194687 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Thu, 15 Aug 2024 10:51:23 -0400 Subject: [PATCH 031/125] revert imports for utils hmm --- src/calicost/utils_hmm.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 9145ae5..65153de 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -1,12 +1,9 @@ -import copy -import logging - import numpy as np -import scipy.special from numba import njit -from sklearn.mixture import GaussianMixture +import copy +import scipy.special from tqdm import trange - +from sklearn.mixture import GaussianMixture from calicost.utils_distribution_fitting import * logger = logging.getLogger(__name__) @@ -2168,4 +2165,4 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( logger.info("Computed emission params for Beta Binomial Mix (no phasing, unique).") - return new_p_binom, new_taus \ No newline at end of file + return new_p_binom, new_taus From 627d6b9e79faad92029a901fad136ac35bc11967 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:51:35 -0400 Subject: [PATCH 032/125] revert imports for distribution fitting. --- src/calicost/utils_distribution_fitting.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 424cb52..6f10938 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -1,21 +1,21 @@ import functools import inspect import logging -import os -import time import numpy as np import scipy +from scipy import linalg, special +from scipy.special import logsumexp, loggamma import scipy.integrate import scipy.stats -import statsmodels -import statsmodels.api as sm from numba import jit, njit -from scipy import linalg, special -from scipy.special import loggamma, logsumexp from sklearn import cluster from sklearn.utils import check_random_state +import statsmodels +import statsmodels.api as sm from statsmodels.base.model import GenericLikelihoodModel +import os + logger = logging.getLogger(__name__) From 06a368b1d095b8ae6f3701d70d7b6075bbca09bd Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:52:21 -0400 Subject: [PATCH 033/125] add utils hmm njit import --- src/calicost/utils_hmm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 65153de..dfcba58 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -2,6 +2,7 @@ from numba import njit import copy import scipy.special +from numba import njit from tqdm import trange from sklearn.mixture import GaussianMixture from calicost.utils_distribution_fitting import * From d34362063c4dcf487bbcd71f6f71f4d6852510ef Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 10:55:12 -0400 Subject: [PATCH 034/125] fix calicost supervised imports. 
--- src/calicost/calicost_supervised.py | 31 ----------------------------- 1 file changed, 31 deletions(-) diff --git a/src/calicost/calicost_supervised.py b/src/calicost/calicost_supervised.py index 082b638..b74c5b6 100644 --- a/src/calicost/calicost_supervised.py +++ b/src/calicost/calicost_supervised.py @@ -31,37 +31,6 @@ import mkl -import copy -import functools -import logging -import subprocess -import sys -from pathlib import Path - -import anndata -import matplotlib.patches as mpatches -import mkl -import numpy as np -import pandas as pd -import scanpy as sc -import scipy -import seaborn -from matplotlib import pyplot as plt -from matplotlib.lines import Line2D -from sklearn.cluster import KMeans -from sklearn.metrics import adjusted_rand_score - -from arg_parse import * -from find_integer_copynumber import * -from hmm_NB_BB_phaseswitch import * -from hmrf import * -from parse_input import * -from phasing import * -from utils_distribution_fitting import * -from utils_hmrf import * -from utils_IO import * -from utils_plotting import * - # DEPRECATE # mkl.set_num_threads(1) From 91dfaa2f6f286e34ef9c1a432f47c92b0e243b01 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 11:17:07 -0400 Subject: [PATCH 035/125] fix runtime logging. --- src/calicost/calicost_main.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 0a0a5f2..75734e4 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -13,6 +13,7 @@ from pathlib import Path import functools import subprocess +import datetime from calicost.arg_parse import * from calicost.hmm_NB_BB_phaseswitch import * from calicost.utils_distribution_fitting import * @@ -1253,11 +1254,16 @@ def main(configuration_file): df_clone_label["tumor_proportion"] = single_tumor_prop logger.info(f"Writing clone labels to {outdir}/clone_labels.tsv") - + df_clone_label.to_csv( f"{outdir}/clone_labels.tsv", header=True, index=True, sep="\t" ) + end = datetime.datetime.now() + runtime = end - start + + logging.info(f"Complete in {runtime} [seconds].") + Path(f"{outdir}/plots").mkdir(parents=True, exist_ok=True) # NB plot RDR and BAF. @@ -1405,11 +1411,6 @@ def main(configuration_file): bbox_inches="tight", ) - end = datetime.datetime.now() - runtime = end - start - - logging.info(f"Complete in {runtime} [seconds].") - if __name__ == "__main__": parser = argparse.ArgumentParser() From 2c7c9b5c37b95fab293ce5c7124caa3e93e1e441 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 11:17:36 -0400 Subject: [PATCH 036/125] import bug --- src/calicost/utils_distribution_fitting.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 6f10938..191b22d 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -4,6 +4,7 @@ import numpy as np import scipy +import time from scipy import linalg, special from scipy.special import logsumexp, loggamma import scipy.integrate From 8303c62b5fd0549a54a470f3f65d794534f20179 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Thu, 15 Aug 2024 11:26:14 -0400 Subject: [PATCH 037/125] fix spelling mistake --- src/calicost/hmm_NB_BB_nophasing_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 61bc562..3d9bdf8 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -149,7 +149,7 @@ def compute_emission_probability_nb_betabinom_mix( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ - logger.info("Computing emission probability for *mixed* negative binomial & beta binommial.") + logger.info("Computing emission probability for *mixed* negative binomial & beta binomial.") n_obs = X.shape[0] n_comp = X.shape[1] @@ -210,7 +210,7 @@ def compute_emission_probability_nb_betabinom_mix( mix_p_B * taus[i, s], ) - logger.info("Computed emission probability for *mixed* negative binomial & beta binommial.") + logger.info("Computed emission probability for *mixed* negative binomial & beta binomial.") return log_emission_rdr, log_emission_baf From 9ed0df57c577d0ed5519518f93c8cde9938e856a Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 13:04:54 -0400 Subject: [PATCH 038/125] log comparative likelihoods. --- src/calicost/utils_hmm.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index dfcba58..5c4fa29 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -470,7 +470,6 @@ def update_emission_params_nb_sitewise_uniqvalues( n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) - # initialization new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None @@ -614,7 +613,15 @@ def update_emission_params_nb_sitewise_uniqvalues( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + logger.info(f"") + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) From 96d6fa351f8fc174ef23cf9a8dd814a2450d0706 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Thu, 15 Aug 2024 15:30:36 -0400 Subject: [PATCH 039/125] fix --- src/calicost/utils_distribution_fitting.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 191b22d..b343868 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -85,7 +85,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - logger.info(f"Starting Weighted_NegativeBinomial optimization with start_params = {start_params}.") + logger.info(f"Starting Weighted_NegativeBinomial optimization @ {start_params}.") start = time.time() @@ -139,7 +139,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - logger.info(f"Starting Weighted_NegativeBinomial_mix optimization with start_params = {start_params}.") + logger.info(f"Starting Weighted_NegativeBinomial_mix optimization @ {start_params}.") start = time.time() @@ -206,7 +206,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) - logger.info(f"Starting Weighted_BetaBinomial optimization with start_params = {start_params}.") + logger.info(f"Starting Weighted_BetaBinomial optimization @ {start_params}.") start = time.time() @@ -261,7 +261,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) - logger.info(f"Starting Weighted_BetaBinom_mix optimization with start_params = {start_params}.") + logger.info(f"Starting Weighted_BetaBinom_mix optimization with @ {start_params}.") start = time.time() @@ -307,7 +307,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): else: start_params = 0.1 * np.ones(self.nparams) - logger.info(f"Starting Weighted_BetaBinom_fixdispersion optimization with start_params = {start_params}.") + logger.info(f"Starting Weighted_BetaBinom_fixdispersion optimization @ {start_params}.") start = time.time() @@ -359,7 +359,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): else: start_params = 0.1 * np.ones(self.nparams) - logger.info(f"Starting Weighted_BetaBinom_fixdispersion_mix optimization with start_params = {start_params}.") + logger.info(f"Starting Weighted_BetaBinom_fixdispersion_mix optimization @ {start_params}.") start = time.time() @@ -401,29 +401,27 @@ class BAF_Binom(GenericLikelihoodModel): exposure : array, (n_samples,) Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
""" - def __init__(self, endog, exog, weights, exposure, offset, scaling, **kwds): super(BAF_Binom, self).__init__(endog, exog, **kwds) + self.weights = weights self.exposure = exposure self.offset = offset self.scaling = scaling - # def nloglikeobs(self, params): linear_term = self.exog @ params p = self.scaling / (1 + np.exp(-linear_term + self.offset)) - llf = scipy.stats.binom.logpmf(self.endog, self.exposure, p) - neg_sum_llf = -llf.dot(self.weights) - return neg_sum_llf - # + return -scipy.stats.binom.logpmf(self.endog, self.exposure, p).dot(self.weights) + def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params else: start_params = 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams) + return super(BAF_Binom, self).fit( start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds ) From 9d3c141fceae9f65fef94c265d97c370f61fb795 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Thu, 15 Aug 2024 15:45:17 -0400 Subject: [PATCH 040/125] fix --- src/calicost/calicost_main.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 75734e4..9e52821 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -25,21 +25,20 @@ from calicost.parse_input import * from calicost.utils_plotting import * +# NB prevent wrap-around of log lines due to high precision printing. +np.set_printoptions(precision=6) + logger = logging.getLogger("calicost") logger.setLevel(logging.INFO) handler = logging.StreamHandler(sys.stdout) -# fhandler = logging.FileHandler('calicost.log', mode="w") - formatter = logging.Formatter( "%(asctime)s - %(process)d - %(levelname)s - %(name)s:%(lineno)d - %(message)s" ) handler.setFormatter(formatter) -# fhandler.setFormatter(formatter) logger.addHandler(handler) -# logger.addHandler(fhandler) def main(configuration_file): From 690fd305f6152e63c3c81ea80d67d2d8cba72774 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 09:55:05 -0400 Subject: [PATCH 041/125] improve runtime logging --- src/calicost/hmm_NB_BB_nophasing_v2.py | 8 ++++---- src/calicost/hmrf.py | 24 +++++++++++++++++++++--- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 3d9bdf8..fcd7c67 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -72,7 +72,7 @@ def compute_emission_probability_nb_betabinom( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ - logger.info("Computing emission probability for negative binomial & beta binomial.") + # logger.info("Computing emission probability for negative binomial & beta binomial.") n_obs = X.shape[0] n_comp = X.shape[1] @@ -104,7 +104,7 @@ def compute_emission_probability_nb_betabinom( ) ) - logger.info("Computed emission probability for negative binomial & beta binomial.") + # logger.info("Computed emission probability for negative binomial & beta binomial.") return log_emission_rdr, log_emission_baf @@ -149,7 +149,7 @@ def compute_emission_probability_nb_betabinom_mix( log_emission : array, shape (n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. 
There is a common bag of states across all spots. """ - logger.info("Computing emission probability for *mixed* negative binomial & beta binomial.") + # logger.info("Computing emission probability for *mixed* negative binomial & beta binomial.") n_obs = X.shape[0] n_comp = X.shape[1] @@ -210,7 +210,7 @@ def compute_emission_probability_nb_betabinom_mix( mix_p_B * taus[i, s], ) - logger.info("Computed emission probability for *mixed* negative binomial & beta binomial.") + # logger.info("Computed emission probability for *mixed* negative binomial & beta binomial.") return log_emission_rdr, log_emission_baf diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 205e2bd..b47485f 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -59,7 +59,7 @@ def hmrf_reassignment_posterior( posterior = np.zeros((N, n_clones)) - logger.info("Computing hmrf_reassignment_posterior") + logger.info("Computing hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") for i in trange(N, desc="hmrf_reassignment_posterior"): idx = smooth_mat[i, :].nonzero()[1] @@ -132,6 +132,9 @@ def hmrf_reassignment_posterior( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info("Computed hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -157,7 +160,6 @@ def aggr_hmrf_reassignment( Choosing clones by Iterated Conditional Modes (Viterbi version): for which the emission probability of each spot is a single of HMM state sequence. Input format assumption: the RDR/BAF vectors are not shared across clones <- after clone refinement with RDR+BAF signals. - """ N = single_X.shape[2] n_obs = single_X.shape[0] @@ -168,9 +170,11 @@ def aggr_hmrf_reassignment( posterior = np.zeros((N, n_clones)) + logger.info("Computing aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") + for i in trange(N, desc="aggr_hmrf_reassignment"): idx = smooth_mat[i, :].nonzero()[1] - # idx = np.append(idx, np.array([i])) + for c in range(n_clones): tmp_log_emission_rdr, tmp_log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( @@ -224,6 +228,9 @@ def aggr_hmrf_reassignment( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info("Computed aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -256,6 +263,8 @@ def hmrf_reassignment_posterior_concatenate( posterior = np.zeros((N, n_clones)) + logger.info("Computing hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") + for i in trange(N, desc="hmrf_reassignment_posterior_concatenate"): idx = smooth_mat[i, :].nonzero()[1] tmp_log_emission_rdr, tmp_log_emission_baf = ( @@ -329,6 +338,9 @@ def hmrf_reassignment_posterior_concatenate( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info("Computed hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -393,6 +405,9 @@ def aggr_hmrf_reassignment_concatenate( total_llf : float The HMRF objective, which is the sum of log likelihood under the optimal labels plus the sum of edge potentials. 
""" + + logger.info("Computing aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") + N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = int(len(pred) / n_obs) @@ -458,6 +473,9 @@ def aggr_hmrf_reassignment_concatenate( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info("Computed aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: From c11b2530b5e22bcb37c5bb67f115b99198912763 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 09:58:57 -0400 Subject: [PATCH 042/125] update utils hmm logging to show n spots. --- src/calicost/utils_hmm.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 5c4fa29..95be54e 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -464,12 +464,12 @@ def update_emission_params_nb_sitewise_uniqvalues( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ - logger.info("Computing emission params for Negative Binomial (sitewise, unique).") - n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) + logger.info("Computing emission params for Negative Binomial (sitewise, unique) with {n_spots} spots and {n_states} states.") + new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None @@ -631,7 +631,7 @@ def update_emission_params_nb_sitewise_uniqvalues( new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr - logger.info("Computed emission params for Negative Binomial (sitewise, unique).") + logger.info("Computed emission params for Negative Binomial (sitewise, unique) with {n_spots} spots and {n_states} states.") return new_log_mu, new_alphas @@ -661,11 +661,12 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ - logger.info("Computing emission params for Negative Binomial Mix (sitewise, unique).") - n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) + + logger.info("Computing emission params for Negative Binomial Mix (sitewise, unique) for {n_spots} spots and {n_states} states.") + # initialization new_log_mu = ( copy.copy(start_log_mu) @@ -841,7 +842,7 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr - logger.info("Computed emission params for Negative Binomial Mix (sitewise, unique).") + logger.info("Computed emission params for Negative Binomial Mix (sitewise, unique) for {n_spots} spots and {n_states} states.") return new_log_mu, new_alphas @@ -871,11 +872,12 @@ def update_emission_params_bb_sitewise_uniqvalues( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. 
""" - logger.info("Computing emission params for Beta Binomial (sitewise, unique).") - n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) + + logger.info("Computing emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") + # initialization new_p_binom = ( copy.copy(start_p_binom) @@ -1066,7 +1068,7 @@ def update_emission_params_bb_sitewise_uniqvalues( new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob - logger.info("Computed emission params for Beta Binomial (sitewise, unique).") + logger.info("Computed emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") return new_p_binom, new_taus @@ -1097,11 +1099,12 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ - logger.info("Computing emission params for Beta Binomial Mix (sitewise, unique).") - n_spots = len(unique_values) n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) + + logger.info("Computing emission params for Beta Binomial Mix (sitewise, unique) for {n_spots} spots and {n_states} states.") + # initialization new_p_binom = ( copy.copy(start_p_binom) @@ -1322,7 +1325,7 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob - logger.info("Computed emission params for Beta Binomial Mix (sitewise, unique).") + logger.info("Computed emission params for Beta Binomial Mix (sitewise, unique) for {n_spots} spots and {n_states} states.") return new_p_binom, new_taus From 0a3e3d081676844fdcc97c4f9a5927c76c5a3642 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 13:36:51 -0400 Subject: [PATCH 043/125] fix logging error from multiple args --- src/calicost/hmm_NB_BB_nophasing_v2.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index fcd7c67..fe13c5f 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -567,19 +567,18 @@ def run_baum_welch_nb_bb( # check convergence logger.info( - "EM convergence metrics (v2)", - np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), - np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), - np.mean(np.abs(new_log_mu - log_mu)), - np.mean(np.abs(new_p_binom - p_binom)), + f"EM convergence metrics (v2): {np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob)))}, {np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat)))}, {np.mean(np.abs(new_log_mu - log_mu))}, {np.mean(np.abs(new_p_binom - p_binom))}" ) + logger.info(np.hstack([new_log_mu, new_p_binom])) + if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol ): break + log_startprob = new_log_startprob log_transmat = new_log_transmat log_mu = new_log_mu From 7118f8d83ef298a5a1ffb6cb7189b504e990e20d Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Fri, 16 Aug 2024 14:06:46 -0400 Subject: [PATCH 044/125] fix typo in hmrf logging --- src/calicost/hmrf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index b47485f..1305f8e 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -709,7 +709,7 @@ def hmrf_pipeline( # clone assignmment if nodepotential == "max": - logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfix_reassignment.") + logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment.") new_assignment, single_llf, total_llf = aggr_hmrf_reassignment( single_X, @@ -726,7 +726,7 @@ def hmrf_pipeline( hmmclass=hmmclass, ) elif nodepotential == "weighted_sum": - logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfix_reassignment_posterior.") + logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior.") new_assignment, single_llf, total_llf = hmrf_reassignment_posterior( single_X, @@ -2060,7 +2060,7 @@ def hmrfmix_concatenate_pipeline( # NB HMRF clone assignmment if nodepotential == "max": - logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfix_reassignment_concatenate.") + logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment_concatenate.") new_assignment, single_llf, total_llf = ( aggr_hmrfmix_reassignment_concatenate( @@ -2080,7 +2080,7 @@ def hmrfmix_concatenate_pipeline( ) ) elif nodepotential == "weighted_sum": - logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfix_reassignment_posterior_concatenate.") + logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate.") new_assignment, single_llf, total_llf = ( hmrfmix_reassignment_posterior_concatenate( From 41e0740f6619868a487e77934f612382bbb18f96 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 14:11:47 -0400 Subject: [PATCH 045/125] update logging strings. --- src/calicost/hmrf.py | 187 +++++++++++++++++++++++++++---------------- 1 file changed, 117 insertions(+), 70 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 1305f8e..cf1d68c 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -27,6 +27,7 @@ # Pure clone ############################################################ + def hmrf_reassignment_posterior( single_X, single_base_nb_mean, @@ -56,11 +57,13 @@ def hmrf_reassignment_posterior( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) # node potential new_assignment = copy.copy(prev_assignment) - + posterior = np.zeros((N, n_clones)) - logger.info("Computing hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + f"Computing hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass} for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." + ) + for i in trange(N, desc="hmrf_reassignment_posterior"): idx = smooth_mat[i, :].nonzero()[1] for c in range(n_clones): @@ -133,8 +136,10 @@ def hmrf_reassignment_posterior( ) ) - logger.info("Computed hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computed hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." 
+ ) + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -167,11 +172,13 @@ def aggr_hmrf_reassignment( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - + posterior = np.zeros((N, n_clones)) - logger.info("Computing aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computing aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + for i in trange(N, desc="aggr_hmrf_reassignment"): idx = smooth_mat[i, :].nonzero()[1] @@ -229,8 +236,10 @@ def aggr_hmrf_reassignment( ) ) - logger.info("Computed aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computed aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -260,11 +269,13 @@ def hmrf_reassignment_posterior_concatenate( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - + posterior = np.zeros((N, n_clones)) - logger.info("Computing hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computing hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + for i in trange(N, desc="hmrf_reassignment_posterior_concatenate"): idx = smooth_mat[i, :].nonzero()[1] tmp_log_emission_rdr, tmp_log_emission_baf = ( @@ -339,8 +350,10 @@ def hmrf_reassignment_posterior_concatenate( ) ) - logger.info("Computed hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computed hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -406,15 +419,17 @@ def aggr_hmrf_reassignment_concatenate( The HMRF objective, which is the sum of log likelihood under the optimal labels plus the sum of edge potentials. """ - logger.info("Computing aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computing aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = int(len(pred) / n_obs) n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - + posterior = np.zeros((N, n_clones)) for i in trange(N, desc="aggr_hmrf_reassignment_concatenate"): @@ -474,8 +489,10 @@ def aggr_hmrf_reassignment_concatenate( ) ) - logger.info("Computed aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}.") - + logger.info( + "Computed aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -492,7 +509,7 @@ def merge_by_minspots( threshold=0.5, ): logger.info("Merging by min. spots.") - + n_clones = len(np.unique(assignment)) if n_clones == 1: merged_groups = [[assignment[0]]] @@ -575,9 +592,9 @@ def merge_by_minspots( for c in merging_groups ] ) - + logger.info("Merged by min. 
spots.") - + return merging_groups, merged_res @@ -616,13 +633,13 @@ def hmrf_pipeline( spatial_weight=1.0, ): logger.info("Solving hmrf_pipeline.") - + n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) - + # NB checking input assert not (coords is None and adjacency_mat is None) - + if adjacency_mat is None: adjacency_mat = compute_adjacency_mat(coords, unit_xsquared, unit_ysquared) if sample_ids is None: @@ -636,14 +653,14 @@ def hmrf_pipeline( log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) logger.info("Merging pseudobulk by clone index") - + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index ) if (init_log_mu is None) or (init_p_binom is None): logger.info("Initializing HMM parameters by GMM") - + init_log_mu, init_p_binom = initialization_by_gmm( n_states, X, @@ -656,7 +673,7 @@ def hmrf_pipeline( ) else: logger.info("Using provided HMM initialization parameters") - + # NB initialization parameters for HMM if ("m" in params) and ("p" in params): last_log_mu = init_log_mu @@ -674,12 +691,12 @@ def hmrf_pipeline( last_assignment[idx] = c logger.info(f"Computing HMM for {max_iter_outer} iterations.") - + for r in range(max_iter_outer): # NB initialize with the parameters of last iteration if not Path(f"{outdir}/round{r}_nstates{n_states}_{params}.npz").exists(): logger.info(f"Computing HMM iteration {r}.") - + res = pipeline_baum_welch( None, X, @@ -704,13 +721,15 @@ def hmrf_pipeline( max_iter=max_iter, tol=tol, ) - + pred = np.argmax(res["log_gamma"], axis=0) - + # clone assignmment if nodepotential == "max": - logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment.") - + logger.info( + "Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment." + ) + new_assignment, single_llf, total_llf = aggr_hmrf_reassignment( single_X, single_base_nb_mean, @@ -726,8 +745,10 @@ def hmrf_pipeline( hmmclass=hmmclass, ) elif nodepotential == "weighted_sum": - logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior.") - + logger.info( + "Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior." 
+ ) + new_assignment, single_llf, total_llf = hmrf_reassignment_posterior( single_X, single_base_nb_mean, @@ -743,26 +764,28 @@ def hmrf_pipeline( ) else: raise ValueError("Unknown mode for nodepotential!") - + # NB handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment remaining_clones = np.sort(np.unique(new_assignment)) re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) - + res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf - logger.info(f"Writing HMM iteration {r} to {outdir}/round{r}_nstates{n_states}_{params}.npz") - + logger.info( + f"Writing HMM iteration {r} to {outdir}/round{r}_nstates{n_states}_{params}.npz" + ) + np.savez(f"{outdir}/round{r}_nstates{n_states}_{params}.npz", **res) else: logger.info(f"Loading pre-computed HMM results for iteration {r}.") logger.info(f"Loading {outdir}/round{r}_nstates{n_states}_{params}.npz") - + res = np.load(f"{outdir}/round{r}_nstates{n_states}_{params}.npz") logger.info(f"Regrouping to pseudobulk for iteration {r}.") @@ -771,7 +794,7 @@ def hmrf_pipeline( np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) ] - + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, clone_index ) @@ -801,26 +824,26 @@ def hmrf_pipeline( np.mean(np.abs(last_p_binom - res["new_p_binom"])), ) ) - + logger.info( "outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) - + if ( adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1 ): break - + last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] last_alphas = res["new_alphas"] last_taus = res["new_taus"] last_assignment = res["new_assignment"] log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) - + for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] this_persample_weight = np.bincount( @@ -932,7 +955,7 @@ def hmrf_concatenate_pipeline( logger.info(f"Computing HMM for {max_iter_outer} iterations.") for r in range(max_iter_outer): - # NB assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. + # NB assuming file f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" exists. # When r == 0, f"{outdir}/{prefix}_nstates{n_states}_{params}.npz" should # contain two keys: "num_iterations" and f"round_-1_assignment" for clone initialization logger.info(f"Loading {outdir}/{prefix}_nstates{n_states}_{params}.npz") @@ -993,7 +1016,9 @@ def hmrf_concatenate_pipeline( # NB HMRF clone assignmment if nodepotential == "max": - logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrf_reassignment_concatenate.") + logger.info( + "Assigning HMRF clone with nodepotential=max & aggr_hmrf_reassignment_concatenate." + ) new_assignment, single_llf, total_llf = ( aggr_hmrf_reassignment_concatenate( @@ -1012,7 +1037,9 @@ def hmrf_concatenate_pipeline( ) ) elif nodepotential == "weighted_sum": - logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrf_reassignment_posterior_concatenate.") + logger.info( + "Assigning HMRF clone with nodepotential=weighted_sum & hmrf_reassignment_posterior_concatenate." 
+ ) new_assignment, single_llf, total_llf = ( hmrf_reassignment_posterior_concatenate( @@ -1043,7 +1070,7 @@ def hmrf_concatenate_pipeline( ) res["log_gamma"] = res["log_gamma"][:, concat_idx] res["pred_cnv"] = res["pred_cnv"][concat_idx] - + res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf @@ -1059,10 +1086,12 @@ def hmrf_concatenate_pipeline( allres["num_iterations"] = r + 1 - logger.info(f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz") + logger.info( + f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" + ) np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) - + logger.info(f"Regrouping to pseudobulk for iteration {r}.") clone_index = [ @@ -1072,7 +1101,7 @@ def hmrf_concatenate_pipeline( X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, clone_index ) - + if "mp" in params: logger.info( "outer iteration {}: difference between parameters = {}, {}".format( @@ -1218,9 +1247,10 @@ def aggr_hmrfmix_reassignment( + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) - # - # compute total log likelihood log P(X | Z) + log P(Z) + + # NB compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) + for i in range(N): total_llf += np.sum( spatial_weight @@ -1255,14 +1285,19 @@ def hmrfmix_reassignment_posterior( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + + logger.info( + f"Computing hmrfmix_reassignment_posterior for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." + ) + lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) - # + posterior = np.zeros((N, n_clones)) for i in trange(N): idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] + for c in range(n_clones): if np.sum(single_base_nb_mean) > 0: this_pred_cnv = res["pred_cnv"][:, c] @@ -1277,6 +1312,7 @@ def hmrfmix_reassignment_posterior( } else: kwargs = {} + tmp_log_emission_rdr, tmp_log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:, :, idx], axis=2, keepdims=True), @@ -1290,6 +1326,7 @@ def hmrfmix_reassignment_posterior( **kwargs, ) ) + if ( np.sum(single_base_nb_mean[:, idx] > 0) > 0 and np.sum(single_total_bb_RD[:, idx] > 0) > 0 @@ -1299,7 +1336,7 @@ def hmrfmix_reassignment_posterior( * np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) / np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) ) - # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) + single_llf[i, c] = ratio_nonzeros * np.sum( scipy.special.logsumexp( tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, :, c], @@ -1327,20 +1364,21 @@ def hmrfmix_reassignment_posterior( w_node = single_llf[i, :] w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) + for j in adjacency_mat[i, :].nonzero()[1]: if new_assignment[j] >= 0: - # w_edge[new_assignment[j]] += 1 w_edge[new_assignment[j]] += adjacency_mat[i, j] new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) - # + posterior[i, :] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) - # compute total log likelihood log P(X | Z) + log P(Z) + # NB compute total log likelihood log P(X | Z) + 
log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) + for i in range(N): total_llf += np.sum( spatial_weight @@ -1348,6 +1386,9 @@ def hmrfmix_reassignment_posterior( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info(f"Computed hmrfmix_reassignment_posterior.") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -1949,7 +1990,7 @@ def hmrfmix_concatenate_pipeline( # NB baseline proportion of UMI counts lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) - + if (init_log_mu is None) or (init_p_binom is None): logger.info("Initializing HMM parameters by GMM") @@ -2057,10 +2098,12 @@ def hmrfmix_concatenate_pipeline( ) pred = np.argmax(res["log_gamma"], axis=0) - + # NB HMRF clone assignmment if nodepotential == "max": - logger.info("Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment_concatenate.") + logger.info( + "Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment_concatenate." + ) new_assignment, single_llf, total_llf = ( aggr_hmrfmix_reassignment_concatenate( @@ -2080,7 +2123,9 @@ def hmrfmix_concatenate_pipeline( ) ) elif nodepotential == "weighted_sum": - logger.info("Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate.") + logger.info( + "Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate." + ) new_assignment, single_llf, total_llf = ( hmrfmix_reassignment_posterior_concatenate( @@ -2126,10 +2171,12 @@ def hmrfmix_concatenate_pipeline( allres[f"round{r}_{k}"] = v allres["num_iterations"] = r + 1 - logger.info(f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz") + logger.info( + f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" + ) np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) - + logger.info(f"Regrouping to pseudobulk for iteration {r}.") clone_index = [ @@ -2144,7 +2191,7 @@ def hmrfmix_concatenate_pipeline( single_tumor_prop, threshold=tumorprop_threshold, ) - + if "mp" in params: logger.info( "outer iteration {}: difference between parameters = {}, {}".format( @@ -2165,7 +2212,7 @@ def hmrfmix_concatenate_pipeline( r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) ) ) - + logger.info( "outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) From ffe9f5d52dd0cdb53e8f7589ff8c2caa1761cff5 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Fri, 16 Aug 2024 14:27:29 -0400 Subject: [PATCH 046/125] logging edits --- src/calicost/hmm_NB_BB_nophasing_v2.py | 6 +++--- src/calicost/hmrf.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index fe13c5f..ab60265 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -565,12 +565,9 @@ def run_baum_welch_nb_bb( new_p_binom = p_binom new_taus = taus - # check convergence logger.info( f"EM convergence metrics (v2): {np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob)))}, {np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat)))}, {np.mean(np.abs(new_log_mu - log_mu))}, {np.mean(np.abs(new_p_binom - p_binom))}" ) - - logger.info(np.hstack([new_log_mu, new_p_binom])) if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol @@ -588,6 +585,9 @@ def run_baum_welch_nb_bb( logger.info("Computed Baum-Welch (v2).") + logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") + logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") + return ( new_log_mu, new_alphas, diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index cf1d68c..8d177dc 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -724,7 +724,7 @@ def hmrf_pipeline( pred = np.argmax(res["log_gamma"], axis=0) - # clone assignmment + # NB clone assignmment if nodepotential == "max": logger.info( "Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment." From c75ec89d4ace3861f84a7ab2b2ce681dc6b4e70c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 14:30:24 -0400 Subject: [PATCH 047/125] log hmrfmix_reassignment_posterior_concatenate. --- src/calicost/hmrf.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 8d177dc..04f32bf 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1705,8 +1705,13 @@ def hmrfmix_reassignment_posterior_concatenate( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + + logger.info( + f"Computing hmrfmix_reassignment_posterior_concatenate for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." 
+ ) + lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) + if np.sum(single_base_nb_mean) > 0: logmu_shift = [] for c in range(n_clones): @@ -1729,7 +1734,7 @@ def hmrfmix_reassignment_posterior_concatenate( } else: kwargs = {} - # + posterior = np.zeros((N, n_clones)) for i in trange(N): @@ -1759,7 +1764,7 @@ def hmrfmix_reassignment_posterior_concatenate( * np.sum(single_total_bb_RD[:, i : (i + 1)] > 0) / np.sum(single_base_nb_mean[:, i : (i + 1)] > 0) ) - # ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0) + single_llf[i, c] = ratio_nonzeros * np.sum( scipy.special.logsumexp( tmp_log_emission_rdr[:, :, 0] @@ -1790,19 +1795,20 @@ def hmrfmix_reassignment_posterior_concatenate( w_node = single_llf[i, :] w_node += log_persample_weights[:, sample_ids[i]] w_edge = np.zeros(n_clones) + for j in adjacency_mat[i, :].nonzero()[1]: - # w_edge[new_assignment[j]] += 1 w_edge[new_assignment[j]] += adjacency_mat[i, j] new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) - # + posterior[i, :] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) - # compute total log likelihood log P(X | Z) + log P(Z) + # NB compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) + for i in range(N): total_llf += np.sum( spatial_weight @@ -1810,6 +1816,9 @@ def hmrfmix_reassignment_posterior_concatenate( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info(f"Computed hmrfmix_reassignment_posterior_concatenate.") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: From 8f3eb261be5f7a9af70727b0dbe0e1ff297b12ec Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 14:37:43 -0400 Subject: [PATCH 048/125] more logging improvements. remove deprecated code. 
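
Note: across this series the ad-hoc print() dumps are being replaced by calls on a module logger, and the fitted-parameter printouts are collapsed into a single np.hstack table. A minimal sketch of the assumed pattern follows; the handler/format configuration (basicConfig below) is not part of these patches and is only an illustrative assumption.

    import logging

    import numpy as np

    # Assumed module-level logger; the project's actual handler setup is not
    # shown in this series, so basicConfig() here is a placeholder.
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(name)s %(levelname)s: %(message)s")
    logger = logging.getLogger(__name__)

    def log_fitted_params(new_log_mu, new_p_binom):
        # Both arrays are (n_states, n_spots); hstack places them side by side,
        # so each logged row reads (log_mu per spot ..., p_binom per spot ...).
        logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}")
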
--- src/calicost/hmm_NB_BB_phaseswitch.py | 134 +------------------------- src/calicost/hmrf.py | 18 ++-- 2 files changed, 14 insertions(+), 138 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index f1aefc9..d2683a4 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -859,8 +859,8 @@ def pipeline_baum_welch( init_log_mu = tmp_log_mu if (init_p_binom is None) and ("p" in params): init_p_binom = tmp_p_binom - print(f"init_log_mu = {init_log_mu}") - print(f"init_p_binom = {init_p_binom}") + + logger.info(f"Initial (mu, p):\n{np.hstack([init_log_mu, init_p_binom])}") # fit HMM-NB-BetaBinom # new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat = hmmmodel.run_baum_welch_nb_bb(X, lengths, \ @@ -1426,11 +1426,12 @@ def combine_similar_states_across_clones( n_states = res["new_p_binom"].shape[0] reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states - # + all_test_statistics = compute_neymanpearson_stats( X, base_nb_mean, total_bb_RD, res, params, tumor_prop, hmmclass ) - # make the pair of states consistent between clone c1 and clone c2 if their t_neymanpearson test statistics is small + + # NB make the pair of states consistent between clone c1 and clone c2 if their t_neymanpearson test statistics is small for c1 in range(n_clones): for c2 in range(c1 + 1, n_clones): list_t_neymanpearson = all_test_statistics[(c1, c2)] @@ -1455,128 +1456,3 @@ def combine_similar_states_across_clones( f"Merging states {[p1,p2]} in clone {c1} and clone {c2}. NP statistics = {t_neymanpearson}" ) return res - - -# def similarity_components_rdrbaf_neymanpearson_posterior(X, base_nb_mean, total_bb_RD, res, threshold=2.0, minlength=10, topk=10, params="smp", tumor_prop=None, hmmclass=hmm_sitewise): -# n_obs = X.shape[0] -# n_states = res["new_p_binom"].shape[0] -# n_clones = X.shape[2] -# G = nx.Graph() -# G.add_nodes_from( np.arange(n_clones) ) -# # -# def eval_neymanpearson_bafonly(log_emission_baf_c1, log_gamma_c1, log_emission_baf_c2, log_gamma_c2, bidx, n_states, res, p): -# assert log_emission_baf_c1.shape[0] == n_states or log_emission_baf_c1.shape[0] == 2 * n_states -# # likelihood under the corresponding state -# llf_original = np.append(scipy.special.logsumexp(log_emission_baf_c1[:, bidx] + log_gamma_c1[:, bidx], axis=0), -# scipy.special.logsumexp(log_emission_baf_c2[:, bidx] + log_gamma_c2[:, bidx], axis=0)) -# # likelihood under the switched state -# if log_emission_baf_c1.shape[0] == 2 * n_states: -# whether_switch = False -# pred_c1 = np.argmax(log_gamma_c1[:,bidx[0]]) -# pred_c2 = np.argmax(log_gamma_c2[:,bidx[0]]) -# if ( ((res["new_p_binom"][p[0],0] > 0.5) == (res["new_p_binom"][p[1],0] > 0.5)) ^ ((pred_c1 < n_states) == (pred_c2 < n_states)) ): -# whether_switch = True -# if not whether_switch: -# switch_log_gamma_c1 = log_gamma_c2 -# switch_log_gamma_c2 = log_gamma_c1 -# else: -# switch_log_gamma_c1 = np.vstack([log_gamma_c2[:n_states,:], log_gamma_c2[n_states:,:]]) -# switch_log_gamma_c2 = np.vstack([log_gamma_c1[:n_states,:], log_gamma_c1[n_states:,:]]) -# else: -# switch_log_gamma_c1 = log_gamma_c2 -# switch_log_gamma_c2 = log_gamma_c1 -# llf_switch = np.append(scipy.special.logsumexp(log_emission_baf_c1[:, bidx] + switch_log_gamma_c1[:, bidx], axis=0), -# scipy.special.logsumexp(log_emission_baf_c2[:, bidx] + switch_log_gamma_c2[:, bidx], axis=0)) -# # log likelihood difference -# return 
np.mean(llf_original) - np.mean(llf_switch) -# # -# def eval_neymanpearson_rdrbaf(log_emission_rdr_c1, log_emission_baf_c1, log_gamma_c1, log_emission_rdr_c2, log_emission_baf_c2, log_gamma_c2, bidx, n_states, res, p): -# assert log_emission_baf_c1.shape[0] == n_states or log_emission_baf_c1.shape[0] == 2 * n_states -# # likelihood under the corresponding state -# llf_original = 0.5 * np.append(scipy.special.logsumexp((log_emission_rdr_c1+log_emission_baf_c1)[:, bidx] + log_gamma_c1[:, bidx], axis=0), \ -# scipy.special.logsumexp((log_emission_rdr_c2+log_emission_baf_c2)[:, bidx] + log_gamma_c2[:, bidx], axis=0)) -# # likelihood under the switched state -# if log_emission_baf_c1.shape[0] == 2 * n_states: -# whether_switch = False -# pred_c1 = np.argmax(log_gamma_c1[:,bidx[0]]) -# pred_c2 = np.argmax(log_gamma_c2[:,bidx[0]]) -# if ( ((res["new_p_binom"][p[0],0] > 0.5) == (res["new_p_binom"][p[1],0] > 0.5)) ^ ((pred_c1 < n_states) == (pred_c2 < n_states)) ): -# whether_switch = True -# if not whether_switch: -# switch_log_gamma_c1 = log_gamma_c2 -# switch_log_gamma_c2 = log_gamma_c1 -# else: -# switch_log_gamma_c1 = np.vstack([log_gamma_c2[:n_states,:], log_gamma_c2[n_states:,:]]) -# switch_log_gamma_c2 = np.vstack([log_gamma_c1[:n_states,:], log_gamma_c1[n_states:,:]]) -# else: -# switch_log_gamma_c1 = log_gamma_c2 -# switch_log_gamma_c2 = log_gamma_c1 -# llf_switch = 0.5 * np.append(scipy.special.logsumexp((log_emission_rdr_c1+log_emission_baf_c1)[:, bidx] + switch_log_gamma_c1[:, bidx], axis=0), \ -# scipy.special.logsumexp((log_emission_rdr_c2+log_emission_baf_c2)[:, bidx] + switch_log_gamma_c2[:, bidx], axis=0)) -# # log likelihood difference -# return np.mean(llf_original) - np.mean(llf_switch) -# # -# if tumor_prop is None: -# log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ -# base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ -# total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"]) -# else: -# log_emission_rdr, log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(np.vstack([X[:,0,:].flatten("F"), X[:,1,:].flatten("F")]).T.reshape(-1,2,1), \ -# base_nb_mean.flatten("F").reshape(-1,1), res["new_log_mu"], res["new_alphas"], \ -# total_bb_RD.flatten("F").reshape(-1,1), res["new_p_binom"], res["new_taus"], tumor_prop) -# log_emission_rdr = log_emission_rdr.reshape((log_emission_rdr.shape[0], n_obs, n_clones), order="F") -# log_emission_baf = log_emission_baf.reshape((log_emission_baf.shape[0], n_obs, n_clones), order="F") -# reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2],-1)) -# reshaped_pred_cnv = reshaped_pred % n_states -# reshaped_log_gamma = np.stack([ res["log_gamma"][:,(c*n_obs):(c*n_obs + n_obs)] for c in range(n_clones) ], axis=-1) -# for c1 in range(n_clones): -# for c2 in range(c1+1, n_clones): -# # unmergeable_bincount = 0 -# unique_pair_states = [x for x in np.unique(reshaped_pred_cnv[np.array([c1,c2]), :], axis=1).T if x[0] != x[1]] -# list_t_neymanpearson = [] -# for p in unique_pair_states: -# bidx = np.where( (reshaped_pred_cnv[c1,:]==p[0]) & (reshaped_pred_cnv[c2,:]==p[1]) )[0] -# if "m" in params and "p" in params: -# t_neymanpearson = eval_neymanpearson_rdrbaf(log_emission_rdr[:,:,c1], log_emission_baf[:,:,c1], reshaped_log_gamma[:,:,c1], log_emission_rdr[:,:,c2], log_emission_baf[:,:,c2], reshaped_log_gamma[:,:,c2], bidx, n_states, res, p) -# elif "p" in 
params: -# t_neymanpearson = eval_neymanpearson_bafonly(log_emission_baf[:,:,c1], reshaped_log_gamma[:,:,c1], log_emission_baf[:,:,c2], reshaped_log_gamma[:,:,c2], bidx, n_states, res, p) -# # if t_neymanpearson > threshold: -# # unmergeable_bincount += len(bidx) -# print(c1, c2, p, len(bidx), t_neymanpearson) -# if len(bidx) >= minlength: -# list_t_neymanpearson.append(t_neymanpearson) -# if len(list_t_neymanpearson) == 0 or np.max(list_t_neymanpearson) < threshold: -# max_v = np.max(list_t_neymanpearson) if len(list_t_neymanpearson) > 0 else 1e-3 -# G.add_weighted_edges_from([ (c1, c2, max_v) ]) -# # if unmergeable_bincount < topk: -# # G.add_edge(c1, c2) -# # maximal cliques -# cliques = [] -# for x in nx.find_cliques(G): -# this_len = len(x) -# this_weights = np.sum([G.get_edge_data(a,b)["weight"] for a in x for b in x if a != b]) / 2 -# cliques.append( (x, this_len, this_weights) ) -# cliques.sort(key = lambda x:(-x[1],x[2]) ) -# covered_nodes = set() -# merging_groups = [] -# for c in cliques: -# if len(set(c[0]) & covered_nodes) == 0: -# merging_groups.append( list(c[0]) ) -# covered_nodes = covered_nodes | set(c[0]) -# for c in range(n_clones): -# if not (c in covered_nodes): -# merging_groups.append( [c] ) -# covered_nodes.add(c) -# merging_groups.sort(key = lambda x:np.min(x)) -# # clone assignment after merging -# map_clone_id = {} -# for i,x in enumerate(merging_groups): -# for z in x: -# map_clone_id[z] = i -# new_assignment = np.array([map_clone_id[x] for x in res["new_assignment"]]) -# merged_res = copy.copy(res) -# merged_res["new_assignment"] = new_assignment -# merged_res["total_llf"] = np.NAN -# merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) -# merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) -# return merging_groups, merged_res diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 04f32bf..620a59a 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -801,7 +801,7 @@ def hmrf_pipeline( if "mp" in params: logger.info( - "outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( + "Outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu - res["new_log_mu"])), @@ -810,7 +810,7 @@ def hmrf_pipeline( ) elif "m" in params: logger.info( - "outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( + "Outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu - res["new_log_mu"])), @@ -818,7 +818,7 @@ def hmrf_pipeline( ) elif "p" in params: logger.info( - "outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( + "Outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom - res["new_p_binom"])), @@ -826,7 +826,7 @@ def hmrf_pipeline( ) logger.info( - "outer iteration {}: ARI between assignment = {}".format( + "Outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) @@ -1631,7 +1631,7 @@ def hmrfmix_pipeline( # update last parameter if "mp" in params: print( - "outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( + "Outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( r, res["total_llf"], 
np.mean(np.abs(last_log_mu - res["new_log_mu"])), @@ -1640,7 +1640,7 @@ def hmrfmix_pipeline( ) elif "m" in params: print( - "outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( + "Outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_log_mu - res["new_log_mu"])), @@ -1648,18 +1648,18 @@ def hmrfmix_pipeline( ) elif "p" in params: print( - "outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( + "Outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom - res["new_p_binom"])), ) ) print( - "outer iteration {}: ARI between assignment = {}".format( + "Outer iteration {}: ARI between assignment = {}".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) - # if np.all( last_assignment == res["new_assignment"] ): + if ( adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1 From 32c4239289ef212378b661c76eae9bff48dcb7c9 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 14:41:02 -0400 Subject: [PATCH 049/125] add use defaults statement --- src/calicost/hmm_NB_BB_nophasing_v2.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index ab60265..caa94d0 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -338,35 +338,42 @@ def run_baum_welch_nb_bb( n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] + assert n_comp == 2 - logger.info("Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse.") - log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu ) + p_binom = ( np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom ) + # NB initialize (inverse of) dispersion param in NB and BetaBinom alphas = ( 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas ) + taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus + use_defaults = (init_log_mu is None) and (init_p_binom is None) and (init_alphas is None) and (init_taus is None) + + logger.info("Initialized Baum Welch NB logmean shift, BetaBinom prob and dispersion params inverse (use_defaults = {use_defaults}).") + # NB initialize start probability and emission probability log_startprob = np.log(np.ones(n_states) / n_states) + if n_states > 1: transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: log_transmat = np.zeros((1, 1)) - # initialize log_gamma + log_gamma = kwargs["log_gamma"] if "log_gamma" in kwargs else None # NB a trick to speed up BetaBinom optimization: taking only unique From 346376916b44a631d8b5cf94af198bbdae6de0f5 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 14:50:54 -0400 Subject: [PATCH 050/125] edit logging for ARI and bb diff. 
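
The reworded messages below report two convergence diagnostics of the outer HMRF loop: the mean absolute change in the BetaBinom (and NB) parameters between iterations, and the adjusted Rand index (ARI) between successive clone assignments; the loop exits once the ARI exceeds 0.99 or only a single clone remains. A stripped-down sketch of that check, with names following the pipeline but not the full loop body:

    import logging

    import numpy as np
    from sklearn.metrics import adjusted_rand_score

    logger = logging.getLogger(__name__)

    def outer_iteration_converged(r, last_assignment, new_assignment,
                                  last_p_binom, new_p_binom):
        # mean absolute change of the BetaBinom success probabilities
        bb_diff = np.mean(np.abs(last_p_binom - new_p_binom))
        # ARI = 1 means the clone labels are unchanged up to relabelling
        ari = adjusted_rand_score(last_assignment, new_assignment)
        logger.info(f"Outer iteration {r}: BetaBinom parameters mean abs. diff. = {bb_diff}")
        logger.info(f"Outer iteration {r}: ARI between assignment = {ari} (unity is a perfect assignment)")
        return ari > 0.99 or len(np.unique(new_assignment)) == 1
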
--- src/calicost/hmrf.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 620a59a..994f75c 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -727,7 +727,7 @@ def hmrf_pipeline( # NB clone assignmment if nodepotential == "max": logger.info( - "Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment." + "Assigning HMRF clone for iteration {r} with nodepotential=max & aggr_hmrfmix_reassignment." ) new_assignment, single_llf, total_llf = aggr_hmrf_reassignment( @@ -746,7 +746,7 @@ def hmrf_pipeline( ) elif nodepotential == "weighted_sum": logger.info( - "Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior." + "Assigning HMRF clone for iteration {r} with nodepotential=weighted_sum & hmrfmix_reassignment_posterior." ) new_assignment, single_llf, total_llf = hmrf_reassignment_posterior( @@ -818,7 +818,7 @@ def hmrf_pipeline( ) elif "p" in params: logger.info( - "Outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( + "Outer iteration {}: total_llf = {}, BetaBinom parameters mean abs. diff. = {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom - res["new_p_binom"])), @@ -826,7 +826,7 @@ def hmrf_pipeline( ) logger.info( - "Outer iteration {}: ARI between assignment = {}".format( + "Outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) @@ -1017,7 +1017,7 @@ def hmrf_concatenate_pipeline( # NB HMRF clone assignmment if nodepotential == "max": logger.info( - "Assigning HMRF clone with nodepotential=max & aggr_hmrf_reassignment_concatenate." + "Assigning HMRF clone for iteration {r} with nodepotential=max & aggr_hmrf_reassignment_concatenate." ) new_assignment, single_llf, total_llf = ( @@ -1038,7 +1038,7 @@ def hmrf_concatenate_pipeline( ) elif nodepotential == "weighted_sum": logger.info( - "Assigning HMRF clone with nodepotential=weighted_sum & hmrf_reassignment_posterior_concatenate." + "Assigning HMRF clone for iteration {r} with nodepotential=weighted_sum & hmrf_reassignment_posterior_concatenate." ) new_assignment, single_llf, total_llf = ( @@ -1118,13 +1118,13 @@ def hmrf_concatenate_pipeline( ) elif "p" in params: logger.info( - "outer iteration {}: difference between BetaBinom parameters = {}".format( + "outer iteration {}: BetaBinom parameters mean abs. diff. = {}".format( r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) ) ) logger.info( - "outer iteration {}: ARI between assignment = {}".format( + "outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) @@ -1630,7 +1630,7 @@ def hmrfmix_pipeline( # update last parameter if "mp" in params: - print( + logger.info( "Outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( r, res["total_llf"], @@ -1639,7 +1639,7 @@ def hmrfmix_pipeline( ) ) elif "m" in params: - print( + logger.info( "Outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( r, res["total_llf"], @@ -1647,15 +1647,15 @@ def hmrfmix_pipeline( ) ) elif "p" in params: - print( - "Outer iteration {}: total_llf = {}, difference between BetaBinom parameters = {}".format( + logger.info( + "Outer iteration {}: total_llf = {}, BetaBinom mean abs. diff. 
= {}".format( r, res["total_llf"], np.mean(np.abs(last_p_binom - res["new_p_binom"])), ) ) - print( - "Outer iteration {}: ARI between assignment = {}".format( + logger.info( + "Outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) @@ -2111,7 +2111,7 @@ def hmrfmix_concatenate_pipeline( # NB HMRF clone assignmment if nodepotential == "max": logger.info( - "Assigning HMRF clone with nodepotential=max & aggr_hmrfmix_reassignment_concatenate." + "Assigning HMRF clone for iteration {r} with nodepotential=max & aggr_hmrfmix_reassignment_concatenate." ) new_assignment, single_llf, total_llf = ( @@ -2133,7 +2133,7 @@ def hmrfmix_concatenate_pipeline( ) elif nodepotential == "weighted_sum": logger.info( - "Assigning HMRF clone with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate." + "Assigning HMRF clone for iteration {r} with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate." ) new_assignment, single_llf, total_llf = ( @@ -2217,13 +2217,13 @@ def hmrfmix_concatenate_pipeline( ) elif "p" in params: logger.info( - "outer iteration {}: difference between BetaBinom parameters = {}".format( + "Outer iteration {}: BetaBinom parameters mean abs. diff. = {}".format( r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) ) ) logger.info( - "outer iteration {}: ARI between assignment = {}".format( + "outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( r, adjusted_rand_score(last_assignment, res["new_assignment"]) ) ) From b6a95ad7e3e234e212ea74eea3b763bab0459017 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 14:59:13 -0400 Subject: [PATCH 051/125] log neyman pearson. 
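
The function instrumented below, similarity_components_rdrbaf_neymanpearson, decides which clones to merge: for each clone pair it evaluates per-state Neyman-Pearson (likelihood-ratio) statistics, connects pairs whose largest statistic over sufficiently long segments stays below the threshold, and merges maximal cliques of the resulting graph. A stripped-down sketch of the graph-and-clique step is given here; pairwise_max_np is assumed to already hold the maximal statistic per clone pair (the real code derives it from the HMM emission probabilities and posteriors, and also treats pairs with no differing states as mergeable).

    import networkx as nx
    import numpy as np

    def merging_groups_from_np_stats(pairwise_max_np, n_clones, threshold=2.0):
        # pairwise_max_np: assumed dict {(c1, c2): max NP statistic}
        G = nx.Graph()
        G.add_nodes_from(np.arange(n_clones))
        for (c1, c2), max_v in pairwise_max_np.items():
            if max_v < threshold:
                # similar enough to be candidates for merging
                G.add_weighted_edges_from([(c1, c2, max_v)])
        # prefer larger cliques, breaking ties by smaller total edge weight
        cliques = []
        for members in nx.find_cliques(G):
            weight = sum(G[a][b]["weight"] for a in members for b in members if a != b) / 2
            cliques.append((members, len(members), weight))
        cliques.sort(key=lambda c: (-c[1], c[2]))
        covered, merging_groups = set(), []
        for members, _, _ in cliques:
            if not (set(members) & covered):
                merging_groups.append(sorted(members))
                covered |= set(members)
        for c in range(n_clones):
            if c not in covered:
                merging_groups.append([c])
        merging_groups.sort(key=min)
        return merging_groups

Larger cliques are merged first, with smaller summed edge weight breaking ties, so the most mutually similar clone groups collapse before any pairwise merges.
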
--- src/calicost/hmm_NB_BB_phaseswitch.py | 35 ++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index d2683a4..bc9b2a6 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -1236,11 +1236,14 @@ def similarity_components_rdrbaf_neymanpearson( n_obs = X.shape[0] n_states = res["new_p_binom"].shape[0] n_clones = X.shape[2] + + logger.info("Computing similarity_components_rdrbaf_neymanpearson for (n_obs, n_states, n_clones) = ({n_obs}, {n_states}, {n_clones}).") + G = nx.Graph() G.add_nodes_from(np.arange(n_clones)) - # + lambd = np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean) - # + if tumor_prop is None: log_emission_rdr, log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( @@ -1312,10 +1315,11 @@ def similarity_components_rdrbaf_neymanpearson( ) reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states + all_test_statistics = [] + for c1 in range(n_clones): for c2 in range(c1 + 1, n_clones): - # unmergeable_bincount = 0 unique_pair_states = [ x for x in np.unique(reshaped_pred_cnv[np.array([c1, c2]), :], axis=1).T @@ -1327,6 +1331,7 @@ def similarity_components_rdrbaf_neymanpearson( (reshaped_pred_cnv[c1, :] == p[0]) & (reshaped_pred_cnv[c2, :] == p[1]) )[0] + if "m" in params and "p" in params: t_neymanpearson = eval_neymanpearson_rdrbaf( log_emission_rdr[:, :, c1], @@ -1351,8 +1356,12 @@ def similarity_components_rdrbaf_neymanpearson( res, p, ) - print(c1, c2, p, len(bidx), t_neymanpearson) + + # TODO + logger.info(f"{c1}, {c2}, {p}, {len(bidx)}, {t_neymanpearson}") + all_test_statistics.append([c1, c2, p, t_neymanpearson]) + if len(bidx) >= minlength: list_t_neymanpearson.append(t_neymanpearson) if ( @@ -1365,8 +1374,11 @@ def similarity_components_rdrbaf_neymanpearson( else 1e-3 ) G.add_weighted_edges_from([(c1, c2, max_v)]) - # maximal cliques + + logger.info("Computing Maximal cliques.") + cliques = [] + for x in nx.find_cliques(G): this_len = len(x) this_weights = ( @@ -1374,23 +1386,31 @@ def similarity_components_rdrbaf_neymanpearson( / 2 ) cliques.append((x, this_len, this_weights)) + cliques.sort(key=lambda x: (-x[1], x[2])) + covered_nodes = set() merging_groups = [] + for c in cliques: if len(set(c[0]) & covered_nodes) == 0: merging_groups.append(list(c[0])) covered_nodes = covered_nodes | set(c[0]) + for c in range(n_clones): if not (c in covered_nodes): merging_groups.append([c]) covered_nodes.add(c) + merging_groups.sort(key=lambda x: np.min(x)) - # clone assignment after merging + + # NB clone assignment after merging map_clone_id = {} + for i, x in enumerate(merging_groups): for z in x: map_clone_id[z] = i + new_assignment = np.array([map_clone_id[x] for x in res["new_assignment"]]) merged_res = copy.copy(res) merged_res["new_assignment"] = new_assignment @@ -1407,6 +1427,9 @@ def similarity_components_rdrbaf_neymanpearson( for c in merging_groups ] ) + + logger.info("Computed similarity_components_rdrbaf_neymanpearson.") + return merging_groups, merged_res From 79796c63258b99d9f9857f44b058cabedd8bb0eb Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 15:05:09 -0400 Subject: [PATCH 052/125] improved logging. 
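
The utils_IO.py changes below add start/finish logging around filter_de_genes_tri, which labels spots per sample before deciding which high-UMI differentially expressed genes to drop from the per-bin RDR counts: non-normal spots are split into "tumor" and "unsure" by 2-means clustering on a 4-component PCA of the log-normalized counts. A rough sketch of that labeling step under stated assumptions; the choice of the reference cluster (ref below) and the default "normal" label are assumptions, since that part of the function is not reproduced in full in this hunk.

    import numpy as np
    import scanpy as sc
    from sklearn.cluster import KMeans

    def label_spots_for_de_filtering(tmpadata):
        # median library size as the normalization target, as in the diff below
        med = np.median(np.sum(tmpadata.layers["count"], axis=1))
        sc.pp.normalize_total(tmpadata, target_sum=med)
        sc.pp.log1p(tmpadata)
        sc.pp.pca(tmpadata, n_comps=4)

        kmeans_labels = KMeans(n_clusters=2, random_state=0).fit_predict(tmpadata.obsm["X_pca"])
        normal = tmpadata.obs["normal_candidate"].to_numpy(dtype=bool)

        # assumption: the cluster holding most normal candidates is the reference
        ref = np.argmax([np.sum(normal[kmeans_labels == k]) for k in (0, 1)])

        clone = np.full(tmpadata.shape[0], "normal", dtype=object)
        clone[(kmeans_labels != ref) & (~normal)] = "tumor"
        clone[(kmeans_labels == ref) & (~normal)] = "unsure"
        tmpadata.obs["clone"] = clone
        return tmpadata
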
--- src/calicost/calicost_main.py | 1 + src/calicost/utils_IO.py | 45 +++++++++++++++++------------------ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 9e52821..12be38d 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -288,6 +288,7 @@ def main(configuration_file): threshold=config["tumorprop_threshold"], ) + # TODO merging groups? logger.info( f"BAF clone merging after requiring minimum # spots: {merging_groups}" ) diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index d570a84..2f0ee47 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -1713,21 +1713,26 @@ def filter_de_genes_tri( df_bininfo : pd.DataFrame Contains columns ['CHR', 'START', 'END', 'INCLUDED_GENES', 'INCLUDED_SNP_IDS'], 'INCLUDED_GENES' contains space-delimited gene names. """ + + logger.info("Computing filter_de_genes_tri.") + adata = anndata.AnnData(exp_counts) adata.layers["count"] = exp_counts.values adata.obs["normal_candidate"] = normal_candidate - # + map_gene_adatavar = {} map_gene_umi = {} list_gene_umi = np.sum(adata.layers["count"], axis=0) + for i, x in enumerate(adata.var.index): map_gene_adatavar[x] = i map_gene_umi[x] = list_gene_umi[i] - # + if sample_list is None: sample_list = [None] - # + filtered_out_set = set() + for s, sname in enumerate(sample_list): if sname is None: index = np.arange(adata.shape[0]) @@ -1739,19 +1744,19 @@ def filter_de_genes_tri( < tmpadata.shape[1] * 10 ): continue - # + umi_threshold = np.percentile( np.sum(tmpadata.layers["count"], axis=0), quantile_threshold ) - # - # sc.pp.filter_cells(tmpadata, min_genes=200) + sc.pp.filter_genes(tmpadata, min_cells=10) med = np.median(np.sum(tmpadata.layers["count"], axis=1)) - # sc.pp.normalize_total(tmpadata, target_sum=1e4) + sc.pp.normalize_total(tmpadata, target_sum=med) sc.pp.log1p(tmpadata) - # new added + sc.pp.pca(tmpadata, n_comps=4) + kmeans = KMeans(n_clusters=2, random_state=0).fit(tmpadata.obsm["X_pca"]) kmeans_labels = kmeans.predict(tmpadata.obsm["X_pca"]) idx_kmeans_label = np.argmax( @@ -1761,23 +1766,13 @@ def filter_de_genes_tri( clone[ (kmeans_labels != idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) ] = "tumor" + ### third part ### clone[ (kmeans_labels == idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) ] = "unsure" tmpadata.obs["clone"] = clone - # end added - # sc.tl.rank_genes_groups(tmpadata, 'clone', groups=["tumor", "unsure"], reference="normal", method='wilcoxon') - # # DE and log fold change comparing tumor and normal - # genenames_t = np.array([ x[0] for x in tmpadata.uns["rank_genes_groups"]["names"] ]) - # logfc_t = np.array([ x[0] for x in tmpadata.uns["rank_genes_groups"]["logfoldchanges"] ]) - # geneumis_t = np.array([ map_gene_umi[x] for x in genenames_t]) - # # DE and log fold change comparing unsure and normal - # genenames_u = np.array([ x[1] for x in tmpadata.uns["rank_genes_groups"]["names"] ]) - # logfc_u = np.array([ x[1] for x in tmpadata.uns["rank_genes_groups"]["logfoldchanges"] ]) - # geneumis_u = np.array([ map_gene_umi[x] for x in genenames_u]) - # this_filtered_out_set = set(list(genenames_t[ (np.abs(logfc_t) > logfcthreshold) & (geneumis_t > umi_threshold) ])) | set(list(genenames_u[ (np.abs(logfc_u) > logfcthreshold) & (geneumis_u > umi_threshold) ])) - # + agg_counts = np.vstack( [ np.sum(tmpadata.layers["count"][tmpadata.obs["clone"] == c, :], axis=0) @@ -1810,10 +1805,12 @@ def filter_de_genes_tri( ) ) filtered_out_set 
= filtered_out_set | this_filtered_out_set - print(f"Filter out {len(filtered_out_set)} DE genes") - # - # remove genes that are in filtered_out_set + + logger.info(f"Filtered {len(filtered_out_set)} differentially expressed genes.") + + # NB remove genes that are in filtered_out_set new_single_X_rdr = np.zeros((df_bininfo.shape[0], adata.shape[0])) + for b, genestr in enumerate(df_bininfo.INCLUDED_GENES.values): # RDR (genes) involved_genes = set(genestr.split(" ")) - filtered_out_set @@ -1821,6 +1818,8 @@ def filter_de_genes_tri( adata.layers["count"][:, adata.var.index.isin(involved_genes)], axis=1 ) + logger.info("Computed filter_de_genes_tri.") + return new_single_X_rdr, filtered_out_set From 80f9c21ad16eaedda41545116f20d143be635140 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 15:10:02 -0400 Subject: [PATCH 053/125] update logging. --- src/calicost/hmrf.py | 2 +- src/calicost/utils_hmm.py | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 994f75c..baf00f6 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1967,7 +1967,7 @@ def hmrfmix_concatenate_pipeline( spatial_weight=1.0 / 6, tumorprop_threshold=0.5, ): - logger.info("Solving hmrfix_concatenate_pipeline.") + logger.info("Solving hmrfmix_concatenate_pipeline.") n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 95be54e..d16193a 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -1810,13 +1810,15 @@ def update_emission_params_bb_nophasing_uniqvalues( n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) - # initialization + + # NB initialization new_p_binom = ( copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 ) new_taus = copy.copy(taus) + if fix_BB_dispersion: for s in np.arange(len(unique_values)): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A @@ -1892,6 +1894,7 @@ def update_emission_params_bb_nophasing_uniqvalues( weights = [] features = [] state_posweights = [] + for s in np.arange(len(unique_values)): idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) @@ -1923,18 +1926,25 @@ def update_emission_params_bb_nophasing_uniqvalues( this_features[idx_row_posweight, :][:, idx_state_posweight] ) state_posweights.append(idx_state_posweight) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) + + + model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_taus[:, :] = res.params[-1] + if not (start_p_binom is None): res2 = model.fit( disp=0, @@ -1949,11 +1959,13 @@ def update_emission_params_bb_nophasing_uniqvalues( xtol=1e-4, ftol=1e-4, ) + if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) 
new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_taus[:, :] = res2.params[-1] @@ -1995,7 +2007,8 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) - # initialization + + # NB initialization new_p_binom = ( copy.copy(start_p_binom) if not start_p_binom is None From bc446230a8cac0368bab00368d1d56d4e9dc2c25 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 15:15:50 -0400 Subject: [PATCH 054/125] update logging --- src/calicost/utils_hmm.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index d16193a..3e4ecf5 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -475,6 +475,7 @@ def update_emission_params_nb_sitewise_uniqvalues( if not start_log_mu is None else np.zeros((n_states, n_spots)) ) + new_alphas = copy.copy(alphas) # expression signal by NB distribution @@ -550,6 +551,7 @@ def update_emission_params_nb_sitewise_uniqvalues( weights = [] features = [] state_posweights = [] + for s in range(n_spots): idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] this_exposure = np.tile(unique_values[s][idx_nonzero, 1], n_states) @@ -585,21 +587,31 @@ def update_emission_params_nb_sitewise_uniqvalues( this_features[idx_row_posweight, :][:, idx_state_posweight] ) state_posweights.append(idx_state_posweight) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) + model = Weighted_NegativeBinomial( y, features, weights=weights, exposure=exposure ) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + + logger.info("Applying fit with default start params.") + + res = model.fit(disp=0, maxiter=1_500, xtol=1.e-4, ftol=1.e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_alphas[:, :] = res.params[-1] + if not (start_log_mu is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -614,8 +626,6 @@ def update_emission_params_nb_sitewise_uniqvalues( ftol=1e-4, ) - logger.info(f"") - nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) @@ -626,8 +636,10 @@ def update_emission_params_nb_sitewise_uniqvalues( l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_alphas[:, :] = res2.params[-1] + new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr From 64c691df06469c9b7ea8460fbddd7c37db35daa9 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Fri, 16 Aug 2024 15:25:49 -0400 Subject: [PATCH 055/125] improve logging --- src/calicost/utils_hmm.py | 102 +++++++++++++++++++++++++++++++++----- 1 file changed, 89 insertions(+), 13 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 3e4ecf5..0406cfb 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -679,13 +679,16 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( logger.info("Computing emission params for Negative Binomial Mix (sitewise, unique) for {n_spots} spots and {n_states} states.") - # initialization new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None else np.zeros((n_states, n_spots)) ) + new_alphas = copy.copy(alphas) + + + # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) @@ -809,12 +812,13 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( ) state_posweights.append(idx_state_posweight) tp.append(this_tp[idx_row_posweight]) - # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) + model = Weighted_NegativeBinomial_mix( y, features, @@ -823,14 +827,22 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( tumor_prop=tp, penalty=0, ) + + logger.info("Applying fit with default start params.") + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_alphas[:, :] = res.params[-1] + if not (start_log_mu is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -844,13 +856,22 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) + new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_alphas[:, :] = res2.params[-1] + new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr @@ -890,13 +911,14 @@ def update_emission_params_bb_sitewise_uniqvalues( logger.info("Computing emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") - # initialization new_p_binom = ( copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 ) + new_taus = copy.copy(taus) + if fix_BB_dispersion: for s in np.arange(len(unique_values)): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A @@ -1044,19 +1066,29 @@ def update_emission_params_bb_sitewise_uniqvalues( this_features[idx_row_posweight, :][:, idx_state_posweight] ) state_posweights.append(idx_state_posweight) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) + 
model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + + logger.info("Applying fit with default start params.") + + res = model.fit(disp=0, maxiter=1_500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_taus[:, :] = res.params[-1] + if not (start_p_binom is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -1070,13 +1102,21 @@ def update_emission_params_bb_sitewise_uniqvalues( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_taus[:, :] = res2.params[-1] + new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob @@ -1117,13 +1157,14 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( logger.info("Computing emission params for Beta Binomial Mix (sitewise, unique) for {n_spots} spots and {n_states} states.") - # initialization new_p_binom = ( copy.copy(start_p_binom) if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 ) + new_taus = copy.copy(taus) + if fix_BB_dispersion: for s in np.arange(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A @@ -1303,17 +1344,26 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) + model = Weighted_BetaBinom_mix( y, features, weights=weights, exposure=exposure, tumor_prop=tp ) + + logger.info("Applying fit with default start params.") + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_taus[:, :] = res.params[-1] + if not (start_p_binom is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -1327,13 +1377,21 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_taus[:, :] = res2.params[-1] + new_p_binom[new_p_binom < 
min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob @@ -1428,18 +1486,20 @@ def update_emission_params_nb_nophasing_uniqvalues( Mean expression under diploid state. """ - logger.info("Computing emission params for Negative Binomial (no phasing, unique).") + logger.info("Computing emission params for Negative Binomial (no phasing, unique) with {n_spots} spots and {n_states} states.") n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) - # initialization + new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None else np.zeros((n_states, n_spots)) ) + new_alphas = copy.copy(alphas) + # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) @@ -1544,21 +1604,31 @@ def update_emission_params_nb_nophasing_uniqvalues( this_features[idx_row_posweight, :][:, idx_state_posweight] ) state_posweights.append(idx_state_posweight) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) + model = Weighted_NegativeBinomial( y, features, weights=weights, exposure=exposure ) + + logger.info("Applying fit with default start params.") + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_alphas[:, :] = res.params[-1] + if not (start_log_mu is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -1572,13 +1642,19 @@ def update_emission_params_nb_nophasing_uniqvalues( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_alphas[:, :] = res2.params[-1] + new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr @@ -1611,12 +1687,12 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ - logger.info("Computing emission params for Negative Binomial Mix (no phasing, unique).") + logger.info("Computing emission params for Negative Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) - # initialization + new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None From dddc8c9fd2f49d0af8d0b1dfba1ed96319737dfc Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Fri, 16 Aug 2024 15:31:43 -0400 Subject: [PATCH 056/125] improved logging --- src/calicost/utils_hmm.py | 72 ++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 13 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 0406cfb..e2e4f8e 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -686,9 +686,7 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( ) new_alphas = copy.copy(alphas) - - - + # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) @@ -1019,6 +1017,7 @@ def update_emission_params_bb_sitewise_uniqvalues( weights = [] features = [] state_posweights = [] + for s in np.arange(len(unique_values)): idx_nonzero = np.where(unique_values[s][:, 1] > 0)[0] this_exposure = np.tile( @@ -1698,7 +1697,9 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( if not start_log_mu is None else np.zeros((n_states, n_spots)) ) + new_alphas = copy.copy(alphas) + # expression signal by NB distribution if fix_NB_dispersion: new_log_mu = np.zeros((n_states, n_spots)) @@ -1819,12 +1820,13 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( ) state_posweights.append(idx_state_posweight) tp.append(this_tp[idx_row_posweight]) - # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) + model = Weighted_NegativeBinomial_mix( y, features, @@ -1833,14 +1835,22 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( tumor_prop=tp, penalty=0, ) + + logger.info("Applying fit with default start params.") + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_alphas[:, :] = res.params[-1] + if not (start_log_mu is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -1854,13 +1864,21 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_log_mu[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_alphas[:, :] = res2.params[-1] + new_log_mu[new_log_mu > max_log_rdr] = max_log_rdr new_log_mu[new_log_mu < min_log_rdr] = min_log_rdr @@ -1893,7 +1911,7 @@ def update_emission_params_bb_nophasing_uniqvalues( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. 
""" - logger.info("Computing emission params for Beta Binomial (no phasing, unique).") + logger.info("Computing emission params for Beta Binomial (no phasing, unique) with {n_spots} spots and {n_states} states.") n_spots = len(unique_values) n_states = log_gamma.shape[0] @@ -1905,6 +1923,7 @@ def update_emission_params_bb_nophasing_uniqvalues( if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 ) + new_taus = copy.copy(taus) if fix_BB_dispersion: @@ -2020,9 +2039,10 @@ def update_emission_params_bb_nophasing_uniqvalues( weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) - - model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) + + logger.info("Applying fit with default start params.") + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) for s, idx_state_posweight in enumerate(state_posweights): @@ -2032,8 +2052,10 @@ def update_emission_params_bb_nophasing_uniqvalues( if res.params[-1] > 0: new_taus[:, :] = res.params[-1] - + if not (start_p_binom is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -2047,8 +2069,13 @@ def update_emission_params_bb_nophasing_uniqvalues( xtol=1e-4, ftol=1e-4, ) + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) @@ -2090,7 +2117,7 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. 
""" - logger.info("Computing emission params for Beta Binomial Mix (no phasing, unique).") + logger.info("Computing emission params for Beta Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") n_spots = len(unique_values) n_states = log_gamma.shape[0] @@ -2102,7 +2129,9 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( if not start_p_binom is None else np.ones((n_states, n_spots)) * 0.5 ) + new_taus = copy.copy(taus) + if fix_BB_dispersion: for s in np.arange(n_spots): tmp = (scipy.sparse.csr_matrix(gamma) @ mapping_matrices[s]).A @@ -2235,23 +2264,32 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( ) state_posweights.append(idx_state_posweight) tp.append(this_tp[idx_row_posweight]) - # tp.append( tumor_prop[s] * np.ones(len(idx_row_posweight)) ) + exposure = np.concatenate(exposure) y = np.concatenate(y) weights = np.concatenate(weights) features = scipy.linalg.block_diag(*features) tp = np.concatenate(tp) + model = Weighted_BetaBinom_mix( y, features, weights=weights, exposure=exposure, tumor_prop=tp ) + + logger.info("Applying fit with default start params.") + res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) + for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res.params[l1:l2] + if res.params[-1] > 0: new_taus[:, :] = res.params[-1] + if not (start_p_binom is None): + logger.info("Applying fit with custom start params.") + res2 = model.fit( disp=0, maxiter=1500, @@ -2265,13 +2303,21 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( xtol=1e-4, ftol=1e-4, ) - if model.nloglikeobs(res2.params) < model.nloglikeobs(res.params): + + nloglikeobs2 = model.nloglikeobs(res2.params) + nloglikeobs = model.nloglikeobs(res.params) + + logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + + if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): l1 = int(np.sum([len(x) for x in state_posweights[:s]])) l2 = int(np.sum([len(x) for x in state_posweights[: (s + 1)]])) new_p_binom[idx_state_posweight, s] = res2.params[l1:l2] + if res2.params[-1] > 0: new_taus[:, :] = res2.params[-1] + new_p_binom[new_p_binom < min_binom_prob] = min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob From e807de09574738b7b1bf53ac710b43d8bb52f6bd Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 15:38:31 -0400 Subject: [PATCH 057/125] fix logging --- src/calicost/hmrf.py | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index baf00f6..7e11b3e 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -801,34 +801,19 @@ def hmrf_pipeline( if "mp" in params: logger.info( - "Outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( - r, - res["total_llf"], - np.mean(np.abs(last_log_mu - res["new_log_mu"])), - np.mean(np.abs(last_p_binom - res["new_p_binom"])), - ) + f"Outer iteration {r}: total_llf = {res['total_llf']}, mean abs. diff. 
(mu, p) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}, {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) elif "m" in params: logger.info( - "Outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( - r, - res["total_llf"], - np.mean(np.abs(last_log_mu - res["new_log_mu"])), - ) + f"Outer iteration {r}: total_llf = {res['total_llf']}, mean abs. diff. (mu) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}" ) elif "p" in params: logger.info( - "Outer iteration {}: total_llf = {}, BetaBinom parameters mean abs. diff. = {}".format( - r, - res["total_llf"], - np.mean(np.abs(last_p_binom - res["new_p_binom"])), - ) + f"Outer iteration {r}: total_llf = {res['total_llf']}, BetaBinom parameters mean abs. diff. = {np.mean(np.abs(last_p_binom - res["new_p_binom"]))}" ) logger.info( - "Outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( - r, adjusted_rand_score(last_assignment, res["new_assignment"]) - ) + f"Outer iteration {r}: ARI between assignment = {adjusted_rand_score(last_assignment, res['new_assignment'])} (unity is a perfect assignment)" ) if ( From d1ae926da41e2aa3c7f3619f6f19d032c12fd646 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 15:41:08 -0400 Subject: [PATCH 058/125] edit logging --- src/calicost/hmrf.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 7e11b3e..78e8e42 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -2188,31 +2188,21 @@ def hmrfmix_concatenate_pipeline( if "mp" in params: logger.info( - "outer iteration {}: difference between parameters = {}, {}".format( - r, - np.mean(np.abs(last_log_mu - res["new_log_mu"])), - np.mean(np.abs(last_p_binom - res["new_p_binom"])), - ) + f"Outer iteration {r}: mean abs. diff. (mu, p) = {np.mean(np.abs(last_log_mu - res["new_log_mu"]))}, {np.mean(np.abs(last_p_binom - res["new_p_binom"]))}" ) elif "m" in params: logger.info( - "outer iteration {}: difference between NB parameters = {}".format( - r, np.mean(np.abs(last_log_mu - res["new_log_mu"])) - ) + f"Outer iteration {r}: mean abs. diff. between NB parameters = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}" ) elif "p" in params: logger.info( - "Outer iteration {}: BetaBinom parameters mean abs. diff. = {}".format( - r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) - ) + f"Outer iteration {r}: BetaBinom parameters mean abs. diff. = {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) logger.info( - "outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( - r, adjusted_rand_score(last_assignment, res["new_assignment"]) - ) + f"Outer iteration {r}: ARI between assignment = {adjusted_rand_score(last_assignment, res['new_assignment'])} (unity is a perfect assignment)" ) - # if np.all( last_assignment == res["new_assignment"] ): + if ( adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1 From 3501640613f77bf81c17dc4d71743837f93c78eb Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Fri, 16 Aug 2024 15:46:53 -0400 Subject: [PATCH 059/125] fix logging --- src/calicost/hmrf.py | 46 ++++++++++++-------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 78e8e42..8562e36 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1089,31 +1089,21 @@ def hmrf_concatenate_pipeline( if "mp" in params: logger.info( - "outer iteration {}: difference between parameters = {}, {}".format( - r, - np.mean(np.abs(last_log_mu - res["new_log_mu"])), - np.mean(np.abs(last_p_binom - res["new_p_binom"])), - ) + f"Outer iteration {r}: mean abs. diff. (mu, p) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}, {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) elif "m" in params: logger.info( - "outer iteration {}: difference between NB parameters = {}".format( - r, np.mean(np.abs(last_log_mu - res["new_log_mu"])) - ) + f"Outer iteration {r}: mean abs. diff. (mu) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}" ) elif "p" in params: logger.info( - "outer iteration {}: BetaBinom parameters mean abs. diff. = {}".format( - r, np.mean(np.abs(last_p_binom - res["new_p_binom"])) - ) + f"Outer iteration {r}: mean abs. diff. (p) = {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) logger.info( - "outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( - r, adjusted_rand_score(last_assignment, res["new_assignment"]) - ) + f"Outer iteration {r}: ARI between assignment = {adjusted_rand_score(last_assignment, res['new_assignment'])} (unity is a perfect assignment)" ) - # if np.all( last_assignment == res["new_assignment"] ): + if ( adjusted_rand_score(last_assignment, res["new_assignment"]) > 0.99 or len(np.unique(res["new_assignment"])) == 1 @@ -1616,33 +1606,19 @@ def hmrfmix_pipeline( # update last parameter if "mp" in params: logger.info( - "Outer iteration {}: total_llf = {}, difference between parameters = {}, {}".format( - r, - res["total_llf"], - np.mean(np.abs(last_log_mu - res["new_log_mu"])), - np.mean(np.abs(last_p_binom - res["new_p_binom"])), - ) + f"Outer iteration {r}: total_llf = {res['total_llf']}, mean abs. diff. (mu, p) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}, {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) elif "m" in params: logger.info( - "Outer iteration {}: total_llf = {}, difference between NB parameters = {}".format( - r, - res["total_llf"], - np.mean(np.abs(last_log_mu - res["new_log_mu"])), - ) + f"Outer iteration {r}: total_llf = {res['total_llf']}, mean abs. diff. (mu) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}" ) elif "p" in params: logger.info( - "Outer iteration {}: total_llf = {}, BetaBinom mean abs. diff. = {}".format( - r, - res["total_llf"], - np.mean(np.abs(last_p_binom - res["new_p_binom"])), - ) + f"Outer iteration {r}: total_llf = {res['total_llf']}, mean abs. diff. 
(p) = {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) + logger.info( - "Outer iteration {}: ARI between assignment = {} (unity is a perfect assignment)".format( - r, adjusted_rand_score(last_assignment, res["new_assignment"]) - ) + f"Outer iteration {r}: ARI between assignment = {adjusted_rand_score(last_assignment, res['new_assignment'])} (unity is a perfect assignment)" ) if ( @@ -1650,12 +1626,14 @@ def hmrfmix_pipeline( or len(np.unique(res["new_assignment"])) == 1 ): break + last_log_mu = res["new_log_mu"] last_p_binom = res["new_p_binom"] last_alphas = res["new_alphas"] last_taus = res["new_taus"] last_assignment = res["new_assignment"] log_persample_weights = np.ones((X.shape[2], n_samples)) * (-np.log(X.shape[2])) + for sidx in range(n_samples): index = np.where(sample_ids == sidx)[0] this_persample_weight = np.bincount( From cfad5411090a5f70d4f9e4f8fda49d68b856fbb9 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 16:00:26 -0400 Subject: [PATCH 060/125] fix logging --- src/calicost/hmrf.py | 48 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 8562e36..8b2de54 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -140,6 +140,8 @@ def hmrf_reassignment_posterior( "Computed hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." ) + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -240,6 +242,8 @@ def aggr_hmrf_reassignment( "Computed aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." ) + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -354,6 +358,8 @@ def hmrf_reassignment_posterior_concatenate( "Computed hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." ) + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -493,6 +499,8 @@ def aggr_hmrf_reassignment_concatenate( "Computed aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." 
) + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -789,7 +797,8 @@ def hmrf_pipeline( res = np.load(f"{outdir}/round{r}_nstates{n_states}_{params}.npz") logger.info(f"Regrouping to pseudobulk for iteration {r}.") - + logger.info(f"Found a new clone assignment for {n_spots} spots:\n{np.unique(new_assignment, return_counts=True)}") + clone_index = [ np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) @@ -1044,6 +1053,8 @@ def hmrf_concatenate_pipeline( else: raise ValueError("Unknown mode for nodepotential!") + logger.info(f"Found a new clone assignment for {n_spots} spots:\n{np.unique(new_assignment, return_counts=True)}") + # NB handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment @@ -1233,6 +1244,9 @@ def aggr_hmrfmix_reassignment( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -1363,7 +1377,8 @@ def hmrfmix_reassignment_posterior( ) logger.info(f"Computed hmrfmix_reassignment_posterior.") - + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -1589,7 +1604,8 @@ def hmrfmix_pipeline( allres["num_iterations"] = r + 1 np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) - # regroup to pseudobulk + logger.info(f"Found a new clone assignment for {n_spots} spots:\n{np.unique(new_assignment, return_counts=True)}") + clone_index = [ np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) @@ -1781,7 +1797,9 @@ def hmrfmix_reassignment_posterior_concatenate( ) logger.info(f"Computed hmrfmix_reassignment_posterior_concatenate.") - + + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -1810,11 +1828,11 @@ def aggr_hmrfmix_reassignment_concatenate( n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) - # + posterior = np.zeros((N, n_clones)) - # + for i in trange(N): idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] @@ -1887,6 +1905,9 @@ def aggr_hmrfmix_reassignment_concatenate( new_assignment[adjacency_mat[i, :].nonzero()[1]] == new_assignment[i] ) ) + + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -2118,6 +2139,8 @@ def hmrfmix_concatenate_pipeline( else: raise ValueError("Unknown mode for nodepotential!") + logger.info(f"Found a new clone assignment for {n_spots} spots:\n{np.unique(new_assignment, return_counts=True)}") + # NB handle the case when one clone has zero spots if len(np.unique(new_assignment)) < X.shape[2]: res["assignment_before_reindex"] = new_assignment @@ -2129,11 +2152,11 @@ def hmrfmix_concatenate_pipeline( ) res["log_gamma"] = 
res["log_gamma"][:, concat_idx] res["pred_cnv"] = res["pred_cnv"][concat_idx] - # add to results + res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf - # append to allres + for k, v in res.items(): if k == "prev_assignment": allres[f"round{r-1}_assignment"] = v @@ -2141,6 +2164,7 @@ def hmrfmix_concatenate_pipeline( allres[f"round{r}_assignment"] = v else: allres[f"round{r}_{k}"] = v + allres["num_iterations"] = r + 1 logger.info( @@ -2155,6 +2179,7 @@ def hmrfmix_concatenate_pipeline( np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) ] + X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( single_X, single_base_nb_mean, @@ -2248,6 +2273,9 @@ def clonelabel_posterior_withinteger( spatial_weight : float """ + + logger.info("Computing clonelabel_posterior_withinteger.") + N = single_X.shape[2] n_obs = single_X.shape[0] # clone IDs @@ -2403,4 +2431,6 @@ def clonelabel_posterior_withinteger( - scipy.special.logsumexp(w_node + spatial_weight * w_edge) ) + logger.info("Computed clonelabel_posterior_withinteger.") + return df_posterior From a321ff611b43e6c86c06c142ae42ac91d5c07020 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 19:47:44 -0400 Subject: [PATCH 061/125] fix bugs --- src/calicost/hmrf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 8b2de54..6ff29d6 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -818,7 +818,7 @@ def hmrf_pipeline( ) elif "p" in params: logger.info( - f"Outer iteration {r}: total_llf = {res['total_llf']}, BetaBinom parameters mean abs. diff. = {np.mean(np.abs(last_p_binom - res["new_p_binom"]))}" + f"Outer iteration {r}: total_llf = {res['total_llf']}, BetaBinom parameters mean abs. diff. = {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) logger.info( @@ -2191,7 +2191,7 @@ def hmrfmix_concatenate_pipeline( if "mp" in params: logger.info( - f"Outer iteration {r}: mean abs. diff. (mu, p) = {np.mean(np.abs(last_log_mu - res["new_log_mu"]))}, {np.mean(np.abs(last_p_binom - res["new_p_binom"]))}" + f"Outer iteration {r}: mean abs. diff. (mu, p) = {np.mean(np.abs(last_log_mu - res['new_log_mu']))}, {np.mean(np.abs(last_p_binom - res['new_p_binom']))}" ) elif "m" in params: logger.info( From d4db18d1a69f864b94419537fc1a76dd4fdf2e14 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 20:03:48 -0400 Subject: [PATCH 062/125] fix bugs --- src/calicost/hmm_NB_BB_phaseswitch.py | 104 ++++++++++++++------------ src/calicost/utils_hmm.py | 4 +- 2 files changed, 59 insertions(+), 49 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index bc9b2a6..aaae1d7 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -70,7 +70,9 @@ def compute_emission_probability_nb_betabinom( log_emission : array, shape (2*n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ - logger.info("Computing emission probability for negative binomial & beta binomial (sitewise).") + logger.info( + "Computing emission probability for negative binomial & beta binomial (sitewise)." 
+ ) n_obs = X.shape[0] n_comp = X.shape[1] @@ -113,7 +115,9 @@ def compute_emission_probability_nb_betabinom( ) ) - logger.info("Computed emission probability for negative binomial & beta binomial (sitewise).") + logger.info( + "Computed emission probability for negative binomial & beta binomial (sitewise)." + ) return log_emission_rdr, log_emission_baf @@ -158,7 +162,9 @@ def compute_emission_probability_nb_betabinom_mix( log_emission : array, shape (2*n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ - logger.info("Computing emission probability for *mixed* negative binomial & beta binomial (sitewise).") + logger.info( + "Computing emission probability for *mixed* negative binomial & beta binomial (sitewise)." + ) n_obs = X.shape[0] n_comp = X.shape[1] @@ -211,7 +217,9 @@ def compute_emission_probability_nb_betabinom_mix( mix_p_A * taus[i, s], ) - logger.info("Computed emission probability for *mixed* negative binomial & beta binomial (sitewise).") + logger.info( + "Computed emission probability for *mixed* negative binomial & beta binomial (sitewise)." + ) return log_emission_rdr, log_emission_baf @@ -381,7 +389,9 @@ def run_baum_welch_nb_bb( n_spots = X.shape[2] assert n_comp == 2 - logger.info("Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse (sitewise).") + logger.info( + "Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse (sitewise)." + ) log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T @@ -409,7 +419,7 @@ def run_baum_welch_nb_bb( else: log_transmat = np.zeros((1, 1)) - # NB a trick to speed up BetaBinom optimization: taking only unique values of + # NB a trick to speed up BetaBinom optimization: taking only unique values of # (B allele count, total SNP covering read count) unique_values_nb, mapping_matrices_nb = construct_unique_matrix( X[:, 0, :], base_nb_mean @@ -419,7 +429,9 @@ def run_baum_welch_nb_bb( ) for r in trange(max_iter, desc="EM algorithm (sitewise)"): - logger.info(f"Calculating E-step (sitewise) for iteration {r} of {max_iter}.") + logger.info( + f"Calculating E-step (sitewise) for iteration {r} of {max_iter}." + ) if tumor_prop is None: log_emission_rdr, log_emission_baf = ( @@ -465,7 +477,9 @@ def run_baum_welch_nb_bb( log_alpha, log_beta, log_transmat, log_emission ) - logger.info(f"Calculating M-step (sitewise) for iteration {r} of {max_iter}.") + logger.info( + f"Calculating M-step (sitewise) for iteration {r} of {max_iter}." 
+ ) if "s" in self.params: new_log_startprob = update_startprob_sitewise(lengths, log_gamma) @@ -540,23 +554,20 @@ def run_baum_welch_nb_bb( else: new_p_binom = p_binom new_taus = taus - # check convergence + logger.info( - "EM convergence metrics (sitewise)", - np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob))), - np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))), - np.mean(np.abs(new_log_mu - log_mu)), - np.mean(np.abs(new_p_binom - p_binom)), + f"EM convergence metrics (sitewise): {np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob)))}, {np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat)))}, {np.mean(np.abs(new_log_mu - log_mu))}, {np.mean(np.abs(new_p_binom - p_binom))}" ) - - logger.info(np.hstack([new_log_mu, new_p_binom])) - + + # logger.info(np.hstack([new_log_mu, new_p_binom])) + if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol ): break + log_startprob = new_log_startprob log_transmat = new_log_transmat log_mu = new_log_mu @@ -566,6 +577,9 @@ def run_baum_welch_nb_bb( logger.info("Computed Baum-Welch (sitewise).") + logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") + logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") + return ( new_log_mu, new_alphas, @@ -860,15 +874,9 @@ def pipeline_baum_welch( if (init_p_binom is None) and ("p" in params): init_p_binom = tmp_p_binom - logger.info(f"Initial (mu, p):\n{np.hstack([init_log_mu, init_p_binom])}") + logger.info(f"Initial mu:\n{init_log_mu}") + logger.info(f"Initial p:\n{init_p_binom}") - # fit HMM-NB-BetaBinom - # new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat = hmmmodel.run_baum_welch_nb_bb(X, lengths, \ - # n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, tumor_prop, \ - # fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \ - # fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, \ - # is_diag=is_diag, init_log_mu=init_log_mu, init_p_binom=init_p_binom, init_alphas=init_alphas, init_taus=init_taus, \ - # max_iter=max_iter, tol=tol) hmmmodel = hmmclass(params=params, t=t) remain_kwargs = { k: v for k, v in kwargs.items() if k in ["lambd", "sample_length", "log_gamma"] @@ -1237,13 +1245,15 @@ def similarity_components_rdrbaf_neymanpearson( n_states = res["new_p_binom"].shape[0] n_clones = X.shape[2] - logger.info("Computing similarity_components_rdrbaf_neymanpearson for (n_obs, n_states, n_clones) = ({n_obs}, {n_states}, {n_clones}).") - + logger.info( + "Computing similarity_components_rdrbaf_neymanpearson for (n_obs, n_states, n_clones) = ({n_obs}, {n_states}, {n_clones})." 
+ ) + G = nx.Graph() G.add_nodes_from(np.arange(n_clones)) - + lambd = np.sum(base_nb_mean, axis=1) / np.sum(base_nb_mean) - + if tumor_prop is None: log_emission_rdr, log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( @@ -1315,9 +1325,9 @@ def similarity_components_rdrbaf_neymanpearson( ) reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states - + all_test_statistics = [] - + for c1 in range(n_clones): for c2 in range(c1 + 1, n_clones): unique_pair_states = [ @@ -1331,7 +1341,7 @@ def similarity_components_rdrbaf_neymanpearson( (reshaped_pred_cnv[c1, :] == p[0]) & (reshaped_pred_cnv[c2, :] == p[1]) )[0] - + if "m" in params and "p" in params: t_neymanpearson = eval_neymanpearson_rdrbaf( log_emission_rdr[:, :, c1], @@ -1359,9 +1369,9 @@ def similarity_components_rdrbaf_neymanpearson( # TODO logger.info(f"{c1}, {c2}, {p}, {len(bidx)}, {t_neymanpearson}") - + all_test_statistics.append([c1, c2, p, t_neymanpearson]) - + if len(bidx) >= minlength: list_t_neymanpearson.append(t_neymanpearson) if ( @@ -1376,9 +1386,9 @@ def similarity_components_rdrbaf_neymanpearson( G.add_weighted_edges_from([(c1, c2, max_v)]) logger.info("Computing Maximal cliques.") - + cliques = [] - + for x in nx.find_cliques(G): this_len = len(x) this_weights = ( @@ -1386,31 +1396,31 @@ def similarity_components_rdrbaf_neymanpearson( / 2 ) cliques.append((x, this_len, this_weights)) - + cliques.sort(key=lambda x: (-x[1], x[2])) - + covered_nodes = set() merging_groups = [] - + for c in cliques: if len(set(c[0]) & covered_nodes) == 0: merging_groups.append(list(c[0])) covered_nodes = covered_nodes | set(c[0]) - + for c in range(n_clones): if not (c in covered_nodes): merging_groups.append([c]) covered_nodes.add(c) - + merging_groups.sort(key=lambda x: np.min(x)) - + # NB clone assignment after merging map_clone_id = {} - + for i, x in enumerate(merging_groups): for z in x: map_clone_id[z] = i - + new_assignment = np.array([map_clone_id[x] for x in res["new_assignment"]]) merged_res = copy.copy(res) merged_res["new_assignment"] = new_assignment @@ -1429,7 +1439,7 @@ def similarity_components_rdrbaf_neymanpearson( ) logger.info("Computed similarity_components_rdrbaf_neymanpearson.") - + return merging_groups, merged_res @@ -1449,11 +1459,11 @@ def combine_similar_states_across_clones( n_states = res["new_p_binom"].shape[0] reshaped_pred = np.argmax(res["log_gamma"], axis=0).reshape((X.shape[2], -1)) reshaped_pred_cnv = reshaped_pred % n_states - + all_test_statistics = compute_neymanpearson_stats( X, base_nb_mean, total_bb_RD, res, params, tumor_prop, hmmclass ) - + # NB make the pair of states consistent between clone c1 and clone c2 if their t_neymanpearson test statistics is small for c1 in range(n_clones): for c2 in range(c1 + 1, n_clones): diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index e2e4f8e..21ab295 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -907,7 +907,7 @@ def update_emission_params_bb_sitewise_uniqvalues( n_states = int(log_gamma.shape[0] / 2) gamma = np.exp(log_gamma) - logger.info("Computing emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") + logger.info(f"Computing emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") new_p_binom = ( copy.copy(start_p_binom) @@ -1119,7 +1119,7 @@ def update_emission_params_bb_sitewise_uniqvalues( new_p_binom[new_p_binom < min_binom_prob] = 
min_binom_prob new_p_binom[new_p_binom > max_binom_prob] = max_binom_prob - logger.info("Computed emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") + logger.info(f"Computed emission params for Beta Binomial (sitewise, unique) for {n_spots} spots and {n_states} states.") return new_p_binom, new_taus From 94c5a1575cfbd477452f41b720249b892bbee623 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Fri, 16 Aug 2024 20:32:02 -0400 Subject: [PATCH 063/125] fix --- src/calicost/hmm_NB_BB_phaseswitch.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index aaae1d7..2a718cd 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -70,15 +70,15 @@ def compute_emission_probability_nb_betabinom( log_emission : array, shape (2*n_states, n_obs, n_spots) Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ - logger.info( - "Computing emission probability for negative binomial & beta binomial (sitewise)." - ) - n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] n_states = log_mu.shape[0] - # initialize log_emission + + logger.info( + "Computing emission probability for negative binomial & beta binomial (sitewise) with n_spots and n_states = {n_spots} and {n_states}." + ) + log_emission_rdr = np.zeros((2 * n_states, n_obs, n_spots)) log_emission_baf = np.zeros((2 * n_states, n_obs, n_spots)) for i in np.arange(n_states): From 776eeb0027975d6387d6746ec05535455d91648a Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 15:08:25 -0400 Subject: [PATCH 064/125] logging fixes --- src/calicost/utils_distribution_fitting.py | 2 +- src/calicost/utils_hmm.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index b343868..61d2835 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -261,7 +261,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) - logger.info(f"Starting Weighted_BetaBinom_mix optimization with @ {start_params}.") + logger.info(f"Starting Weighted_BetaBinom_mix optimization @ {start_params}.") start = time.time() diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 21ab295..427a777 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -629,7 +629,7 @@ def update_emission_params_nb_sitewise_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -858,7 +858,7 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, 
idx_state_posweight in enumerate(state_posweights): @@ -1105,7 +1105,7 @@ def update_emission_params_bb_sitewise_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -1380,7 +1380,7 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -1868,7 +1868,7 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -2073,7 +2073,7 @@ def update_emission_params_bb_nophasing_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -2307,7 +2307,7 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for params2 of {nloglikeobs2:.6e} to params1 {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): From 4d5466a3ff34f1f253f4b334c0a1250d6220a4da Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 15:20:27 -0400 Subject: [PATCH 065/125] fix --- src/calicost/hmm_NB_BB_phaseswitch.py | 2 +- src/calicost/utils_hmm.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 2a718cd..6e263f9 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -76,7 +76,7 @@ def compute_emission_probability_nb_betabinom( n_states = log_mu.shape[0] logger.info( - "Computing emission probability for negative binomial & beta binomial (sitewise) with n_spots and n_states = {n_spots} and {n_states}." + f"Computing emission probability for negative binomial & beta binomial (sitewise) with n_spots and n_states = {n_spots} and {n_states}." 
) log_emission_rdr = np.zeros((2 * n_states, n_obs, n_spots)) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 427a777..54fe441 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -1686,7 +1686,7 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ - logger.info("Computing emission params for Negative Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") + logger.info(f"Computing emission params for Negative Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") n_spots = len(unique_values) n_states = log_gamma.shape[0] @@ -2117,7 +2117,7 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ - logger.info("Computing emission params for Beta Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") + logger.info(f"Computing emission params for Beta Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") n_spots = len(unique_values) n_states = log_gamma.shape[0] From 11f7c93833b11796290370a826b7f12454d1feb1 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 15:32:08 -0400 Subject: [PATCH 066/125] add Baum-Welch log lines. --- src/calicost/hmm_NB_BB_phaseswitch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 6e263f9..700c7c0 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -428,7 +428,8 @@ def run_baum_welch_nb_bb( X[:, 1, :], total_bb_RD ) - for r in trange(max_iter, desc="EM algorithm (sitewise)"): + for r in range(max_iter): + logger.info("-" * 250) logger.info( f"Calculating E-step (sitewise) for iteration {r} of {max_iter}." ) @@ -579,6 +580,8 @@ def run_baum_welch_nb_bb( logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") + + logger.info("-" * 250) return ( new_log_mu, From 9d709017a72be464e7a97d20131c18cdc2be9c6f Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 15:44:52 -0400 Subject: [PATCH 067/125] fix --- src/calicost/utils_hmm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 54fe441..2b5c4c0 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -629,7 +629,7 @@ def update_emission_params_nb_sitewise_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Negative Binomial with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -858,7 +858,7 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Negative Binomial Mix with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -1105,7 +1105,7 @@ def update_emission_params_bb_sitewise_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -1380,7 +1380,7 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial Mix custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -1868,7 +1868,7 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Negative Binomial Mix custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -2073,7 +2073,7 @@ def update_emission_params_bb_nophasing_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -2307,7 +2307,7 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for custom start {nloglikeobs2:.6e} to default start 
{nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial Mix custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): From c91591d9955d0789fb6ec2f11a16487b1371e6d6 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 17:14:57 -0400 Subject: [PATCH 068/125] fix logging. --- src/calicost/utils_distribution_fitting.py | 43 +++++++++++++++++----- src/calicost/utils_hmm.py | 34 +---------------- 2 files changed, 35 insertions(+), 42 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 61d2835..1ca6d9a 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -82,10 +82,15 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params + start_params_str = "existing" + else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) + start_params_str = "default" + else: + start_params_str = "input" - logger.info(f"Starting Weighted_NegativeBinomial optimization @ {start_params}.") + logger.info(f"Starting Weighted_NegativeBinomial optimization @ ({start_params_str}) {start_params}.") start = time.time() @@ -136,10 +141,14 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params + start_params_str = "existing" else: start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - - logger.info(f"Starting Weighted_NegativeBinomial_mix optimization @ {start_params}.") + start_params_str = "default" + else: + start_params_str = "input" + + logger.info(f"Starting Weighted_NegativeBinomial_mix optimization @ ({start_params_str}) {start_params}.") start = time.time() @@ -201,12 +210,16 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params + start_params_str = "existing" else: start_params = np.append( 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) + start_params_str = "default" + else: + start_params_str = "input" - logger.info(f"Starting Weighted_BetaBinomial optimization @ {start_params}.") + logger.info(f"Starting Weighted_BetaBinomial optimization @ ({start_params_str}) {start_params}.") start = time.time() @@ -256,12 +269,16 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params + start_params_str = "existing" else: start_params = np.append( 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) + start_params_str = "default" + else: + start_params_str = "input" - logger.info(f"Starting Weighted_BetaBinom_mix optimization @ {start_params}.") + logger.info(f"Starting Weighted_BetaBinom_mix optimization @ ({start_params_str}) {start_params}.") start = time.time() @@ -304,10 +321,14 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params + start_params_str = "existing" else: start_params = 0.1 * np.ones(self.nparams) - - logger.info(f"Starting Weighted_BetaBinom_fixdispersion optimization @ {start_params}.") + start_params_str = "default" + else: + start_params_str = "input" + + 
logger.info(f"Starting Weighted_BetaBinom_fixdispersion optimization @ ({start_params_str}) {start_params}.") start = time.time() @@ -356,10 +377,14 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params + start_params_str = "existing" else: start_params = 0.1 * np.ones(self.nparams) - - logger.info(f"Starting Weighted_BetaBinom_fixdispersion_mix optimization @ {start_params}.") + start_params_str = "default" + else: + start_params_str = "input" + + logger.info(f"Starting Weighted_BetaBinom_fixdispersion_mix optimization @ ({start_params_str}) {start_params}.") start = time.time() diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 2b5c4c0..b9ccb84 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -596,8 +596,6 @@ def update_emission_params_nb_sitewise_uniqvalues( model = Weighted_NegativeBinomial( y, features, weights=weights, exposure=exposure ) - - logger.info("Applying fit with default start params.") res = model.fit(disp=0, maxiter=1_500, xtol=1.e-4, ftol=1.e-4) @@ -610,8 +608,6 @@ def update_emission_params_nb_sitewise_uniqvalues( new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, @@ -825,8 +821,6 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( tumor_prop=tp, penalty=0, ) - - logger.info("Applying fit with default start params.") res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) @@ -839,8 +833,6 @@ def update_emission_params_nb_sitewise_uniqvalues_mix( new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, @@ -1072,8 +1064,6 @@ def update_emission_params_bb_sitewise_uniqvalues( features = scipy.linalg.block_diag(*features) model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) - - logger.info("Applying fit with default start params.") res = model.fit(disp=0, maxiter=1_500, xtol=1e-4, ftol=1e-4) @@ -1086,8 +1076,6 @@ def update_emission_params_bb_sitewise_uniqvalues( new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, @@ -1347,8 +1335,6 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( model = Weighted_BetaBinom_mix( y, features, weights=weights, exposure=exposure, tumor_prop=tp ) - - logger.info("Applying fit with default start params.") res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) @@ -1360,9 +1346,7 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( if res.params[-1] > 0: new_taus[:, :] = res.params[-1] - if not (start_p_binom is None): - logger.info("Applying fit with custom start params.") - + if not (start_p_binom is None): res2 = model.fit( disp=0, maxiter=1500, @@ -1612,8 +1596,6 @@ def update_emission_params_nb_nophasing_uniqvalues( model = Weighted_NegativeBinomial( y, features, weights=weights, exposure=exposure ) - - logger.info("Applying fit with default start params.") res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) @@ -1626,8 +1608,6 @@ def update_emission_params_nb_nophasing_uniqvalues( new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, @@ -1835,8 +1815,6 @@ def 
update_emission_params_nb_nophasing_uniqvalues_mix( tumor_prop=tp, penalty=0, ) - - logger.info("Applying fit with default start params.") res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) @@ -1849,8 +1827,6 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( new_alphas[:, :] = res.params[-1] if not (start_log_mu is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, @@ -2041,8 +2017,6 @@ def update_emission_params_bb_nophasing_uniqvalues( model = Weighted_BetaBinom(y, features, weights=weights, exposure=exposure) - logger.info("Applying fit with default start params.") - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) for s, idx_state_posweight in enumerate(state_posweights): @@ -2054,8 +2028,6 @@ def update_emission_params_bb_nophasing_uniqvalues( new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, @@ -2275,8 +2247,6 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( y, features, weights=weights, exposure=exposure, tumor_prop=tp ) - logger.info("Applying fit with default start params.") - res = model.fit(disp=0, maxiter=1500, xtol=1e-4, ftol=1e-4) for s, idx_state_posweight in enumerate(state_posweights): @@ -2288,8 +2258,6 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( new_taus[:, :] = res.params[-1] if not (start_p_binom is None): - logger.info("Applying fit with custom start params.") - res2 = model.fit( disp=0, maxiter=1500, From 2003e04b856d60337e8948aae39974fdcc9f1bd8 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 17:19:24 -0400 Subject: [PATCH 069/125] log bandwidth --- src/calicost/utils_hmrf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calicost/utils_hmrf.py b/src/calicost/utils_hmrf.py index 13c6830..0619ceb 100644 --- a/src/calicost/utils_hmrf.py +++ b/src/calicost/utils_hmrf.py @@ -104,9 +104,11 @@ def choose_adjacency_by_readcounts( adjacency_mat.setdiag(1) adjacency_mat = adjacency_mat - smooth_mat adjacency_mat[adjacency_mat < 0] = 0 + if np.median(np.sum(adjacency_mat, axis=0).A.flatten()) >= 6: - print(f"bandwidth: {bandwidth}") + logger.info(f"Bandwidth={bandwidth}") break + return smooth_mat, adjacency_mat From 27603ecf64bb10e8cf246ef6f05974b95bf578ca Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 17:25:05 -0400 Subject: [PATCH 070/125] fix logging --- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmrf.py | 2 +- src/calicost/parse_input.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index caa94d0..701d63f 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -362,7 +362,7 @@ def run_baum_welch_nb_bb( use_defaults = (init_log_mu is None) and (init_p_binom is None) and (init_alphas is None) and (init_taus is None) - logger.info("Initialized Baum Welch NB logmean shift, BetaBinom prob and dispersion params inverse (use_defaults = {use_defaults}).") + logger.info(f"Initialized Baum Welch NB logmean shift, BetaBinom prob and dispersion params inverse (use_defaults = {use_defaults}).") # NB initialize start probability and emission probability log_startprob = np.log(np.ones(n_states) / n_states) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 6ff29d6..fbeac27 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -2117,7 +2117,7 @@ def hmrfmix_concatenate_pipeline( ) elif nodepotential == "weighted_sum": logger.info( - "Assigning HMRF clone for iteration {r} with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate." + f"Assigning HMRF clone for iteration {r} with nodepotential=weighted_sum & hmrfmix_reassignment_posterior_concatenate." ) new_assignment, single_llf, total_llf = ( diff --git a/src/calicost/parse_input.py b/src/calicost/parse_input.py index 49221c5..f723e4a 100644 --- a/src/calicost/parse_input.py +++ b/src/calicost/parse_input.py @@ -264,7 +264,8 @@ def parse_visium(config): construct_adjacency_w=config["construct_adjacency_w"], ) n_pooled = np.median(np.sum(smooth_mat > 0, axis=0).A.flatten()) - print(f"Set up number of spots to pool in HMRF: {n_pooled}") + + logger.info(f"Set up number of spots to pool in HMRF: {n_pooled}") # If adjacency matrix is only constructed using gene expression similarity (e.g. scRNA-seq data) # Then, directly replace coords by the umap of gene expression, to avoid potential inconsistency in HMRF initialization From fafef1faa69e92e6bf35b4308ff793a593f051d2 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 18:43:25 -0400 Subject: [PATCH 071/125] fix --- src/calicost/hmm_NB_BB_nophasing.py | 2 +- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmm_NB_BB_phaseswitch.py | 1 - src/calicost/utils_hmm.py | 1 - src/calicost/utils_phase_switch.py | 1 - 5 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index b546989..f3e3251 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -363,7 +363,7 @@ def run_baum_welch_nb_bb( X[:, 1, :], total_bb_RD ) - for r in trange(max_iter, desc="EM algorithm"): + for r in range(max_iter): logger.info(f"Calculating E-step for iteration {r} of {max_iter}.") if tumor_prop is None: diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 701d63f..941008e 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -385,7 +385,7 @@ def run_baum_welch_nb_bb( X[:, 1, :], total_bb_RD ) - for r in trange(max_iter, desc="EM algorithm"): + for r in range(max_iter): logger.info(f"Calculating E-step (v2) for iteration {r} of {max_iter}.") if tumor_prop is None: diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 700c7c0..7492c47 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -6,7 +6,6 @@ from scipy.optimize import minimize from scipy.optimize import Bounds from sklearn.mixture import GaussianMixture -from tqdm import trange import statsmodels.api as sm from statsmodels.base.model import GenericLikelihoodModel import copy diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index b9ccb84..8eb96c0 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -3,7 +3,6 @@ import copy import scipy.special from numba import njit -from tqdm import trange from sklearn.mixture import GaussianMixture from calicost.utils_distribution_fitting import * diff --git a/src/calicost/utils_phase_switch.py b/src/calicost/utils_phase_switch.py index 2b30fa3..3b1007d 100644 --- a/src/calicost/utils_phase_switch.py +++ b/src/calicost/utils_phase_switch.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd from pathlib import Path -from tqdm import trange import scipy import scipy.special From ee1810e1965320f05b1fbfa976e372028c9cb1a8 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 18:46:31 -0400 Subject: [PATCH 072/125] fix logging --- src/calicost/hmm_NB_BB_nophasing.py | 2 +- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmm_NB_BB_phaseswitch.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index f3e3251..0a436b9 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -507,7 +507,7 @@ def run_baum_welch_nb_bb( p_binom = new_p_binom taus = new_taus - logger.info("Computed Baum-Welch (v2).") + logger.info("Computed Baum-Welch (v2) in {r+1} iterations.") return ( new_log_mu, diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 941008e..a360736 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -590,7 +590,7 @@ def run_baum_welch_nb_bb( p_binom = new_p_binom taus = new_taus - logger.info("Computed Baum-Welch (v2).") + logger.info("Computed Baum-Welch (v2) in {r+1} iterations.") logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 7492c47..b0b741c 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -575,7 +575,7 @@ def run_baum_welch_nb_bb( p_binom = new_p_binom taus = new_taus - logger.info("Computed Baum-Welch (sitewise).") + logger.info("Computed Baum-Welch (sitewise) in {r+1} iterations.") logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") @@ -1487,7 +1487,7 @@ def combine_similar_states_across_clones( ] = res["pred_cnv"][(c_keep * n_obs) : (c_keep * n_obs + n_obs)][ bidx ] - print( + logger.info( f"Merging states {[p1,p2]} in clone {c1} and clone {c2}. NP statistics = {t_neymanpearson}" ) return res From 03db264a41d1cae2b09df67c921c6c25270ff80c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 18:51:04 -0400 Subject: [PATCH 073/125] fix logging bug --- src/calicost/utils_hmm.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index 8eb96c0..e953f90 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -1467,13 +1467,12 @@ def update_emission_params_nb_nophasing_uniqvalues( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. """ - - logger.info("Computing emission params for Negative Binomial (no phasing, unique) with {n_spots} spots and {n_states} states.") - n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) + logger.info("Computing emission params for Negative Binomial (no phasing, unique) with {n_spots} spots and {n_states} states.") + new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None @@ -1665,12 +1664,12 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( base_nb_mean : array, shape (n_observations, n_spots) Mean expression under diploid state. 
""" - logger.info(f"Computing emission params for Negative Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") - n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) + logger.info(f"Computing emission params for Negative Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") + new_log_mu = ( copy.copy(start_log_mu) if not start_log_mu is None @@ -1886,11 +1885,11 @@ def update_emission_params_bb_nophasing_uniqvalues( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ - logger.info("Computing emission params for Beta Binomial (no phasing, unique) with {n_spots} spots and {n_states} states.") - n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) + + logger.info("Computing emission params for Beta Binomial (no phasing, unique) with {n_spots} spots and {n_states} states.") # NB initialization new_p_binom = ( @@ -2088,11 +2087,11 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( total_bb_RD : array, shape (n_observations, n_spots) SNP-covering reads for both REF and ALT across genes along genome. """ - logger.info(f"Computing emission params for Beta Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") - n_spots = len(unique_values) n_states = log_gamma.shape[0] gamma = np.exp(log_gamma) + + logger.info(f"Computing emission params for Beta Binomial Mix (no phasing, unique) with {n_spots} spots and {n_states} states.") # NB initialization new_p_binom = ( From cef4eb2e65163c5038ab5de20081fdc9a7f48f76 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 18:59:07 -0400 Subject: [PATCH 074/125] add TODO for gammas in phasing beta binomial --- src/calicost/hmm_NB_BB_phaseswitch.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index b0b741c..b4474b1 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -80,6 +80,7 @@ def compute_emission_probability_nb_betabinom( log_emission_rdr = np.zeros((2 * n_states, n_obs, n_spots)) log_emission_baf = np.zeros((2 * n_states, n_obs, n_spots)) + for i in np.arange(n_states): for s in np.arange(n_spots): # expression from NB distribution @@ -94,22 +95,28 @@ def compute_emission_probability_nb_betabinom( log_emission_rdr[i + n_states, idx_nonzero_rdr, s] = ( log_emission_rdr[i, idx_nonzero_rdr, s] ) + # AF from BetaBinom distribution idx_nonzero_baf = np.where(total_bb_RD[:, s] > 0)[0] + if len(idx_nonzero_baf) > 0: log_emission_baf[i, idx_nonzero_baf, s] = ( scipy.stats.betabinom.logpmf( X[idx_nonzero_baf, 1, s], total_bb_RD[idx_nonzero_baf, s], p_binom[i, s] * taus[i, s], - (1 - p_binom[i, s]) * taus[i, s], + (1. - p_binom[i, s]) * taus[i, s], ) ) + + # TODO + # log_emission_baf[i, idx_nonzero_baf, s] - scipy.special.loggamma(X[idx_nonzero_baf, 1, s] + p_binom[i, s] * taus[i, s]) - scipy.special.loggamma(total_bb_RD[idx_nonzero_baf, s] - X[idx_nonzero_baf, 1, s] + (1. - p_binom[i, s]) * taus[i, s]) + scipy.special.loggamma(X[idx_nonzero_baf, 1, s] + (1. 
- p_binom[i, s]) * taus[i, s]) + scipy.special.loggamma(total_bb_RD[idx_nonzero_baf, s] - X[idx_nonzero_baf, 1, s] + p_binom[i, s] * taus[i, s]) + log_emission_baf[i + n_states, idx_nonzero_baf, s] = ( scipy.stats.betabinom.logpmf( X[idx_nonzero_baf, 1, s], total_bb_RD[idx_nonzero_baf, s], - (1 - p_binom[i, s]) * taus[i, s], + (1. - p_binom[i, s]) * taus[i, s], p_binom[i, s] * taus[i, s], ) ) From fbbb9b5bb4dfe9bcd969cb51f31b7d056d611652 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 19:01:31 -0400 Subject: [PATCH 075/125] fix --- src/calicost/hmm_NB_BB_phaseswitch.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index b4474b1..0d08274 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -75,7 +75,7 @@ def compute_emission_probability_nb_betabinom( n_states = log_mu.shape[0] logger.info( - f"Computing emission probability for negative binomial & beta binomial (sitewise) with n_spots and n_states = {n_spots} and {n_states}." + f"Computing emission probability for negative binomial & beta binomial (sitewise, phaseswitch) with n_spots and n_states = {n_spots} and {n_states}." ) log_emission_rdr = np.zeros((2 * n_states, n_obs, n_spots)) @@ -110,7 +110,8 @@ def compute_emission_probability_nb_betabinom( ) # TODO - # log_emission_baf[i, idx_nonzero_baf, s] - scipy.special.loggamma(X[idx_nonzero_baf, 1, s] + p_binom[i, s] * taus[i, s]) - scipy.special.loggamma(total_bb_RD[idx_nonzero_baf, s] - X[idx_nonzero_baf, 1, s] + (1. - p_binom[i, s]) * taus[i, s]) + scipy.special.loggamma(X[idx_nonzero_baf, 1, s] + (1. - p_binom[i, s]) * taus[i, s]) + scipy.special.loggamma(total_bb_RD[idx_nonzero_baf, s] - X[idx_nonzero_baf, 1, s] + p_binom[i, s] * taus[i, s]) + # log_emission_baf[i, idx_nonzero_baf, s] - scipy.special.loggamma(X[idx_nonzero_baf, 1, s] + p_binom[i, s] * taus[i, s]) - scipy.special.loggamma(total_bb_RD[idx_nonzero_baf, s] - X[idx_nonzero_baf, 1, s] + (1. - p_binom[i, s]) * taus[i, s]) + # + scipy.special.loggamma(X[idx_nonzero_baf, 1, s] + (1. - p_binom[i, s]) * taus[i, s]) + scipy.special.loggamma(total_bb_RD[idx_nonzero_baf, s] - X[idx_nonzero_baf, 1, s] + p_binom[i, s] * taus[i, s]) log_emission_baf[i + n_states, idx_nonzero_baf, s] = ( scipy.stats.betabinom.logpmf( @@ -122,7 +123,7 @@ def compute_emission_probability_nb_betabinom( ) logger.info( - "Computed emission probability for negative binomial & beta binomial (sitewise)." + "Computed emission probability for negative binomial & beta binomial (sitewise, phaseswitch)." ) return log_emission_rdr, log_emission_baf @@ -169,7 +170,7 @@ def compute_emission_probability_nb_betabinom_mix( Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. """ logger.info( - "Computing emission probability for *mixed* negative binomial & beta binomial (sitewise)." + "Computing emission probability for *mixed* negative binomial & beta binomial (sitewise, phaseswitch)." ) n_obs = X.shape[0] @@ -224,7 +225,7 @@ def compute_emission_probability_nb_betabinom_mix( ) logger.info( - "Computed emission probability for *mixed* negative binomial & beta binomial (sitewise)." + "Computed emission probability for *mixed* negative binomial & beta binomial (sitewise, phaseswitch)." ) return log_emission_rdr, log_emission_baf From 3409014320d83f97e2388552418cb84e40d2c09b Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 19:04:33 -0400 Subject: [PATCH 076/125] fix --- src/calicost/calicost_main.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 12be38d..0eb2738 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -137,6 +137,8 @@ def main(configuration_file): for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c + logger.info(f"Writing initial assignment to {file_path}") + np.savez( str(file_path), **{"num_iterations": 0, "round-1_assignment": initial_assignment}, @@ -464,7 +466,7 @@ def main(configuration_file): single_base_nb_mean = copy_single_base_nb_mean n_obs = single_X.shape[0] - logger.info(f"Writing {outdir}/binned_data.npz") + logger.info(f"Writing lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, single_tumor_prop to {outdir}/binned_data.npz") np.savez( f"{outdir}/binned_data.npz", @@ -519,6 +521,8 @@ def main(configuration_file): for c, idx in enumerate(initial_clone_index): initial_assignment[idx] = c + logger.info(f"Writing initial assignment to {file_path}") + np.savez( str(file_path), **{ @@ -987,6 +991,7 @@ def main(configuration_file): return_posterior=True, ) ) + res_combine["total_llf"] = total_llf res_combine["new_assignment"] = new_assignment @@ -996,7 +1001,7 @@ def main(configuration_file): ) logger.info( - f"Writing {outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz" + f"Writing likelihood and new clone assignment to {outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz" ) np.savez( @@ -1004,7 +1009,7 @@ def main(configuration_file): **res_combine, ) - logger.info(f"Writing {outdir}/posterior_clone_probability.npy") + logger.info(f"Writing posterior to {outdir}/posterior_clone_probability.npy") np.save(f"{outdir}/posterior_clone_probability.npy", posterior) From f3c06bc10ad4a60361d0f43265737d966a543f44 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 19:08:13 -0400 Subject: [PATCH 077/125] fix --- src/calicost/hmm_NB_BB_phaseswitch.py | 6 ++++-- src/calicost/hmrf.py | 14 ++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 0d08274..0aceb3f 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -1008,9 +1008,11 @@ def pipeline_baum_welch( pred = np.argmax(log_gamma, axis=0) pred_cnv = pred % n_states - # save results if not output_prefix is None: - tmp = np.log10(1 - t) + tmp = np.log10(1. 
- t) + + logger.info(f"Writing new parameters to {output_prefix}_nstates{n_states}_{params}_{tmp:.0f}_seed{random_state}.npz") + np.savez( f"{output_prefix}_nstates{n_states}_{params}_{tmp:.0f}_seed{random_state}.npz", new_log_mu=new_log_mu, diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index fbeac27..02a0147 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -785,7 +785,7 @@ def hmrf_pipeline( res["total_llf"] = total_llf logger.info( - f"Writing HMM iteration {r} to {outdir}/round{r}_nstates{n_states}_{params}.npz" + f"Writing likelihood, previous and new assignment to HMM iteration {r} to {outdir}/round{r}_nstates{n_states}_{params}.npz" ) np.savez(f"{outdir}/round{r}_nstates{n_states}_{params}.npz", **res) @@ -1071,7 +1071,6 @@ def hmrf_concatenate_pipeline( res["new_assignment"] = new_assignment res["total_llf"] = total_llf - # append to allres for k, v in res.items(): if k == "prev_assignment": allres[f"round{r-1}_assignment"] = v @@ -1083,7 +1082,7 @@ def hmrf_concatenate_pipeline( allres["num_iterations"] = r + 1 logger.info( - f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" + f"Writing assignments for HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" ) np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) @@ -1094,6 +1093,7 @@ def hmrf_concatenate_pipeline( np.where(res["new_assignment"] == c)[0] for c in np.sort(np.unique(res["new_assignment"])) ] + X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, clone_index ) @@ -1588,12 +1588,11 @@ def hmrfmix_pipeline( remaining_clones = np.sort(np.unique(new_assignment)) re_indexing = {c: i for i, c in enumerate(remaining_clones)} new_assignment = np.array([re_indexing[x] for x in new_assignment]) - # + res["prev_assignment"] = last_assignment res["new_assignment"] = new_assignment res["total_llf"] = total_llf - # append to allres for k, v in res.items(): if k == "prev_assignment": allres[f"round{r-1}_assignment"] = v @@ -1602,6 +1601,9 @@ def hmrfmix_pipeline( else: allres[f"round{r}_{k}"] = v allres["num_iterations"] = r + 1 + + logger.info("Writing assignments to {outdir}/{prefix}_nstates{n_states}_{params}.npz") + np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) logger.info(f"Found a new clone assignment for {n_spots} spots:\n{np.unique(new_assignment, return_counts=True)}") @@ -2168,7 +2170,7 @@ def hmrfmix_concatenate_pipeline( allres["num_iterations"] = r + 1 logger.info( - f"Writing HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" + f"Writing assignments for HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" ) np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) From 48a34a4193877313f881e8f07acd18c29ae5d185 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 19:15:24 -0400 Subject: [PATCH 078/125] fix --- src/calicost/hmrf.py | 45 ++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 02a0147..d0e5c94 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -64,7 +64,7 @@ def hmrf_reassignment_posterior( f"Computing hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass} for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." 
) - for i in trange(N, desc="hmrf_reassignment_posterior"): + for i in range(N): idx = smooth_mat[i, :].nonzero()[1] for c in range(n_clones): tmp_log_emission_rdr, tmp_log_emission_baf = ( @@ -137,7 +137,7 @@ def hmrf_reassignment_posterior( ) logger.info( - "Computed hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." + f"Computed hmrf_reassignment_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." ) logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") @@ -181,7 +181,7 @@ def aggr_hmrf_reassignment( "Computing aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." ) - for i in trange(N, desc="aggr_hmrf_reassignment"): + for i in range(N): idx = smooth_mat[i, :].nonzero()[1] for c in range(n_clones): @@ -239,7 +239,7 @@ def aggr_hmrf_reassignment( ) logger.info( - "Computed aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." + f"Computed aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." ) logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") @@ -280,7 +280,7 @@ def hmrf_reassignment_posterior_concatenate( "Computing hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." ) - for i in trange(N, desc="hmrf_reassignment_posterior_concatenate"): + for i in range(N): idx = smooth_mat[i, :].nonzero()[1] tmp_log_emission_rdr, tmp_log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( @@ -355,7 +355,7 @@ def hmrf_reassignment_posterior_concatenate( ) logger.info( - "Computed hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + f"Computed hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." ) logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") @@ -438,9 +438,9 @@ def aggr_hmrf_reassignment_concatenate( posterior = np.zeros((N, n_clones)) - for i in trange(N, desc="aggr_hmrf_reassignment_concatenate"): + for i in range(N): idx = smooth_mat[i, :].nonzero()[1] - # idx = np.append(idx, np.array([i])) + tmp_log_emission_rdr, tmp_log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( np.sum(single_X[:, :, idx], axis=2, keepdims=True), @@ -496,7 +496,7 @@ def aggr_hmrf_reassignment_concatenate( ) logger.info( - "Computed aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + f"Computed aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." ) logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") @@ -1160,18 +1160,22 @@ def aggr_hmrfmix_reassignment( hmmclass=hmm_sitewise, return_posterior=False, ): + logger.info( + f"Computing aggr_hmrfmix_reassignment with compute_emission_probability_nb_betabinom of {hmmclass}." 
+ ) + N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) - # + lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) - # + posterior = np.zeros((N, n_clones)) - # - for i in trange(N): + + for i in range(N): idx = smooth_mat[i, :].nonzero()[1] idx = idx[~np.isnan(single_tumor_prop[idx])] for c in range(n_clones): @@ -1246,7 +1250,11 @@ def aggr_hmrfmix_reassignment( ) logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") - + + logger.info( + f"Computed aggr_hmrfmix_reassignment with compute_emission_probability_nb_betabinom of {hmmclass}." + ) + if return_posterior: return new_assignment, single_llf, total_llf, posterior else: @@ -1424,6 +1432,11 @@ def hmrfmix_pipeline( ): n_obs, _, n_spots = single_X.shape n_clones = len(initial_clone_index) + + logger.info( + f"Computing hmrfmix_pipeline for (N, n_obs, n_clones) = ({n_spots}, {n_obs}, {n_clones})." + ) + # spot adjacency matric assert not (coords is None and adjacency_mat is None) if adjacency_mat is None: @@ -1664,6 +1677,10 @@ def hmrfmix_pipeline( :, sidx ] - scipy.special.logsumexp(log_persample_weights[:, sidx]) + logger.info( + f"Computed hmrfmix_pipeline for (N, n_obs, n_clones) = ({n_spots}, {n_obs}, {n_clones})." + ) + def hmrfmix_reassignment_posterior_concatenate( single_X, From c58ea896f92607b8840a6063e6c3c4f94ed2cc3d Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 19:17:18 -0400 Subject: [PATCH 079/125] add hmrf logging --- src/calicost/hmrf.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index d0e5c94..5963aa2 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1845,6 +1845,11 @@ def aggr_hmrfmix_reassignment_concatenate( n_obs = single_X.shape[0] n_clones = int(len(pred) / n_obs) n_states = res["new_p_binom"].shape[0] + + logger.info( + f"Computing aggr_hmrfmix_reassignment_concatenate for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." + ) + single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) @@ -1925,6 +1930,10 @@ def aggr_hmrfmix_reassignment_concatenate( ) ) + logger.info( + f"Computed aggr_hmrfmix_reassignment_concatenate for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." + ) + logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") if return_posterior: From c7d09d9f9820b51e37ccc0475934b795e73744c7 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 19:49:28 -0400 Subject: [PATCH 080/125] add cpas --- src/calicost/calicost_main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 0eb2738..4975141 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -333,7 +333,7 @@ def main(configuration_file): ) logger.info( - "Preparing refinement of initial, merged clones using BAF & RDR ****" + "**** Preparing refinement of initial, merged clones using BAF & RDR ****" ) if not config["bafonly"]: @@ -481,7 +481,7 @@ def main(configuration_file): ) logger.info( - f"**** Refining initial, merged clones (N={n_baf_clones}) using BAF & RDR ****" + f"**** REFINING INITIAL, MERGED CLONES (N={n_baf_clones}) USING BAF & RDR ****" ) for bafc in range(n_baf_clones): From fbfe56b1df470eff18b643952498f48e68efb53c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 20:05:00 -0400 Subject: [PATCH 081/125] log phasing baum welch. --- src/calicost/parse_input.py | 24 ++++++++---------------- src/calicost/phasing.py | 20 +++++++++++++++++++- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/calicost/parse_input.py b/src/calicost/parse_input.py index f723e4a..16af14a 100644 --- a/src/calicost/parse_input.py +++ b/src/calicost/parse_input.py @@ -160,6 +160,7 @@ def parse_visium(config): logphase_shift=config["logphase_shift"], geneticmap_file=config["geneticmap_file"], ) + # infer an initial phase using pseudobulk if not Path(f"{config['output_dir']}/initial_phase.npz").exists(): initial_clone_for_phasing = perform_partition( @@ -170,6 +171,7 @@ def parse_visium(config): single_tumor_prop=single_tumor_prop, threshold=config["tumorprop_threshold"], ) + phase_indicator, refined_lengths = initial_phase_given_partition( single_X, lengths, @@ -190,11 +192,15 @@ def parse_visium(config): 1e-3, threshold=config["tumorprop_threshold"], ) + + logger.info(f"Writing initial pase to {config['output_dir']}/initial_phase.npz") + np.savez( f"{config['output_dir']}/initial_phase.npz", phase_indicator=phase_indicator, refined_lengths=refined_lengths, ) + # map phase indicator to individual snps df_gene_snp["phase"] = np.where( df_gene_snp.snp_id.isnull(), @@ -228,21 +234,6 @@ def parse_visium(config): logphase_shift=config["logphase_shift"], geneticmap_file=config["geneticmap_file"], ) - # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps = perform_binning_new(lengths, single_X, \ - # single_base_nb_mean, single_total_bb_RD, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps, phase_indicator, refined_lengths, config["binsize"], config["rdrbinsize"], config["nu"], config["logphase_shift"], secondary_min_umi=secondary_min_umi) - - # # remove bins where normal spots have imbalanced SNPs - # if not config["tumorprop_file"] is None: - # for prop_threshold in np.arange(0, 0.6, 0.05): - # normal_candidate = (single_tumor_prop <= prop_threshold) - # if np.sum(single_X[:, 0, (normal_candidate==True)]) > single_X.shape[0] * 200: - # break - # index_normal = np.where(normal_candidate)[0] - # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \ - # single_X, single_base_nb_mean, single_total_bb_RD, config["nu"], config["logphase_shift"], index_normal, config['geneticmap_file']) - # assert np.sum(lengths) == single_X.shape[0] - # 
assert single_X.shape[0] == single_total_bb_RD.shape[0] - # assert single_X.shape[0] == len(log_sitewise_transmat) # expression count dataframe exp_counts = pd.DataFrame.sparse.from_spmatrix( @@ -263,9 +254,10 @@ def parse_visium(config): maxspots_pooling=config["maxspots_pooling"], construct_adjacency_w=config["construct_adjacency_w"], ) + n_pooled = np.median(np.sum(smooth_mat > 0, axis=0).A.flatten()) - logger.info(f"Set up number of spots to pool in HMRF: {n_pooled}") + logger.info(f"Set up number of spots to pool for HMRF: {n_pooled}") # If adjacency matrix is only constructed using gene expression similarity (e.g. scRNA-seq data) # Then, directly replace coords by the umap of gene expression, to avoid potential inconsistency in HMRF initialization diff --git a/src/calicost/phasing.py b/src/calicost/phasing.py index e4c9447..385a8be 100644 --- a/src/calicost/phasing.py +++ b/src/calicost/phasing.py @@ -102,11 +102,19 @@ def initial_phase_given_partition( threshold, min_snpumi=2e3, ): + + n_obs, _, n_spots = single_X.shape + + logger.info(f"Computing initial_phase_given_partition for (n_states, n_obs, n_spots) = ({n_states}, {n_obs}, {n_spots}).") + + # TODO HARDCODE EPS_BAF = 0.05 + if single_tumor_prop is None: X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index ) + tumor_prop = None else: X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( @@ -121,6 +129,7 @@ def initial_phase_given_partition( # pseudobulk HMM for phase_prob baf_profiles = np.zeros((X.shape[2], X.shape[0])) pred_cnv = np.zeros((X.shape[2], X.shape[0])) + for i in range(X.shape[2]): if np.sum(total_bb_RD[:, i]) < min_snpumi: baf_profiles[i, :] = 0.5 @@ -150,7 +159,7 @@ def initial_phase_given_partition( max_iter=max_iter, tol=tol, ) - # + pred = np.argmax(res["log_gamma"], axis=0) this_baf_profiles = np.where( pred < n_states, @@ -180,10 +189,12 @@ def initial_phase_given_partition( ) @ baf_profiles ) + adj_baf_profiles = np.where(baf_profiles < 0.5, baf_profiles, 1 - baf_profiles) phase_indicator = population_baf < 0.5 refined_lengths = [] cumlen = 0 + for le in lengths: s = 0 for i in range(le): @@ -199,14 +210,19 @@ def initial_phase_given_partition( refined_lengths.append(le - s) cumlen += le refined_lengths = np.array(refined_lengths) + + logger.info(f"Computed initial_phase_given_partition.") + return phase_indicator, refined_lengths def perform_partition(coords, sample_ids, x_part, y_part, single_tumor_prop, threshold): initial_clone_index = [] + for s in range(np.max(sample_ids) + 1): index = np.where(sample_ids == s)[0] assert len(index) > 0 + if single_tumor_prop is None: tmp_clone_index = fixed_rectangle_initialization( coords[index, :], x_part, y_part @@ -219,6 +235,8 @@ def perform_partition(coords, sample_ids, x_part, y_part, single_tumor_prop, thr single_tumor_prop[index], threshold=threshold, ) + for x in tmp_clone_index: initial_clone_index.append(index[x]) + return initial_clone_index From 8969c351dfe2bec34566735b12ed31daadb7bfd0 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Sun, 18 Aug 2024 20:13:51 -0400 Subject: [PATCH 082/125] fix --- src/calicost/calicost_main.py | 2 +- src/calicost/phasing.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 4975141..6f7473d 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -84,7 +84,7 @@ def main(configuration_file): exp_counts, ) = run_parse_n_load(config) - logger.info(f"**** Estimating initial clones using BAF only ****") + logger.info(f"**** ESTIMATING INITIAL CLONES USING BAF ONLY ****") # NB setting transcript & baseline count to 0 so the emission probability will be ignored. copy_single_X_rdr = copy.copy(single_X[:, 0, :]) diff --git a/src/calicost/phasing.py b/src/calicost/phasing.py index 385a8be..745f4eb 100644 --- a/src/calicost/phasing.py +++ b/src/calicost/phasing.py @@ -104,7 +104,8 @@ def initial_phase_given_partition( ): n_obs, _, n_spots = single_X.shape - + + logger.info(f"**** COMPUTING INITIAL PHASE ****") logger.info(f"Computing initial_phase_given_partition for (n_states, n_obs, n_spots) = ({n_states}, {n_obs}, {n_spots}).") # TODO HARDCODE From 726bc11b118e4960fa03aa4d612f912c8cabe27c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 20:39:41 -0400 Subject: [PATCH 083/125] log initial alpha and tau --- src/calicost/hmm_NB_BB_nophasing_v2.py | 3 +++ src/calicost/hmm_NB_BB_phaseswitch.py | 12 ++++++++---- src/calicost/utils_IO.py | 2 +- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index a360736..e851014 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -361,6 +361,9 @@ def run_baum_welch_nb_bb( taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus use_defaults = (init_log_mu is None) and (init_p_binom is None) and (init_alphas is None) and (init_taus is None) + + logger.info(f"Initial alphas:\n{alphas}") + logger.info(f"Initial taus:\n{taus}") logger.info(f"Initialized Baum Welch NB logmean shift, BetaBinom prob and dispersion params inverse (use_defaults = {use_defaults}).") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 0aceb3f..0293cf5 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -394,12 +394,9 @@ def run_baum_welch_nb_bb( n_obs = X.shape[0] n_comp = X.shape[1] n_spots = X.shape[2] + assert n_comp == 2 - logger.info( - "Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse (sitewise)." - ) - log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None @@ -417,6 +414,13 @@ def run_baum_welch_nb_bb( ) taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus + logger.info(f"Initial alphas:\n{alphas}") + logger.info(f"Initial taus:\n{taus}") + + logger.info( + "Initialize Baum Welch NB logmean shift, BetaBinom prob and dispersion param inverse (sitewise)." 
+ ) + # NB initialize start probability and emission probability log_startprob = np.log(np.ones(n_states) / n_states) if n_states > 1: diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index 2f0ee47..6e38ff8 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -1522,7 +1522,7 @@ def bin_selection_basedon_normal( min_betabinom_tau=30, ): """ - Filter out bins that potential contain somatic mutations based on BAF of normal spots. + Filter out bins that potentially contain somatic mutations based on BAF of normal spots. """ # pool B allele counts for each bin across all normal spots tmpX = np.sum(single_X[:, 1, index_normal], axis=1) From 6f85e89e0bee44a088111a9a6cf82a4ba923909d Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 20:53:22 -0400 Subject: [PATCH 084/125] fix --- src/calicost/calicost_main.py | 2 +- src/calicost/hmrf.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 6f7473d..1ef0c62 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -485,7 +485,7 @@ def main(configuration_file): ) for bafc in range(n_baf_clones): - logger.info(f"Refining BAF clone {bafc}.") + logger.info(f"**** Refining BAF clone {bafc} ****") prefix = f"clone{bafc}" idx_spots = np.where(merged_baf_assignment == bafc)[0] diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 5963aa2..0a5b853 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -1815,8 +1815,9 @@ def hmrfmix_reassignment_posterior_concatenate( ) ) + unique_assignment, cnts = np.unique(new_assignment, return_counts=True) + logger.info(f"Computed hmrfmix_reassignment_posterior_concatenate.") - logger.info(f"Found a new clone assignment for N={N}:\n{np.unique(new_assignment, return_counts=True)}") if return_posterior: From 160ca9c694976846fe0e39c5c34d841d4f762b6f Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 21:03:12 -0400 Subject: [PATCH 085/125] fix --- src/calicost/calicost_main.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 1ef0c62..d1619af 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -596,7 +596,7 @@ def main(configuration_file): tumorprop_threshold=config["tumorprop_threshold"], ) - logger.info(f"Combining results across clones.") + logger.info(f"**** REFINED CLONES BY RDR. COMBINING RESULTS ACROSS CLONES. ****") res_combine = {"prev_assignment": np.zeros(single_X.shape[2], dtype=int)} offset_clone = 0 @@ -672,7 +672,7 @@ def main(configuration_file): tumor_prop = np.repeat(tumor_prop, X.shape[0]).reshape(-1, 1) logger.info( - f"Merging BAF+RDR clones based on Neyman-Pearson Likelihood ratio." + f"**** MERGING BAF+RDR CLONES BASED ON NEYMAN-PEARSON LIKELIHOOD RATIO ****" ) merging_groups, merged_res = ( @@ -750,7 +750,7 @@ def main(configuration_file): ) logger.info( - f"Running Baum-Welch with refined & merged BAF+RDR clones." 
+ f"**** EVALUATING BAUM-WELCH WITH REFINED & MERGED BAF+RDR CLONES ****" ) merged_res = pipeline_baum_welch( @@ -1000,6 +1000,10 @@ def main(configuration_file): res_combine, posterior, single_tumor_prop ) + logger.info( + f"**** EVALUATED BAUM-WELCH WITH REFINED & MERGED BAF+RDR CLONES ****" + ) + logger.info( f"Writing likelihood and new clone assignment to {outdir}/rdrbaf_final_nstates{config['n_states']}_smp.npz" ) From 4a8103f76cd7d78ae465ae869eecce0293cc1b38 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 21:13:04 -0400 Subject: [PATCH 086/125] fix --- src/calicost/utils_IO.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index 6e38ff8..801049c 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -114,8 +114,8 @@ def load_data( ).A.flatten() genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] - print(adata) - print( + logger.info(adata) + logger.info( "median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) @@ -130,7 +130,7 @@ def load_data( [(not x in filter_gene_list) for x in adata.var.index] ) adata = adata[:, indicator_filter] - print( + logger.info( "median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) @@ -172,13 +172,13 @@ def load_data( clf = LocalOutlierFactor(n_neighbors=200) label = clf.fit_predict(np.sum(adata.layers["count"], axis=0).reshape(-1, 1)) adata.layers["count"][:, np.where(label == -1)[0]] = 0 - print("filter out {} outlier genes.".format(np.sum(label == -1))) + logger.info("filter out {} outlier genes.".format(np.sum(label == -1))) if not normalidx_file is None: normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:, 0].values adata.obs["tumor_annotation"] = "tumor" adata.obs["tumor_annotation"][adata.obs.index.isin(normal_barcodes)] = "normal" - print(adata.obs["tumor_annotation"].value_counts()) + logger.info(adata.obs["tumor_annotation"].value_counts()) return adata, cell_snp_Aallele.A, cell_snp_Ballele.A, unique_snp_ids @@ -374,8 +374,8 @@ def load_joint_data( ).A.flatten() genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] - print(adata) - print( + logger.info(adata) + logger.info( "median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) @@ -388,7 +388,7 @@ def load_joint_data( [(not x in filter_gene_list) for x in adata.var.index] ) adata = adata[:, indicator_filter] - print( + logger.info( "median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) @@ -430,13 +430,13 @@ def load_joint_data( clf = LocalOutlierFactor(n_neighbors=200) label = clf.fit_predict(np.sum(adata.layers["count"], axis=0).reshape(-1, 1)) adata.layers["count"][:, np.where(label == -1)[0]] = 0 - print("filter out {} outlier genes.".format(np.sum(label == -1))) + logger.info("filter out {} outlier genes.".format(np.sum(label == -1))) if not normalidx_file is None: normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:, 0].values adata.obs["tumor_annotation"] = "tumor" adata.obs["tumor_annotation"][adata.obs.index.isin(normal_barcodes)] = "normal" - print(adata.obs["tumor_annotation"].value_counts()) + logger.info(adata.obs["tumor_annotation"].value_counts()) return ( adata, @@ -548,8 +548,8 @@ def 
filter_genes_barcodes_hatchetblock( ).A.flatten() genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] - print(adata) - print( + logger.info(adata) + logger.info( "median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) @@ -562,7 +562,7 @@ def filter_genes_barcodes_hatchetblock( [(not x in filter_gene_list) for x in adata.var.index] ) adata = adata[:, indicator_filter] - print( + logger.info( "median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) @@ -1680,7 +1680,7 @@ def filter_de_genes( ) ) filtered_out_set = filtered_out_set | this_filtered_out_set - print(f"Filter out {len(filtered_out_set)} DE genes") + logger.info(f"Filter out {len(filtered_out_set)} DE genes") # new_single_X_rdr = np.zeros((len(x_gene_list), adata.shape[0])) for i, x in enumerate(x_gene_list): From 575ca764c0917b1aba4ff415771904b5046343be Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 18 Aug 2024 21:31:54 -0400 Subject: [PATCH 087/125] fix --- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmm_NB_BB_phaseswitch.py | 2 +- src/calicost/hmrf.py | 25 +++++++++++++------------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index e851014..b3ed727 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -593,7 +593,7 @@ def run_baum_welch_nb_bb( p_binom = new_p_binom taus = new_taus - logger.info("Computed Baum-Welch (v2) in {r+1} iterations.") + logger.info(f"Computed Baum-Welch (v2) in {r+1} iterations.") logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 0293cf5..b32191b 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -587,7 +587,7 @@ def run_baum_welch_nb_bb( p_binom = new_p_binom taus = new_taus - logger.info("Computed Baum-Welch (sitewise) in {r+1} iterations.") + logger.info(f"Computed Baum-Welch (sitewise) in {r+1} iterations.") logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 0a5b853..b6be6ca 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -66,6 +66,7 @@ def hmrf_reassignment_posterior( for i in range(N): idx = smooth_mat[i, :].nonzero()[1] + for c in range(n_clones): tmp_log_emission_rdr, tmp_log_emission_baf = ( hmmclass.compute_emission_probability_nb_betabinom( @@ -119,7 +120,7 @@ def hmrf_reassignment_posterior( if new_assignment[j] >= 0: w_edge[new_assignment[j]] += adjacency_mat[i, j] new_assignment[i] = np.argmax(w_node + spatial_weight * w_edge) - # + posterior[i, :] = np.exp( w_node + spatial_weight * w_edge @@ -128,6 +129,7 @@ def hmrf_reassignment_posterior( # compute total log likelihood log P(X | Z) + log P(Z) total_llf = np.sum(single_llf[np.arange(N), new_assignment]) + for i in range(N): total_llf += np.sum( spatial_weight @@ -178,7 +180,7 @@ def aggr_hmrf_reassignment( posterior = np.zeros((N, n_clones)) logger.info( - "Computing aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass}." 
+ "Computing aggr_hmrf_posterior with compute_emission_probability_nb_betabinom of {hmmclass} for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." ) for i in range(N): @@ -277,7 +279,7 @@ def hmrf_reassignment_posterior_concatenate( posterior = np.zeros((N, n_clones)) logger.info( - "Computing hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." + "Computing hmrf_reassignment_posterior_concatenate with compute_emission_probability_nb_betabinom of {hmmclass} for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." ) for i in range(N): @@ -424,11 +426,6 @@ def aggr_hmrf_reassignment_concatenate( total_llf : float The HMRF objective, which is the sum of log likelihood under the optimal labels plus the sum of edge potentials. """ - - logger.info( - "Computing aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass}." - ) - N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = int(len(pred) / n_obs) @@ -436,6 +433,10 @@ def aggr_hmrf_reassignment_concatenate( single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) + logger.info( + "Computing aggr_hmrf_reassignment_concatenate with compute_emission_probability_nb_betabinom of {hmmclass} for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." + ) + posterior = np.zeros((N, n_clones)) for i in range(N): @@ -1160,16 +1161,16 @@ def aggr_hmrfmix_reassignment( hmmclass=hmm_sitewise, return_posterior=False, ): - logger.info( - f"Computing aggr_hmrfmix_reassignment with compute_emission_probability_nb_betabinom of {hmmclass}." - ) - N = single_X.shape[2] n_obs = single_X.shape[0] n_clones = res["new_log_mu"].shape[1] n_states = res["new_p_binom"].shape[0] single_llf = np.zeros((N, n_clones)) new_assignment = copy.copy(prev_assignment) + + logger.info( + f"Computing aggr_hmrfmix_reassignment with compute_emission_probability_nb_betabinom of {hmmclass} for (N, n_obs, n_clones, n_states) = ({N}, {n_obs}, {n_clones}, {n_states})." + ) lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean) From 75a913279074a8657c96b0d67fdf04f05933008e Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 19 Aug 2024 06:44:04 -0400 Subject: [PATCH 088/125] fix --- src/calicost/hmrf.py | 6 +++--- src/calicost/utils_IO.py | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index b6be6ca..425c8c7 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -661,7 +661,7 @@ def hmrf_pipeline( sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) - logger.info("Merging pseudobulk by clone index") + logger.info("Merging pseudobulk based on clone index") X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index @@ -907,7 +907,7 @@ def hmrf_concatenate_pipeline( log_persample_weights = np.ones((n_clones, n_samples)) * np.log(n_clones) - logger.info("Merging pseudobulk by clone index") + logger.info("Merging pseudobulk based on clone index") X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index( single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index @@ -2000,7 +2000,7 @@ def hmrfmix_concatenate_pipeline( sample_ids = np.array([tmp_map_index[x] for x in sample_ids]) log_persample_weights = np.ones((n_clones, n_samples)) * (-np.log(n_clones)) - logger.info("Merging pseudobulk by clone index") + logger.info("Merging pseudobulk based on clone index") X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix( single_X, diff --git a/src/calicost/utils_IO.py b/src/calicost/utils_IO.py index 801049c..b3a6e4a 100644 --- a/src/calicost/utils_IO.py +++ b/src/calicost/utils_IO.py @@ -199,8 +199,8 @@ def load_joint_data( columns=dict(zip(df_meta.columns[:3], ["bam", "sample_id", "spaceranger_dir"])), inplace=True, ) - logger.info(f"Input spaceranger file list {input_filelist} contains:") - logger.info(df_meta) + logger.info(f"Input spaceranger file list {input_filelist} contains:\n{df_meta}") + df_barcode = pd.read_csv( f"{snp_dir}/barcodes.txt", header=None, names=["combined_barcode"] ) @@ -376,7 +376,7 @@ def load_joint_data( adata = adata[:, indicator] logger.info(adata) logger.info( - "median UMI after filtering out genes < 0.5% of cells = {}".format( + "Median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) ) @@ -389,7 +389,7 @@ def load_joint_data( ) adata = adata[:, indicator_filter] logger.info( - "median UMI after filtering out genes in filtergenelist_file = {}".format( + "Median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) ) ) @@ -430,7 +430,8 @@ def load_joint_data( clf = LocalOutlierFactor(n_neighbors=200) label = clf.fit_predict(np.sum(adata.layers["count"], axis=0).reshape(-1, 1)) adata.layers["count"][:, np.where(label == -1)[0]] = 0 - logger.info("filter out {} outlier genes.".format(np.sum(label == -1))) + + logger.info("Filter out {} outlier genes.".format(np.sum(label == -1))) if not normalidx_file is None: normal_barcodes = pd.read_csv(normalidx_file, header=None).iloc[:, 0].values From 35ee3427e77da8851208772f4295132336940616 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 19 Aug 2024 07:49:05 -0400 Subject: [PATCH 089/125] fix --- src/calicost/hmm_NB_BB_nophasing_v2.py | 1 + src/calicost/hmrf.py | 2 +- src/calicost/utils_hmm.py | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index b3ed727..eb6bfd5 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -389,6 +389,7 @@ def run_baum_welch_nb_bb( ) for r in range(max_iter): + logger.info("-" * 250) logger.info(f"Calculating E-step (v2) for iteration {r} of {max_iter}.") if tumor_prop is None: diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 425c8c7..9e3263c 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -2198,7 +2198,7 @@ def hmrfmix_concatenate_pipeline( allres["num_iterations"] = r + 1 logger.info( - f"Writing assignments for HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" + f"Writing round ({r}, {k}) assignments for HMM iteration {r} to {outdir}/{prefix}_nstates{n_states}_{params}.npz" ) np.savez(f"{outdir}/{prefix}_nstates{n_states}_{params}.npz", **allres) diff --git a/src/calicost/utils_hmm.py b/src/calicost/utils_hmm.py index e953f90..a731e19 100644 --- a/src/calicost/utils_hmm.py +++ b/src/calicost/utils_hmm.py @@ -1363,7 +1363,7 @@ def update_emission_params_bb_sitewise_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for Beta Binomial Mix custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial Mix with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -1842,7 +1842,7 @@ def update_emission_params_nb_nophasing_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for Negative Binomial Mix custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Negative Binomial Mix with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -2043,7 +2043,7 @@ def update_emission_params_bb_nophasing_uniqvalues( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for Beta Binomial custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): @@ -2273,7 +2273,7 @@ def update_emission_params_bb_nophasing_uniqvalues_mix( nloglikeobs2 = model.nloglikeobs(res2.params) nloglikeobs = model.nloglikeobs(res.params) - logger.info(f"Comparing loglike for Beta Binomial Mix custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") + logger.info(f"Comparing loglike for Beta Binomial Mix with custom start {nloglikeobs2:.6e} to default start {nloglikeobs:.6e}.") if nloglikeobs2 < nloglikeobs: for s, idx_state_posweight in enumerate(state_posweights): From 0b8720a2519e366046ec05bc057ed3d486a065fc Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Tue, 20 Aug 2024 10:19:22 -0400 Subject: [PATCH 090/125] write Weighted Beta Binom chain file --- src/calicost/utils_distribution_fitting.py | 84 +++++++++++++++++----- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 1ca6d9a..6db5a15 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -1,22 +1,23 @@ +import contextlib import functools import inspect import logging +import os +import sys +import time import numpy as np import scipy -import time -from scipy import linalg, special -from scipy.special import logsumexp, loggamma import scipy.integrate import scipy.stats +import statsmodels +import statsmodels.api as sm from numba import jit, njit +from scipy import linalg, special +from scipy.special import loggamma, logsumexp from sklearn import cluster from sklearn.utils import check_random_state -import statsmodels -import statsmodels.api as sm from statsmodels.base.model import GenericLikelihoodModel -import os - logger = logging.getLogger(__name__) @@ -40,6 +41,20 @@ def convert_params(mean, std): return n, p +@contextlib.contextmanager +def save_stdout(fpath): + original = sys.stdout + + with open(fpath, "w") as ff: + sys.stdout = ff + try: + yield + + # NB teardown + finally: + sys.stdout = original + + class Weighted_NegativeBinomial(GenericLikelihoodModel): """ Negative Binomial model endog ~ NB(exposure * exp(exog @ params[:-1]), params[-1]), where exog is the design matrix, and params[-1] is 1 / overdispersion. @@ -170,7 +185,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): return result - + class Weighted_BetaBinom(GenericLikelihoodModel): """ Beta-binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. @@ -190,6 +205,8 @@ class Weighted_BetaBinom(GenericLikelihoodModel): exposure : array, (n_samples,) Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
""" + ninstance = 0 + def __init__(self, endog, exog, weights, exposure, **kwds): super(Weighted_BetaBinom, self).__init__(endog, exog, **kwds) @@ -198,12 +215,25 @@ def __init__(self, endog, exog, weights, exposure, **kwds): self.weights = weights self.exposure = exposure + # NB update the instance count + Weighted_BetaBinom.ninstance += 1 + + def nloglikeobs(self, params): a = (self.exog @ params[:-1]) * params[-1] b = (1 - self.exog @ params[:-1]) * params[-1] return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + def callback(self, params): + nloglike = self.nloglikeobs(params) + + print(params, nloglike) + + @classmethod + def get_ninstance(cls): + return cls.ninstance + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): self.exog_names.append("tau") @@ -223,16 +253,33 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): start = time.time() - result = super(Weighted_BetaBinom, self).fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=None, - full_output=True, - retall=False, - **kwds - ) + # NB kwds = {'xtol': 0.0001, 'ftol': 0.0001, disp: False} + kwds.pop("disp", None) + + with save_stdout("weighted_betabinom_chain.tmp"): + result = super(Weighted_BetaBinom, self).fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + skip_hessian=True, + callback=self.callback, + full_output=True, + retall=True, + disp=False, + **kwds + ) + + with open("weighted_betabinom_chain.tmp") as fin: + with open("weighted_betabinom_chain.txt", "w") as fout: + fout.write(f"# Weighted_BetaBinom {Weighted_BetaBinom.get_ninstance()} @ {time.asctime()}:\n") + fout.write(f"start_type={start_params_str}, shape={self.endog.shape[0]}" + ", ".join(f"{key}: {value}" for key, value in result.mle_retvals.items())) + + for line in fin: + fout.write(line) + + os.remove("weighted_betabinom_chain.tmp") + + breakpoint() # NB specific to nm (Nelder-Mead) optimization. niter = result.mle_retvals["iterations"] @@ -241,6 +288,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): return result + class Weighted_BetaBinom_mix(GenericLikelihoodModel): def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds): super(Weighted_BetaBinom_mix, self).__init__(endog, exog, **kwds) From 137e4bbc26f92d42c8fdf73b95af57dd7e96d21c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Tue, 20 Aug 2024 11:30:00 -0400 Subject: [PATCH 091/125] fix --- src/calicost/utils_distribution_fitting.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 6db5a15..be6ccc9 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -221,14 +221,14 @@ def __init__(self, endog, exog, weights, exposure, **kwds): def nloglikeobs(self, params): a = (self.exog @ params[:-1]) * params[-1] - b = (1 - self.exog @ params[:-1]) * params[-1] + b = (1. 
- self.exog @ params[:-1]) * params[-1] return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) def callback(self, params): nloglike = self.nloglikeobs(params) - print(params, nloglike) + print(params, nloglike, ";") @classmethod def get_ninstance(cls): @@ -269,17 +269,20 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): **kwds ) + ninst = Weighted_BetaBinom.get_ninstance() + + # TODO mkdir chains with open("weighted_betabinom_chain.tmp") as fin: - with open("weighted_betabinom_chain.txt", "w") as fout: - fout.write(f"# Weighted_BetaBinom {Weighted_BetaBinom.get_ninstance()} @ {time.asctime()}:\n") - fout.write(f"start_type={start_params_str}, shape={self.endog.shape[0]}" + ", ".join(f"{key}: {value}" for key, value in result.mle_retvals.items())) + with open(f"chains/weighted_betabinom_chain_{ninst}.txt", "w") as fout: + fout.write(f"# Weighted_BetaBinom {ninst} @ {time.asctime()}\n") + fout.write(f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," + ",".join(f"{key}:{value}" for key, value in result.mle_retvals.items()) + "\n") for line in fin: fout.write(line) os.remove("weighted_betabinom_chain.tmp") - breakpoint() + # breakpoint() # NB specific to nm (Nelder-Mead) optimization. niter = result.mle_retvals["iterations"] From 3d46a85b0c01a018769cf6074e455b77b28bce6f Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Tue, 20 Aug 2024 11:43:37 -0400 Subject: [PATCH 092/125] finishing touches --- src/calicost/utils_distribution_fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index be6ccc9..ea593a4 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -228,7 +228,7 @@ def nloglikeobs(self, params): def callback(self, params): nloglike = self.nloglikeobs(params) - print(params, nloglike, ";") + print(f"{params} {nloglike};") @classmethod def get_ninstance(cls): From 9ac416b7da1368eff1567d4c3289f4378fe05cc6 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Sun, 25 Aug 2024 22:28:05 -0400 Subject: [PATCH 093/125] ABC for emission models --- src/calicost/utils_distribution_fitting.py | 376 ++++++--------------- 1 file changed, 112 insertions(+), 264 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index ea593a4..6551ee1 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -18,6 +18,7 @@ from sklearn import cluster from sklearn.utils import check_random_state from statsmodels.base.model import GenericLikelihoodModel +from abc import ABC, abstractmethod logger = logging.getLogger(__name__) @@ -54,92 +55,132 @@ def save_stdout(fpath): finally: sys.stdout = original - -class Weighted_NegativeBinomial(GenericLikelihoodModel): +class WeightedModel(GenericLikelihoodModel, ABC): """ - Negative Binomial model endog ~ NB(exposure * exp(exog @ params[:-1]), params[-1]), where exog is the design matrix, and params[-1] is 1 / overdispersion. - This function fits the NB params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) - - Attributes - ---------- - endog : array, (n_samples,) - Y values. - - exog : array, (n_samples, n_features) - Design matrix. - - weights : array, (n_samples,) - Sample weights. - - exposure : array, (n_samples,) + An ABC for defined emission models. 
+ + Attributes + ---------- + endog : array, (n_samples,) + Y values. + exog : array, (n_samples, n_features) + Design matrix. + + weights : array, (n_samples,) + Sample weights. + exposure : array, (n_samples,) Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ - def __init__(self, endog, exog, weights, exposure, seed=0, **kwds): - super(Weighted_NegativeBinomial, self).__init__(endog, exog, **kwds) - - logger.info(f"Initializing Weighted_NegativeBinomial model for endog.shape = {endog.shape}.") - + def __init__(self, endog, exog, weights, exposure, tumor_prop=None, seed=0, **kwds): + super(WeightedModel, self).__init__(endog, exog, **kwds) + self.weights = weights self.exposure = exposure + + # NB Weight_BetaBinomial does not specify seed self.seed = seed + self.tumor_prop = tumor_prop - def nloglikeobs(self, params): - nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure - nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) + self.__post_init__() + + logger.info(f"Initializing {self.__class__.__name__} model for endog.shape = {endog.shape}.") - n, p = convert_params(nb_mean, nb_std) + @abstractmethod + def nloglikeobs(self, params): + pass - return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) + @abstractmethod + def get_default_start_params(self): + pass + + @abstractmethod + def get_ext_param_name(self): + pass + + @classmethod + @abstractmethod + def get_ninstance(cls): + pass + + @abstractmethod + def __post_init__(self): + # NB will increment the instance count for each derived class. + pass + + def callback(self, params): + nloglike = self.nloglikeobs(params) + print(f"{params} {nloglike};") + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): - self.exog_names.append("alpha") + ext_param_name = self.get_ext_param_name() + self.exog_names.append(ext_param_name) + if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params start_params_str = "existing" - else: - start_params = np.append(0.1 * np.ones(self.nparams), 0.01) + start_params = self.default_start_params() start_params_str = "default" else: start_params_str = "input" - logger.info(f"Starting Weighted_NegativeBinomial optimization @ ({start_params_str}) {start_params}.") - + logger.info(f"Starting {self.__class__.__name__} optimization @ ({start_params_str}) {start_params}.") + start = time.time() - # NB see https://www.statsmodels.org/dev/dev/generated/statsmodels.base.model.LikelihoodModelResults.html - result = super(Weighted_NegativeBinomial, self).fit( + result = super(Weighted_Model, self).fit( start_params=start_params, maxiter=maxiter, maxfun=maxfun, skip_hessian=True, - callback=None, + callback=self.callback, full_output=True, - retall=False, + retall=True, + disp=False, **kwds ) - # NB specific to nm (Nelder-Mead) optimization. + # NB specific to nm (Nelder-Mead) optimization. niter = result.mle_retvals["iterations"] - logger.info(f"Finished Weighted_NegativeBinomial optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + logger.info(f"Finished {self.__class__.__name__} optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") return result +class Weighted_NegativeBinomial(WeightedModel): + """ + Negative Binomial model endog ~ NB(exposure * exp(exog @ params[:-1]), params[-1]), where exog is the design matrix, and params[-1] is 1 / overdispersion. 
+ This function fits the NB params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) + """ + ninstance = 0 + + def nloglikeobs(self, params): + nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure + nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) + + n, p = convert_params(nb_mean, nb_std) -class Weighted_NegativeBinomial_mix(GenericLikelihoodModel): - def __init__(self, endog, exog, weights, exposure, tumor_prop, seed=0, **kwds): - super(Weighted_NegativeBinomial_mix, self).__init__(endog, exog, **kwds) + return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) - logger.info(f"Initializing Weighted_NegativeBinomial_mix model for endog.shape = {endog.shape}.") + def get_default_start_params(self): + return np.append(0.1 * np.ones(self.exog.shape[1]), 0.01) - self.weights = weights - self.exposure = exposure - self.seed = seed - self.tumor_prop = tumor_prop + def get_ext_param_name(): + return "alpha" + def __post_init__(self): + Weighted_NegativeBinomial.ninstance += 1 + + @classmethod + def get_ninstance(cls): + return cls.ninstance + +class Weighted_NegativeBinomial_mix(WeightedModel): + ninstance = 0 + def nloglikeobs(self, params): nb_mean = self.exposure * ( self.tumor_prop * np.exp(self.exog @ params[:-1]) + 1 - self.tumor_prop @@ -149,159 +190,38 @@ def nloglikeobs(self, params): n, p = convert_params(nb_mean, nb_std) return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) + + def get_default_start_params(self): + return np.append(0.1 * np.ones(self.nparams), 0.01) - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): - self.exog_names.append("alpha") - - if start_params is None: - if hasattr(self, "start_params"): - start_params = self.start_params - start_params_str = "existing" - else: - start_params = np.append(0.1 * np.ones(self.nparams), 0.01) - start_params_str = "default" - else: - start_params_str = "input" - - logger.info(f"Starting Weighted_NegativeBinomial_mix optimization @ ({start_params_str}) {start_params}.") - - start = time.time() - - result = super(Weighted_NegativeBinomial_mix, self).fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=None, - full_output=True, - retall=False, - **kwds - ) - - # NB specific to nm (Nelder-Mead) optimization. - niter = result.mle_retvals["iterations"] - - logger.info(f"Finished Weighted_NegativeBinomial_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") - - return result + def get_ext_param_name(self): + return "alpha" - -class Weighted_BetaBinom(GenericLikelihoodModel): + def __post_init__(self): + assert self.tumor_prop is not None, "Tumor proportion must be defined." + +class Weighted_BetaBinom(WeightedModel): """ Beta-binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. This function fits the BetaBin params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) - - Attributes - ---------- - endog : array, (n_samples,) - Y values. - - exog : array, (n_samples, n_features) - Design matrix. - - weights : array, (n_samples,) - Sample weights. - - exposure : array, (n_samples,) - Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
""" ninstance = 0 - def __init__(self, endog, exog, weights, exposure, **kwds): - super(Weighted_BetaBinom, self).__init__(endog, exog, **kwds) - - logger.info(f"Initializing Weighted_BetaBinomial model for endog.shape = {endog.shape}.") - - self.weights = weights - self.exposure = exposure - - # NB update the instance count - Weighted_BetaBinom.ninstance += 1 - - def nloglikeobs(self, params): a = (self.exog @ params[:-1]) * params[-1] b = (1. - self.exog @ params[:-1]) * params[-1] return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) - def callback(self, params): - nloglike = self.nloglikeobs(params) - - print(f"{params} {nloglike};") - - @classmethod - def get_ninstance(cls): - return cls.ninstance - - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): - self.exog_names.append("tau") - - if start_params is None: - if hasattr(self, "start_params"): - start_params = self.start_params - start_params_str = "existing" - else: - start_params = np.append( - 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 - ) - start_params_str = "default" - else: - start_params_str = "input" - - logger.info(f"Starting Weighted_BetaBinomial optimization @ ({start_params_str}) {start_params}.") - - start = time.time() - - # NB kwds = {'xtol': 0.0001, 'ftol': 0.0001, disp: False} - kwds.pop("disp", None) - - with save_stdout("weighted_betabinom_chain.tmp"): - result = super(Weighted_BetaBinom, self).fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=self.callback, - full_output=True, - retall=True, - disp=False, - **kwds - ) - - ninst = Weighted_BetaBinom.get_ninstance() - - # TODO mkdir chains - with open("weighted_betabinom_chain.tmp") as fin: - with open(f"chains/weighted_betabinom_chain_{ninst}.txt", "w") as fout: - fout.write(f"# Weighted_BetaBinom {ninst} @ {time.asctime()}\n") - fout.write(f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," + ",".join(f"{key}:{value}" for key, value in result.mle_retvals.items()) + "\n") - - for line in fin: - fout.write(line) - - os.remove("weighted_betabinom_chain.tmp") - - # breakpoint() - - # NB specific to nm (Nelder-Mead) optimization. 
- niter = result.mle_retvals["iterations"] - - logger.info(f"Finished Weighted_BetaBinomial optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") - - return result + def get_default_start_params(self): + return np.append( + 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 + ) + def get_ext_param_name(): + return "tau" -class Weighted_BetaBinom_mix(GenericLikelihoodModel): - def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds): - super(Weighted_BetaBinom_mix, self).__init__(endog, exog, **kwds) - - logger.info(f"Initializing Weighted_BetaBinom_mix model for endog.shape = {endog.shape}.") - - self.weights = weights - self.exposure = exposure - self.tumor_prop = tumor_prop - +class Weighted_BetaBinom_mix(WeightedModel_mix): def nloglikeobs(self, params): a = ( self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop) @@ -314,44 +234,17 @@ def nloglikeobs(self, params): return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): - self.exog_names.append("tau") - - if start_params is None: - if hasattr(self, "start_params"): - start_params = self.start_params - start_params_str = "existing" - else: - start_params = np.append( - 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 - ) - start_params_str = "default" - else: - start_params_str = "input" - - logger.info(f"Starting Weighted_BetaBinom_mix optimization @ ({start_params_str}) {start_params}.") - - start = time.time() - - result = super(Weighted_BetaBinom_mix, self).fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=None, - full_output=True, - retall=False, - **kwds + def get_default_start_params(self): + return np.append( + 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 ) + + def get_ext_param_name(): + return "tau" - # NB specific to nm (Nelder-Mead) optimization. - niter = result.mle_retvals["iterations"] - - logger.info(f"Finished Weighted_BetaBinom_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") - - return result - - + def __post_init__(self): + assert self.tumor_prop is not None, "Tumor proportion must be defined." + class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): def __init__(self, endog, exog, tau, weights, exposure, **kwds): super(Weighted_BetaBinom_fixdispersion, self).__init__(endog, exog, **kwds) @@ -456,48 +349,3 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): logger.info(f"Finished Weighted_BetaBinom_fixdispersion_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") return result - -# DEPRECATE -class BAF_Binom(GenericLikelihoodModel): - """ - Binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. - This function fits the BetaBin params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) - - Attributes - ---------- - endog : array, (n_samples,) - Y values. - - exog : array, (n_samples, n_features) - Design matrix. - - weights : array, (n_samples,) - Sample weights. - - exposure : array, (n_samples,) - Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
- """ - def __init__(self, endog, exog, weights, exposure, offset, scaling, **kwds): - super(BAF_Binom, self).__init__(endog, exog, **kwds) - - self.weights = weights - self.exposure = exposure - self.offset = offset - self.scaling = scaling - - def nloglikeobs(self, params): - linear_term = self.exog @ params - p = self.scaling / (1 + np.exp(-linear_term + self.offset)) - - return -scipy.stats.binom.logpmf(self.endog, self.exposure, p).dot(self.weights) - - def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): - if start_params is None: - if hasattr(self, "start_params"): - start_params = self.start_params - else: - start_params = 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams) - - return super(BAF_Binom, self).fit( - start_params=start_params, maxiter=maxiter, maxfun=maxfun, **kwds - ) From 2bd629dd7917b6226e6399a6df5a4b50b4b60063 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:17:44 -0400 Subject: [PATCH 094/125] abc for emission models --- src/calicost/utils_distribution_fitting.py | 262 ++++++++++----------- 1 file changed, 128 insertions(+), 134 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 6551ee1..1675f91 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -5,6 +5,7 @@ import os import sys import time +from abc import ABC, abstractmethod import numpy as np import scipy @@ -18,7 +19,6 @@ from sklearn import cluster from sklearn.utils import check_random_state from statsmodels.base.model import GenericLikelihoodModel -from abc import ABC, abstractmethod logger = logging.getLogger(__name__) @@ -30,6 +30,7 @@ os.environ["OPENBLAS_NUM_THREADS"] = num_threads os.environ["OMP_NUM_THREADS"] = num_threads + def convert_params(mean, std): """ Convert mean/dispersion parameterization of a negative binomial to the ones scipy supports @@ -45,46 +46,49 @@ def convert_params(mean, std): @contextlib.contextmanager def save_stdout(fpath): original = sys.stdout - + with open(fpath, "w") as ff: sys.stdout = ff + try: yield - # NB teardown finally: sys.stdout = original + class WeightedModel(GenericLikelihoodModel, ABC): """ An ABC for defined emission models. - Attributes - ---------- - endog : array, (n_samples,) - Y values. - exog : array, (n_samples, n_features) - Design matrix. - - weights : array, (n_samples,) - Sample weights. - exposure : array, (n_samples,) + Attributes ---------- + endog : array, (n_samples,) Y values. + exog : array, (n_samples, n_features) + Design matrix. + weights : array, (n_samples,) + Sample weights. + exposure : array, (n_samples,) Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ - def __init__(self, endog, exog, weights, exposure, tumor_prop=None, seed=0, **kwds): - super(WeightedModel, self).__init__(endog, exog, **kwds) - + + def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): + super().__init__(endog, exog, **kwargs) + + # NB unpack a single additional positional argument as tumor_proportion. 
+ self.tumor_prop = args if len(args) == 1 else None + self.weights = weights self.exposure = exposure # NB Weight_BetaBinomial does not specify seed self.seed = seed - self.tumor_prop = tumor_prop self.__post_init__() - - logger.info(f"Initializing {self.__class__.__name__} model for endog.shape = {endog.shape}.") - + + logger.info( + f"Initializing {self.__class__.__name__} model for endog.shape = {endog.shape}." + ) + @abstractmethod def nloglikeobs(self, params): pass @@ -97,26 +101,23 @@ def get_default_start_params(self): def get_ext_param_name(self): pass - @classmethod @abstractmethod - def get_ninstance(cls): + def get_ninstance(self): pass - + @abstractmethod def __post_init__(self): # NB will increment the instance count for each derived class. pass - - def callback(self, params): - nloglike = self.nloglikeobs(params) - print(f"{params} {nloglike};") - - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): + def __callback__(self, params): + print(f"{params} {self.nloglikeobs(params)};") + + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): ext_param_name = self.get_ext_param_name() self.exog_names.append(ext_param_name) - + if start_params is None: if hasattr(self, "start_params"): start_params = self.start_params @@ -127,40 +128,49 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): else: start_params_str = "input" - logger.info(f"Starting {self.__class__.__name__} optimization @ ({start_params_str}) {start_params}.") - + logger.info( + f"Starting {self.__class__.__name__} optimization @ ({start_params_str}) {start_params}." + ) + start = time.time() - result = super(Weighted_Model, self).fit( + result = super().fit( start_params=start_params, maxiter=maxiter, maxfun=maxfun, skip_hessian=True, - callback=self.callback, + callback=self.__callback__, full_output=True, retall=True, disp=False, - **kwds + **kwargs, ) - # NB specific to nm (Nelder-Mead) optimization. + # NB specific to nm (Nelder-Mead) optimization. niter = result.mle_retvals["iterations"] - logger.info(f"Finished {self.__class__.__name__} optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + logger.info( + f"Finished {self.__class__.__name__} optimization in {time.time() - start:.2f} seconds, with {niter} iterations." + ) return result + class Weighted_NegativeBinomial(WeightedModel): """ - Negative Binomial model endog ~ NB(exposure * exp(exog @ params[:-1]), params[-1]), where exog is the design matrix, and params[-1] is 1 / overdispersion. - This function fits the NB params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) + Negative Binomial model endog ~ NB(exposure * exp(exog @ params[:-1]), params[-1]), + where exog is the design matrix, and params[-1] is 1 / overdispersion. 
This function + fits the NB params when samples are weighted by weights: + + max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ + ninstance = 0 - + def nloglikeobs(self, params): nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) - + n, p = convert_params(nb_mean, nb_std) return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) @@ -172,25 +182,27 @@ def get_ext_param_name(): return "alpha" def __post_init__(self): - Weighted_NegativeBinomial.ninstance += 1 - + pass + @classmethod def get_ninstance(cls): return cls.ninstance - + + class Weighted_NegativeBinomial_mix(WeightedModel): ninstance = 0 - + def nloglikeobs(self, params): nb_mean = self.exposure * ( self.tumor_prop * np.exp(self.exog @ params[:-1]) + 1 - self.tumor_prop ) + nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) n, p = convert_params(nb_mean, nb_std) return -scipy.stats.nbinom.logpmf(self.endog, n, p).dot(self.weights) - + def get_default_start_params(self): return np.append(0.1 * np.ones(self.nparams), 0.01) @@ -199,29 +211,37 @@ def get_ext_param_name(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." - + + class Weighted_BetaBinom(WeightedModel): """ - Beta-binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. - This function fits the BetaBin params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) + Beta-binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), + where p = exog @ params[:-1] and tau = params[-1]. This function fits the + BetaBin params when samples are weighted by weights: + + max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ + ninstance = 0 - + def nloglikeobs(self, params): a = (self.exog @ params[:-1]) * params[-1] - b = (1. - self.exog @ params[:-1]) * params[-1] + b = (1.0 - self.exog @ params[:-1]) * params[-1] - return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot( + self.weights + ) def get_default_start_params(self): - return np.append( - 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 - ) + return np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) def get_ext_param_name(): return "tau" - + + class Weighted_BetaBinom_mix(WeightedModel_mix): + ninstance = 0 + def nloglikeobs(self, params): a = ( self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop) @@ -232,79 +252,78 @@ def nloglikeobs(self, params): + 0.5 * (1 - self.tumor_prop) ) * params[-1] - return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot( + self.weights + ) def get_default_start_params(self): - return np.append( - 0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1 - ) - + return np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) + def get_ext_param_name(): return "tau" - def __post_init__(self): + def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." 
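A worked illustration of the mixture adjustment in Weighted_BetaBinom_mix.nloglikeobs above: the clone BAF p = exog @ params[:-1] is pulled toward 0.5 in proportion to the normal fraction (1 - tumor_prop), then converted to Beta-Binomial (a, b) with dispersion tau = params[-1]; writing b = (1 - p_eff) * tau is algebraically the same as the expression in the method. All numbers below are made up for illustration.

    import numpy as np
    import scipy.stats

    # toy pseudobulk: endog = B-allele counts, exposure = SNP-covering UMIs per bin
    endog = np.array([12, 30, 25, 8])
    exposure = np.array([40, 60, 50, 30])
    weights = np.ones(4)
    tumor_prop = np.full(4, 0.7)
    exog = np.ones((4, 1))           # single-state design matrix
    params = np.array([0.2, 30.0])   # [p_binom, tau]

    # effective BAF mixes the clone BAF with the 0.5 contributed by normal cells
    p_eff = (exog @ params[:-1]) * tumor_prop + 0.5 * (1.0 - tumor_prop)
    a = p_eff * params[-1]
    b = (1.0 - p_eff) * params[-1]

    nll = -scipy.stats.betabinom.logpmf(endog, exposure, a, b).dot(weights)

At tumor_prop = 1 this reduces to the pure Weighted_BetaBinom emission, and at tumor_prop = 0 every bin is modelled as balanced at BAF 0.5.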
- -class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): - def __init__(self, endog, exog, tau, weights, exposure, **kwds): - super(Weighted_BetaBinom_fixdispersion, self).__init__(endog, exog, **kwds) - logger.info(f"Initializing Weighted_BetaBinom_fixdispersion model for endog.shape = {endog.shape}.") + +class Weighted_BetaBinom_fixdispersion(WeightedModel): + ninstance = 0 + + # NB custom __init__ required to handle tau. + def __init__(self, endog, exog, tau, weights, exposure, *args, seed=0, **kwargs): + super().__init__(endog, exog, **kwargs) + + # NB unpack a single additional positional argument as tumor_proportion. + self.tumor_prop = args if len(args) == 1 else None self.tau = tau self.weights = weights self.exposure = exposure - def nloglikeobs(self, params): - a = (self.exog @ params) * self.tau - b = (1 - self.exog @ params) * self.tau + # NB Weighted_BetaBinom_fixdispersion does not specify seed previously. + self.seed = seed - return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) + self.__post_init__() - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): - if start_params is None: - if hasattr(self, "start_params"): - start_params = self.start_params - start_params_str = "existing" - else: - start_params = 0.1 * np.ones(self.nparams) - start_params_str = "default" - else: - start_params_str = "input" - - logger.info(f"Starting Weighted_BetaBinom_fixdispersion optimization @ ({start_params_str}) {start_params}.") + logger.info( + f"Initializing {self.__class__.__name__} model for endog.shape = {endog.shape}." + ) - start = time.time() + def nloglikeobs(self, params): + a = (self.exog @ params) * self.tau + b = (1 - self.exog @ params) * self.tau - result = super(Weighted_BetaBinom_fixdispersion, self).fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=None, - full_output=True, - retall=False, - **kwds + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot( + self.weights ) - # NB specific to nm (Nelder-Mead) optimization. - niter = result.mle_retvals["iterations"] - - logger.info(f"Finished Weighted_BetaBinom_fixdispersion optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + def get_default_start_params(self): + return 0.1 * np.ones(self.nparams) - return result + def __post_init__(self): + pass -class Weighted_BetaBinom_fixdispersion_mix(GenericLikelihoodModel): - def __init__(self, endog, exog, tau, weights, exposure, tumor_prop, **kwds): - super(Weighted_BetaBinom_fixdispersion_mix, self).__init__(endog, exog, **kwds) +class Weighted_BetaBinom_fixdispersion_mix(WeightedModel): + # NB custom __init__ required to handle tau. + def __init__(self, endog, exog, tau, weights, exposure, *args, seed=0, **kwargs): + super().__init__(endog, exog, **kwargs) - logger.info(f"Initializing Weighted_BetaBinom_fixdispersion_mix model for endog.shape = {endog.shape}.") + # NB unpack a single additional positional argument as tumor_proportion. + self.tumor_prop = args if len(args) == 1 else None self.tau = tau self.weights = weights self.exposure = exposure - self.tumor_prop = tumor_prop + + # NB Weighted_BetaBinom_fixdispersion does not specify seed previously. + self.seed = seed + + self.__post_init__() + + logger.info( + f"Initializing {self.__class__.__name__} model for endog.shape = {endog.shape}." 
+ ) def nloglikeobs(self, params): a = ( @@ -315,37 +334,12 @@ def nloglikeobs(self, params): (1 - self.exog @ params) * self.tumor_prop + 0.5 * (1 - self.tumor_prop) ) * self.tau - return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot(self.weights) - - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwds): - if start_params is None: - if hasattr(self, "start_params"): - start_params = self.start_params - start_params_str = "existing" - else: - start_params = 0.1 * np.ones(self.nparams) - start_params_str = "default" - else: - start_params_str = "input" - - logger.info(f"Starting Weighted_BetaBinom_fixdispersion_mix optimization @ ({start_params_str}) {start_params}.") - - start = time.time() - - result = super(Weighted_BetaBinom_fixdispersion_mix, self).fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=None, - full_output=True, - retall=False, - **kwds + return -scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b).dot( + self.weights ) - # NB specific to nm (Nelder-Mead) optimization. - niter = result.mle_retvals["iterations"] - - logger.info(f"Finished Weighted_BetaBinom_fixdispersion_mix optimization in {time.time() - start:.2f} seconds, with {niter} iterations.") + def get_default_start_params(self): + return 0.1 * np.ones(self.nparams) - return result + def __post_init__(self): + assert self.tumor_prop is not None, "Tumor proportion must be defined." From d4a059968dbe96e5bcf9d430515585a7b612aca6 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:19:33 -0400 Subject: [PATCH 095/125] fix --- src/calicost/utils_distribution_fitting.py | 23 +++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 1675f91..9885dbe 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -164,7 +164,6 @@ class Weighted_NegativeBinomial(WeightedModel): max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ - ninstance = 0 def nloglikeobs(self, params): @@ -212,6 +211,10 @@ def get_ext_param_name(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." + @classmethod + def get_ninstance(cls): + return cls.ninstance + class Weighted_BetaBinom(WeightedModel): """ @@ -238,6 +241,13 @@ def get_default_start_params(self): def get_ext_param_name(): return "tau" + def __post_init__(self): + pass + + @classmethod + def get_ninstance(cls): + return cls.ninstance + class Weighted_BetaBinom_mix(WeightedModel_mix): ninstance = 0 @@ -265,6 +275,9 @@ def get_ext_param_name(): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." + @classmethod + def get_ninstance(cls): + return cls.ninstance class Weighted_BetaBinom_fixdispersion(WeightedModel): ninstance = 0 @@ -303,6 +316,10 @@ def get_default_start_params(self): def __post_init__(self): pass + @classmethod + def get_ninstance(cls): + return cls.ninstance + class Weighted_BetaBinom_fixdispersion_mix(WeightedModel): # NB custom __init__ required to handle tau. @@ -343,3 +360,7 @@ def get_default_start_params(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." + + @classmethod + def get_ninstance(cls): + return cls.ninstance From 431936c3798f3dcf570807678ce8d7814ad2c0da Mon Sep 17 00:00:00 2001 From: "Michael J. 
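Weighted_NegativeBinomial, shown in context here, parameterises the negative binomial by its mean (exposure * exp(exog @ params[:-1])) and a dispersion parameter params[-1] that enters the variance as mean + params[-1] * mean**2, then hands scipy an (n, p) pair via convert_params. The body of convert_params is not included in this hunk; the helper below uses the standard moment relations p = mean / var and n = mean * p / (1 - p), a plausible equivalent rather than the package's exact code.

    import numpy as np
    import scipy.stats

    def convert_params(mean, std):
        # Moment conversion to scipy's nbinom parameterisation (assumed form).
        var = std**2
        p = mean / var
        n = mean * p / (1.0 - p)
        return n, p

    # toy values: one bin with an exposure-scaled mean and overdispersion alpha
    exposure = np.array([1000.0])
    log_mu = np.array([0.1])
    alpha = 0.01

    nb_mean = np.exp(log_mu) * exposure
    nb_std = np.sqrt(nb_mean + alpha * nb_mean**2)

    n, p = convert_params(nb_mean, nb_std)
    loglik = scipy.stats.nbinom.logpmf(np.array([1100]), n, p)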
Wilson" Date: Mon, 26 Aug 2024 10:22:47 -0400 Subject: [PATCH 096/125] fixes --- src/calicost/utils_distribution_fitting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 9885dbe..ccd6828 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -213,7 +213,7 @@ def __post_init__(self): @classmethod def get_ninstance(cls): - return cls.ninstance + return cls.ninstance class Weighted_BetaBinom(WeightedModel): @@ -246,10 +246,10 @@ def __post_init__(self): @classmethod def get_ninstance(cls): - return cls.ninstance + return cls.ninstance -class Weighted_BetaBinom_mix(WeightedModel_mix): +class Weighted_BetaBinom_mix(WeightedModel): ninstance = 0 def nloglikeobs(self, params): From bed12f19ca1beb1f2e0cc06f60a8abfeddb31b6e Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:29:41 -0400 Subject: [PATCH 097/125] fix --- src/calicost/utils_distribution_fitting.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index ccd6828..0e98b66 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -110,6 +110,11 @@ def __post_init__(self): # NB will increment the instance count for each derived class. pass + @classmethod + @abstractmethod + def get_ninstance(cls): + pass + def __callback__(self, params): print(f"{params} {self.nloglikeobs(params)};") @@ -123,7 +128,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): start_params = self.start_params start_params_str = "existing" else: - start_params = self.default_start_params() + start_params = self.get_default_start_params() start_params_str = "default" else: start_params_str = "input" @@ -238,7 +243,7 @@ def nloglikeobs(self, params): def get_default_start_params(self): return np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) - def get_ext_param_name(): + def get_ext_param_name(self): return "tau" def __post_init__(self): From ef27f84e979b1966809e621d0aa02c9fc38121cc Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:39:52 -0400 Subject: [PATCH 098/125] fix chain logging --- src/calicost/utils_distribution_fitting.py | 87 ++++++++++++++-------- 1 file changed, 54 insertions(+), 33 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 0e98b66..f65b84a 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -110,9 +110,8 @@ def __post_init__(self): # NB will increment the instance count for each derived class. 
pass - @classmethod @abstractmethod - def get_ninstance(cls): + def get_ninstance(self): pass def __callback__(self, params): @@ -139,17 +138,43 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): start = time.time() - result = super().fit( - start_params=start_params, - maxiter=maxiter, - maxfun=maxfun, - skip_hessian=True, - callback=self.__callback__, - full_output=True, - retall=True, - disp=False, - **kwargs, - ) + # NB kwargs = {'xtol': 0.0001, 'ftol': 0.0001, disp: False} + kwargs.pop("disp", None) + + tmp_path = f"{self.__class__.__name__.lower()}_chain.tmp" + + # TODO mkdir chains + ninst = self.get_ninstance() + final_path = f"chains/{self.__class__.__name__.lower()}_chain_{ninst}.txt" + + with save_stdout(tmp_path): + result = super().fit( + start_params=start_params, + maxiter=maxiter, + maxfun=maxfun, + skip_hessian=True, + callback=self.__callback__, + full_output=True, + retall=True, + disp=False, + **kwargs, + ) + + with open(tmp_path) as fin: + with open(final_path, "w") as fout: + fout.write(f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n") + fout.write( + f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," + + ",".join( + f"{key}:{value}" for key, value in result.mle_retvals.items() + ) + + "\n" + ) + + for line in fin: + fout.write(line) + + os.remove(tmp_path) # NB specific to nm (Nelder-Mead) optimization. niter = result.mle_retvals["iterations"] @@ -169,6 +194,7 @@ class Weighted_NegativeBinomial(WeightedModel): max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ + ninstance = 0 def nloglikeobs(self, params): @@ -188,9 +214,8 @@ def get_ext_param_name(): def __post_init__(self): pass - @classmethod - def get_ninstance(cls): - return cls.ninstance + def get_ninstance(self): + return self.ninstance class Weighted_NegativeBinomial_mix(WeightedModel): @@ -216,10 +241,9 @@ def get_ext_param_name(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." - @classmethod - def get_ninstance(cls): - return cls.ninstance - + def get_ninstance(self): + return self.ninstance + class Weighted_BetaBinom(WeightedModel): """ @@ -248,10 +272,9 @@ def get_ext_param_name(self): def __post_init__(self): pass - - @classmethod - def get_ninstance(cls): - return cls.ninstance + + def get_ninstance(self): + return self.ninstance class Weighted_BetaBinom_mix(WeightedModel): @@ -280,9 +303,9 @@ def get_ext_param_name(): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." - @classmethod - def get_ninstance(cls): - return cls.ninstance + def get_ninstance(self): + return self.ninstance + class Weighted_BetaBinom_fixdispersion(WeightedModel): ninstance = 0 @@ -321,9 +344,8 @@ def get_default_start_params(self): def __post_init__(self): pass - @classmethod - def get_ninstance(cls): - return cls.ninstance + def get_ninstance(self): + return self.ninstance class Weighted_BetaBinom_fixdispersion_mix(WeightedModel): @@ -366,6 +388,5 @@ def get_default_start_params(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." - @classmethod - def get_ninstance(cls): - return cls.ninstance + def get_ninstance(self): + return self.ninstance From 22bceb7006eb05d27d422f3853b1038277c26515 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 10:42:34 -0400 Subject: [PATCH 099/125] fix --- src/calicost/utils_distribution_fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index f65b84a..f3ea3e7 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -86,7 +86,7 @@ def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): self.__post_init__() logger.info( - f"Initializing {self.__class__.__name__} model for endog.shape = {endog.shape}." + f"Initializing {self.get_ninstance()}th instance of {self.__class__.__name__} model for endog.shape = {endog.shape}." ) @abstractmethod From 5787061df5120e7e08dcb996ca2853ce390ac806 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:48:43 -0400 Subject: [PATCH 100/125] gzip chains --- src/calicost/utils_distribution_fitting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index f3ea3e7..1a5c5a8 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -1,5 +1,6 @@ import contextlib import functools +import gzip import inspect import logging import os @@ -161,7 +162,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): ) with open(tmp_path) as fin: - with open(final_path, "w") as fout: + with gzip.open(final_path, "wt") as fout: fout.write(f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n") fout.write( f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," From 16f10e1189afe7c6795336bf59a86460ea02638e Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:52:00 -0400 Subject: [PATCH 101/125] fix --- src/calicost/utils_distribution_fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 1a5c5a8..25e9186 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -146,7 +146,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): # TODO mkdir chains ninst = self.get_ninstance() - final_path = f"chains/{self.__class__.__name__.lower()}_chain_{ninst}.txt" + final_path = f"chains/{self.__class__.__name__.lower()}_chain_{ninst}.txt.gzip" with save_stdout(tmp_path): result = super().fit( From 0424169b7abed422eb8b7501dfcb0ec65cddf6dd Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 10:53:06 -0400 Subject: [PATCH 102/125] fix --- src/calicost/utils_distribution_fitting.py | 25 +++++++++++----------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 25e9186..58eb357 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -118,7 +118,7 @@ def get_ninstance(self): def __callback__(self, params): print(f"{params} {self.nloglikeobs(params)};") - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): + def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, **kwargs): ext_param_name = self.get_ext_param_name() self.exog_names.append(ext_param_name) @@ -161,19 +161,20 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, **kwargs): **kwargs, ) - with open(tmp_path) as fin: - with gzip.open(final_path, "wt") as fout: - fout.write(f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n") - fout.write( - f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," - + ",".join( - f"{key}:{value}" for key, value in result.mle_retvals.items() + if write_chain: + with open(tmp_path) as fin: + with gzip.open(final_path, "wt") as fout: + fout.write(f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n") + fout.write( + f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," + + ",".join( + f"{key}:{value}" for key, value in result.mle_retvals.items() + ) + + "\n" ) - + "\n" - ) - for line in fin: - fout.write(line) + for line in fin: + fout.write(line) os.remove(tmp_path) From 207e873a745c46ff55a012d1bf066d2e3011419b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 10:55:49 -0400 Subject: [PATCH 103/125] update instance counts. --- src/calicost/utils_distribution_fitting.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 58eb357..8fc9cc8 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -214,7 +214,7 @@ def get_ext_param_name(): return "alpha" def __post_init__(self): - pass + Weighted_NegativeBinomial.ninstance += 1 def get_ninstance(self): return self.ninstance @@ -243,6 +243,8 @@ def get_ext_param_name(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." + Weighted_NegativeBinomial_mix.ninstance + def get_ninstance(self): return self.ninstance @@ -255,7 +257,6 @@ class Weighted_BetaBinom(WeightedModel): max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ - ninstance = 0 def nloglikeobs(self, params): @@ -273,7 +274,7 @@ def get_ext_param_name(self): return "tau" def __post_init__(self): - pass + Weighted_BetaBinom.ninstance += 1 def get_ninstance(self): return self.ninstance @@ -305,6 +306,8 @@ def get_ext_param_name(): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." 
+ Weighted_BetaBinom_mix.ninstance += 1 + def get_ninstance(self): return self.ninstance @@ -344,7 +347,7 @@ def get_default_start_params(self): return 0.1 * np.ones(self.nparams) def __post_init__(self): - pass + Weighted_BetaBinom_fixdispersion.ninstance += 1 def get_ninstance(self): return self.ninstance @@ -390,5 +393,7 @@ def get_default_start_params(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." + Weighted_BetaBinom_fixdispersion_mix.ninstance += 1 + def get_ninstance(self): return self.ninstance From f945854396c54e52f4ff956733df49e7fc1a36d6 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:02:05 -0400 Subject: [PATCH 104/125] fix --- src/calicost/utils_distribution_fitting.py | 29 ++++++++++++++++------ 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 8fc9cc8..4c9f548 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -62,7 +62,8 @@ class WeightedModel(GenericLikelihoodModel, ABC): """ An ABC for defined emission models. - Attributes ---------- + Attributes + ---------- endog : array, (n_samples,) Y values. exog : array, (n_samples, n_features) Design matrix. @@ -71,7 +72,6 @@ class WeightedModel(GenericLikelihoodModel, ABC): exposure : array, (n_samples,) Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ - def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): super().__init__(endog, exog, **kwargs) @@ -84,6 +84,7 @@ def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): # NB Weight_BetaBinomial does not specify seed self.seed = seed + # NB __pos_init__ validates the expected tumor proportion and handles incrementing instance count. self.__post_init__() logger.info( @@ -92,6 +93,9 @@ def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): @abstractmethod def nloglikeobs(self, params): + """ + Negative log-likelihood for the emission model. + """ pass @abstractmethod @@ -100,22 +104,31 @@ def get_default_start_params(self): @abstractmethod def get_ext_param_name(self): + """ + Named parameter in the model. + """ pass @abstractmethod def get_ninstance(self): + """ + Return the instance count for the given model + """ pass @abstractmethod def __post_init__(self): - # NB will increment the instance count for each derived class. - pass - - @abstractmethod - def get_ninstance(self): + """ + Validation and customisation for the derived class. + E.g. validate the tumor_proportion and increment the instance + count of the derived class. + """ pass def __callback__(self, params): + """ + Define callback for writing parameter chain to file. + """ print(f"{params} {self.nloglikeobs(params)};") def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, **kwargs): @@ -393,7 +406,7 @@ def get_default_start_params(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." - Weighted_BetaBinom_fixdispersion_mix.ninstance += 1 + Weighted_BetaBinom_fixdispersion_mix.ninstance += 1 def get_ninstance(self): return self.ninstance From 16132593e973ff8c78feb87a7e4049c77aa17436 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 11:06:34 -0400 Subject: [PATCH 105/125] fix --- src/calicost/utils_distribution_fitting.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 4c9f548..2e21437 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -174,12 +174,20 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, **kwargs, ) + # NB specific to nm (Nelder-Mead) optimization. + niter = result.mle_retvals["iterations"] + runtime = time.time() - start + + logger.info( + f"Finished {self.__class__.__name__} optimization in {runtime:.2f} seconds, with {niter} iterations." + ) + if write_chain: with open(tmp_path) as fin: with gzip.open(final_path, "wt") as fout: fout.write(f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n") fout.write( - f"# start_type:{start_params_str},shape:{self.endog.shape[0]}," + f"# start_type:{start_params_str},runtime:{runtime},shape:{self.endog.shape[0]}," + ",".join( f"{key}:{value}" for key, value in result.mle_retvals.items() ) @@ -190,14 +198,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, fout.write(line) os.remove(tmp_path) - - # NB specific to nm (Nelder-Mead) optimization. - niter = result.mle_retvals["iterations"] - - logger.info( - f"Finished {self.__class__.__name__} optimization in {time.time() - start:.2f} seconds, with {niter} iterations." - ) - + return result From 2218ebf33227b9ccf09b1de5a8e9c9121099419b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:11:35 -0400 Subject: [PATCH 106/125] fix --- src/calicost/utils_distribution_fitting.py | 75 +++++++++++----------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 2e21437..9998f8f 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -46,6 +46,9 @@ def convert_params(mean, std): @contextlib.contextmanager def save_stdout(fpath): + """ + Context manager to write stdout to fpath. + """ original = sys.stdout with open(fpath, "w") as ff: @@ -62,7 +65,7 @@ class WeightedModel(GenericLikelihoodModel, ABC): """ An ABC for defined emission models. - Attributes + Attributes ---------- endog : array, (n_samples,) Y values. exog : array, (n_samples, n_features) @@ -72,6 +75,7 @@ class WeightedModel(GenericLikelihoodModel, ABC): exposure : array, (n_samples,) Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ + def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): super().__init__(endog, exog, **kwargs) @@ -84,7 +88,7 @@ def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): # NB Weight_BetaBinomial does not specify seed self.seed = seed - # NB __pos_init__ validates the expected tumor proportion and handles incrementing instance count. + # NB __post_init__ validates the expected tumor proportion and handles incrementing instance count. 
self.__post_init__() logger.info( @@ -109,13 +113,6 @@ def get_ext_param_name(self): """ pass - @abstractmethod - def get_ninstance(self): - """ - Return the instance count for the given model - """ - pass - @abstractmethod def __post_init__(self): """ @@ -125,13 +122,26 @@ def __post_init__(self): """ pass + def get_ninstance(self): + """ + Return the instance count for the given model + """ + return self.ninstance + def __callback__(self, params): """ Define callback for writing parameter chain to file. """ print(f"{params} {self.nloglikeobs(params)};") - def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, **kwargs): + def fit( + self, + start_params=None, + maxiter=10_000, + maxfun=5_000, + write_chain=True, + **kwargs, + ): ext_param_name = self.get_ext_param_name() self.exog_names.append(ext_param_name) @@ -174,22 +184,25 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, **kwargs, ) - # NB specific to nm (Nelder-Mead) optimization. + # NB specific to nm (Nelder-Mead) optimization. niter = result.mle_retvals["iterations"] runtime = time.time() - start - + logger.info( f"Finished {self.__class__.__name__} optimization in {runtime:.2f} seconds, with {niter} iterations." ) - + if write_chain: with open(tmp_path) as fin: with gzip.open(final_path, "wt") as fout: - fout.write(f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n") + fout.write( + f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n" + ) fout.write( f"# start_type:{start_params_str},runtime:{runtime},shape:{self.endog.shape[0]}," + ",".join( - f"{key}:{value}" for key, value in result.mle_retvals.items() + f"{key}:{value}" + for key, value in result.mle_retvals.items() ) + "\n" ) @@ -198,7 +211,7 @@ def fit(self, start_params=None, maxiter=10_000, maxfun=5_000, write_chain=True, fout.write(line) os.remove(tmp_path) - + return result @@ -210,7 +223,6 @@ class Weighted_NegativeBinomial(WeightedModel): max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ - ninstance = 0 def nloglikeobs(self, params): @@ -228,11 +240,10 @@ def get_ext_param_name(): return "alpha" def __post_init__(self): + assert self.tumor_prop is None + Weighted_NegativeBinomial.ninstance += 1 - def get_ninstance(self): - return self.ninstance - class Weighted_NegativeBinomial_mix(WeightedModel): ninstance = 0 @@ -258,9 +269,6 @@ def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." Weighted_NegativeBinomial_mix.ninstance - - def get_ninstance(self): - return self.ninstance class Weighted_BetaBinom(WeightedModel): @@ -271,6 +279,7 @@ class Weighted_BetaBinom(WeightedModel): max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) """ + ninstance = 0 def nloglikeobs(self, params): @@ -288,11 +297,10 @@ def get_ext_param_name(self): return "tau" def __post_init__(self): + assert self.tumor_prop is None + Weighted_BetaBinom.ninstance += 1 - def get_ninstance(self): - return self.ninstance - class Weighted_BetaBinom_mix(WeightedModel): ninstance = 0 @@ -320,10 +328,7 @@ def get_ext_param_name(): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." 
- Weighted_BetaBinom_mix.ninstance += 1 - - def get_ninstance(self): - return self.ninstance + Weighted_BetaBinom_mix.ninstance += 1 class Weighted_BetaBinom_fixdispersion(WeightedModel): @@ -361,10 +366,9 @@ def get_default_start_params(self): return 0.1 * np.ones(self.nparams) def __post_init__(self): - Weighted_BetaBinom_fixdispersion.ninstance += 1 - - def get_ninstance(self): - return self.ninstance + assert self.tumor_prop is None + + Weighted_BetaBinom_fixdispersion.ninstance += 1 class Weighted_BetaBinom_fixdispersion_mix(WeightedModel): @@ -408,6 +412,3 @@ def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." Weighted_BetaBinom_fixdispersion_mix.ninstance += 1 - - def get_ninstance(self): - return self.ninstance From 37eee031d2e2eb4a58161385ee91fd00142de77b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:13:06 -0400 Subject: [PATCH 107/125] fix --- src/calicost/utils_distribution_fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 9998f8f..0345e4a 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -199,7 +199,7 @@ def fit( f"# {self.__class__.__name__} {ninst} @ {time.asctime()}\n" ) fout.write( - f"# start_type:{start_params_str},runtime:{runtime},shape:{self.endog.shape[0]}," + f"# start_type:{start_params_str},runtime:{runtime:.6f},shape:{self.endog.shape[0]}," + ",".join( f"{key}:{value}" for key, value in result.mle_retvals.items() From 9591ccc720a1bc307a81a61b4ab8fcdc15a1d231 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:30:45 -0400 Subject: [PATCH 108/125] cleanup before adding ARI for HMM states. --- src/calicost/hmm_NB_BB_nophasing_v2.py | 53 ++++++++++++++++---------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index eb6bfd5..7f5c62a 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -78,6 +78,7 @@ def compute_emission_probability_nb_betabinom( n_comp = X.shape[1] n_spots = X.shape[2] n_states = log_mu.shape[0] + # initialize log_emission log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) log_emission_baf = np.zeros((n_states, n_obs, n_spots)) @@ -335,48 +336,53 @@ def run_baum_welch_nb_bb( log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. alpha: size of n_states. Dispersioon parameter of each HMM state. 
""" - n_obs = X.shape[0] - n_comp = X.shape[1] - n_spots = X.shape[2] - + n_obs, n_comp, n_spots = X.shape + assert n_comp == 2 - + log_mu = ( np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu ) - + p_binom = ( np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom ) - + # NB initialize (inverse of) dispersion param in NB and BetaBinom alphas = ( 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas ) - + taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus - use_defaults = (init_log_mu is None) and (init_p_binom is None) and (init_alphas is None) and (init_taus is None) + use_defaults = ( + (init_log_mu is None) + and (init_p_binom is None) + and (init_alphas is None) + and (init_taus is None) + ) logger.info(f"Initial alphas:\n{alphas}") logger.info(f"Initial taus:\n{taus}") - - logger.info(f"Initialized Baum Welch NB logmean shift, BetaBinom prob and dispersion params inverse (use_defaults = {use_defaults}).") - + + logger.info( + f"Initialized Baum Welch NB logmean shift, BetaBinom prob and dispersion params inverse (use_defaults = {use_defaults})." + ) + # NB initialize start probability and emission probability log_startprob = np.log(np.ones(n_states) / n_states) - + if n_states > 1: transmat = np.ones((n_states, n_states)) * (1 - self.t) / (n_states - 1) np.fill_diagonal(transmat, self.t) log_transmat = np.log(transmat) else: log_transmat = np.zeros((1, 1)) - + log_gamma = kwargs["log_gamma"] if "log_gamma" in kwargs else None # NB a trick to speed up BetaBinom optimization: taking only unique @@ -398,11 +404,11 @@ def run_baum_welch_nb_bb( X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus ) ) - log_emission = log_emission_rdr + log_emission_baf else: # compute mu as adjusted RDR if ((not log_gamma is None) or (r > 0)) and ("m" in self.params): logmu_shift = [] + for c in range(len(kwargs["sample_length"])): this_pred_cnv = ( np.argmax( @@ -451,7 +457,8 @@ def run_baum_welch_nb_bb( tumor_prop, ) ) - log_emission = log_emission_rdr + log_emission_baf + + log_emission = log_emission_rdr + log_emission_baf log_alpha = hmm_nophasing_v2.forward_lattice( lengths, @@ -482,10 +489,12 @@ def run_baum_welch_nb_bb( new_log_startprob = new_log_startprob.flatten() else: new_log_startprob = log_startprob + if "t" in self.params: new_log_transmat = update_transition_nophasing(log_xi, is_diag=is_diag) else: new_log_transmat = log_transmat + if "m" in self.params: if tumor_prop is None: new_log_mu, new_alphas = ( @@ -515,6 +524,7 @@ def run_baum_welch_nb_bb( else: new_log_mu = log_mu new_alphas = alphas + if "p" in self.params: if tumor_prop is None: new_p_binom, new_taus = ( @@ -532,6 +542,7 @@ def run_baum_welch_nb_bb( # compute mu as adjusted RDR if "m" in self.params: mu = [] + for c in range(len(kwargs["sample_length"])): this_pred_cnv = ( np.argmax( @@ -560,6 +571,7 @@ def run_baum_welch_nb_bb( ) else: weighted_tp = tumor_prop + new_p_binom, new_taus = ( update_emission_params_bb_nophasing_uniqvalues_mix( unique_values_bb, @@ -577,16 +589,17 @@ def run_baum_welch_nb_bb( new_taus = taus logger.info( - f"EM convergence metrics (v2): {np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob)))}, {np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat)))}, {np.mean(np.abs(new_log_mu - log_mu))}, {np.mean(np.abs(new_p_binom - p_binom))}" + f"EM convergence metrics (v2): 
startprob={np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob)))}, transmat={np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat)))}," + +"mu={np.mean(np.abs(new_log_mu - log_mu))}, pbinom={np.mean(np.abs(new_p_binom - p_binom))}" ) - + if ( np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat))) < tol and np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol ): break - + log_startprob = new_log_startprob log_transmat = new_log_transmat log_mu = new_log_mu @@ -598,7 +611,7 @@ def run_baum_welch_nb_bb( logger.info(f"Fitted (mu, p):\n{np.hstack([new_log_mu, new_p_binom])}") logger.info(f"Fitted (alphas, taus):\n{np.hstack([new_alphas, new_taus])}") - + return ( new_log_mu, new_alphas, From af2fa1ae5b76b7df1fc0a48b548581080b57ea0b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:32:26 -0400 Subject: [PATCH 109/125] precision on likelihood chain --- src/calicost/utils_distribution_fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 0345e4a..09e5266 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -132,7 +132,7 @@ def __callback__(self, params): """ Define callback for writing parameter chain to file. """ - print(f"{params} {self.nloglikeobs(params)};") + print(f"{params} {self.nloglikeobs(params):.6f};") def fit( self, From 8416be5afa0b8014866c62750b972d3b6ac8c06c Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:38:13 -0400 Subject: [PATCH 110/125] fix --- src/calicost/hmm_NB_BB_nophasing_v2.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 7f5c62a..71af39d 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -13,6 +13,7 @@ from calicost.utils_distribution_fitting import * from calicost.utils_hmm import * import networkx as nx +from sklearn.metrics import adjusted_rand_score logger = logging.getLogger(__name__) @@ -383,6 +384,7 @@ def run_baum_welch_nb_bb( else: log_transmat = np.zeros((1, 1)) + # NB gamma[i,t] = P(q_t = i | O, lambda), n_states * n_observations; log_gamma = kwargs["log_gamma"] if "log_gamma" in kwargs else None # NB a trick to speed up BetaBinom optimization: taking only unique @@ -476,12 +478,21 @@ def run_baum_welch_nb_bb( log_sitewise_transmat, ) - log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_nophasing( log_alpha, log_beta, log_transmat, log_emission ) + + log_gamma = compute_posterior_obs(log_alpha, log_beta) + + pred_states = np.argmax(log_gamma, axis=0) + + if last_pred_states is None: + last_pred_states = pred_states + + ari = {adjusted_rand_score(last_pred_states, pred_states)} + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + logger.info(f"Calculating M-step (v2) for iteration {r} of {max_iter}.") if "s" in self.params: From 9882e5e0d227efbecde70a0c772934a6086aad0d Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 11:41:16 -0400 Subject: [PATCH 111/125] add HMM ARI --- src/calicost/hmm_NB_BB_nophasing_v2.py | 4 +++- src/calicost/hmm_NB_BB_phaseswitch.py | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 71af39d..ebae623 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -395,7 +395,9 @@ def run_baum_welch_nb_bb( unique_values_bb, mapping_matrices_bb = construct_unique_matrix( X[:, 1, :], total_bb_RD ) - + + last_pred_states = None + for r in range(max_iter): logger.info("-" * 250) logger.info(f"Calculating E-step (v2) for iteration {r} of {max_iter}.") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index b32191b..8d7b12e 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -13,6 +13,7 @@ from calicost.utils_distribution_fitting import * from calicost.hmm_NB_BB_nophasing import * from calicost.hmm_NB_BB_nophasing_v2 import * +from sklearn.metrics import adjusted_rand_score import networkx as nx logger = logging.getLogger(__name__) @@ -439,6 +440,8 @@ def run_baum_welch_nb_bb( X[:, 1, :], total_bb_RD ) + last_pred_states = None + for r in range(max_iter): logger.info("-" * 250) logger.info( @@ -483,12 +486,21 @@ def run_baum_welch_nb_bb( log_sitewise_transmat, ) - log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_sitewise( log_alpha, log_beta, log_transmat, log_emission ) + + log_gamma = compute_posterior_obs(log_alpha, log_beta) + pred_states = np.argmax(log_gamma, axis=0) + + if last_pred_states is None: + last_pred_states = pred_states + + ari = {adjusted_rand_score(last_pred_states, pred_states)} + + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + logger.info( f"Calculating M-step (sitewise) for iteration {r} of {max_iter}." ) From 0febc15c03086df99a44c787868eef52c69f443e Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 11:43:42 -0400 Subject: [PATCH 112/125] fix --- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmm_NB_BB_phaseswitch.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index ebae623..58271ca 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -493,7 +493,7 @@ def run_baum_welch_nb_bb( ari = {adjusted_rand_score(last_pred_states, pred_states)} - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f} (first iteration burn-in).") logger.info(f"Calculating M-step (v2) for iteration {r} of {max_iter}.") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 8d7b12e..eef99a1 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -495,11 +495,11 @@ def run_baum_welch_nb_bb( pred_states = np.argmax(log_gamma, axis=0) if last_pred_states is None: - last_pred_states = pred_states + last_pred_states = pred_states ari = {adjusted_rand_score(last_pred_states, pred_states)} - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f} (first iteration burn-in).") logger.info( f"Calculating M-step (sitewise) for iteration {r} of {max_iter}." From e0c37128cf3e8b2f24a28f60498524fa9cea1b02 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:44:45 -0400 Subject: [PATCH 113/125] fix --- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmm_NB_BB_phaseswitch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 58271ca..d946a16 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -491,7 +491,7 @@ def run_baum_welch_nb_bb( if last_pred_states is None: last_pred_states = pred_states - ari = {adjusted_rand_score(last_pred_states, pred_states)} + ari = adjusted_rand_score(last_pred_states, pred_states) logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f} (first iteration burn-in).") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index eef99a1..9d2c6be 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -497,7 +497,7 @@ def run_baum_welch_nb_bb( if last_pred_states is None: last_pred_states = pred_states - ari = {adjusted_rand_score(last_pred_states, pred_states)} + ari = adjusted_rand_score(last_pred_states, pred_states) logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f} (first iteration burn-in).") From ec676e6c445c0ad38bce2361ea38539cd6b2a205 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 11:49:45 -0400 Subject: [PATCH 114/125] fix --- src/calicost/hmm_NB_BB_nophasing_v2.py | 10 +++++----- src/calicost/hmm_NB_BB_phaseswitch.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index d946a16..2fdaf29 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -488,12 +488,12 @@ def run_baum_welch_nb_bb( pred_states = np.argmax(log_gamma, axis=0) - if last_pred_states is None: - last_pred_states = pred_states - - ari = adjusted_rand_score(last_pred_states, pred_states) + if last_pred_states is not None: + ari = adjusted_rand_score(last_pred_states, pred_states) + + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f} (first iteration burn-in).") + last_pred_states = pred_states logger.info(f"Calculating M-step (v2) for iteration {r} of {max_iter}.") diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 9d2c6be..51324da 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -494,12 +494,12 @@ def run_baum_welch_nb_bb( pred_states = np.argmax(log_gamma, axis=0) - if last_pred_states is None: - last_pred_states = pred_states + if last_pred_states is not None: + ari = adjusted_rand_score(last_pred_states, pred_states) - ari = adjusted_rand_score(last_pred_states, pred_states) + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f} (first iteration burn-in).") + last_pred_states = pred_states logger.info( f"Calculating M-step (sitewise) for iteration {r} of {max_iter}." From 1b56487d85afd3837c3d2349f9f257aa7f222476 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 11:50:40 -0400 Subject: [PATCH 115/125] fix --- src/calicost/hmm_NB_BB_phaseswitch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 51324da..d0ede40 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -495,10 +495,10 @@ def run_baum_welch_nb_bb( pred_states = np.argmax(log_gamma, axis=0) if last_pred_states is not None: - ari = adjusted_rand_score(last_pred_states, pred_states) + ari = adjusted_rand_score(last_pred_states, pred_states) logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") - + last_pred_states = pred_states logger.info( From 2977647bdf8ccca90c26b1def421415b93e39733 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 11:53:23 -0400 Subject: [PATCH 116/125] fix --- src/calicost/hmm_NB_BB_nophasing.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index 0a436b9..e88ab7b 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -9,6 +9,7 @@ from tqdm import trange import statsmodels.api as sm from statsmodels.base.model import GenericLikelihoodModel +from sklearn.metrics import adjusted_rand_score import copy from calicost.utils_distribution_fitting import * from calicost.utils_hmm import * @@ -363,6 +364,8 @@ def run_baum_welch_nb_bb( X[:, 1, :], total_bb_RD ) + last_pred_states = None + for r in range(max_iter): logger.info(f"Calculating E-step for iteration {r} of {max_iter}.") @@ -404,12 +407,21 @@ def run_baum_welch_nb_bb( log_sitewise_transmat, ) - log_gamma = compute_posterior_obs(log_alpha, log_beta) - log_xi = compute_posterior_transition_nophasing( log_alpha, log_beta, log_transmat, log_emission ) + log_gamma = compute_posterior_obs(log_alpha, log_beta) + + pred_states = np.argmax(log_gamma, axis=0) + + if last_pred_states is not None: + ari = adjusted_rand_score(last_pred_states, pred_states) + + logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + + last_pred_states = pred_states + logger.info(f"Calculating M-step for iteration {r} of {max_iter}.") if "s" in self.params: From ea01b9ef349a7bcf9851ac98ac8a1c0a954da3f7 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 12:01:58 -0400 Subject: [PATCH 117/125] make parent dirs for chains --- src/calicost/utils_distribution_fitting.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 09e5266..1eaea2e 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -7,6 +7,7 @@ import sys import time from abc import ABC, abstractmethod +from pathlib import Path import numpy as np import scipy @@ -167,9 +168,13 @@ def fit( tmp_path = f"{self.__class__.__name__.lower()}_chain.tmp" - # TODO mkdir chains ninst = self.get_ninstance() - final_path = f"chains/{self.__class__.__name__.lower()}_chain_{ninst}.txt.gzip" + + # TODO mkdir chains + class_name = self.__class__.__name__.lower() + final_path = f"chains/{class_name}/{class_name}_chain_{ninst}.txt.gzip" + + Path(final_path).mkdir(parents=True, exist_ok=True) with save_stdout(tmp_path): result = super().fit( From 890d4a7f63330a27fe1e20bbd8a48605449a0c6b Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 12:05:09 -0400 Subject: [PATCH 118/125] fix --- src/calicost/utils_distribution_fitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 1eaea2e..181cc2c 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -174,7 +174,7 @@ def fit( class_name = self.__class__.__name__.lower() final_path = f"chains/{class_name}/{class_name}_chain_{ninst}.txt.gzip" - Path(final_path).mkdir(parents=True, exist_ok=True) + Path(final_path).parent.mkdir(parents=True, exist_ok=True) with save_stdout(tmp_path): result = super().fit( From 4a1039c759b2e9daa9bd11380ce388e19703c570 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 12:11:10 -0400 Subject: [PATCH 119/125] add hamming distances --- src/calicost/hmm_NB_BB_nophasing.py | 3 ++- src/calicost/hmm_NB_BB_nophasing_v2.py | 5 +++-- src/calicost/hmm_NB_BB_phaseswitch.py | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index e88ab7b..541acca 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -417,8 +417,9 @@ def run_baum_welch_nb_bb( if last_pred_states is not None: ari = adjusted_rand_score(last_pred_states, pred_states) + hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f}.") last_pred_states = pred_states diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 2fdaf29..9a1b9be 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -490,8 +490,9 @@ def run_baum_welch_nb_bb( if last_pred_states is not None: ari = adjusted_rand_score(last_pred_states, pred_states) - - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + hamm = sum(last_pred_states != pred_states) + + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f}.") last_pred_states = pred_states diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index d0ede40..1e4c34e 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -496,8 +496,9 @@ def run_baum_welch_nb_bb( if last_pred_states is not None: ari = adjusted_rand_score(last_pred_states, pred_states) + hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden State (v2) ARI for iteration {r} = {ari:.6f}.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f}.") last_pred_states = pred_states From ec8881b3e86f10844f3afa90d7b0e6b94d264edc Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 13:53:57 -0400 Subject: [PATCH 120/125] fixes --- src/calicost/utils_distribution_fitting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 181cc2c..52b4f5c 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -77,11 +77,11 @@ class WeightedModel(GenericLikelihoodModel, ABC): Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. """ - def __init__(self, endog, exog, weights, exposure, *args, seed=0, **kwargs): + def __init__(self, endog, exog, weights, exposure, tumor_prop=None, seed=0, **kwargs): super().__init__(endog, exog, **kwargs) # NB unpack a single additional positional argument as tumor_proportion. 
- self.tumor_prop = args if len(args) == 1 else None + self.tumor_prop = tumor_prop self.weights = weights self.exposure = exposure @@ -241,7 +241,7 @@ def nloglikeobs(self, params): def get_default_start_params(self): return np.append(0.1 * np.ones(self.exog.shape[1]), 0.01) - def get_ext_param_name(): + def get_ext_param_name(self): return "alpha" def __post_init__(self): @@ -327,7 +327,7 @@ def nloglikeobs(self, params): def get_default_start_params(self): return np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) - def get_ext_param_name(): + def get_ext_param_name(self): return "tau" def __post_init__(self): From d883c3497e2e93375651a3b62d1b468636be2dd4 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 13:57:17 -0400 Subject: [PATCH 121/125] add # of states for hamming. --- src/calicost/hmm_NB_BB_nophasing.py | 2 +- src/calicost/hmm_NB_BB_nophasing_v2.py | 4 ++-- src/calicost/hmm_NB_BB_phaseswitch.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index 541acca..ce93881 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -419,7 +419,7 @@ def run_baum_welch_nb_bb( ari = adjusted_rand_score(last_pred_states, pred_states) hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f}.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(log_gamma)} states.") last_pred_states = pred_states diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index 9a1b9be..a16b8dc 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -492,7 +492,7 @@ def run_baum_welch_nb_bb( ari = adjusted_rand_score(last_pred_states, pred_states) hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f}.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(log_gamma)} states.") last_pred_states = pred_states @@ -604,7 +604,7 @@ def run_baum_welch_nb_bb( logger.info( f"EM convergence metrics (v2): startprob={np.mean(np.abs(np.exp(new_log_startprob) - np.exp(log_startprob)))}, transmat={np.mean(np.abs(np.exp(new_log_transmat) - np.exp(log_transmat)))}," - +"mu={np.mean(np.abs(new_log_mu - log_mu))}, pbinom={np.mean(np.abs(new_p_binom - p_binom))}" + + f"mu={np.mean(np.abs(new_log_mu - log_mu))}, pbinom={np.mean(np.abs(new_p_binom - p_binom))}" ) if ( diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 1e4c34e..d38cbb9 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -498,7 +498,7 @@ def run_baum_welch_nb_bb( ari = adjusted_rand_score(last_pred_states, pred_states) hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f}.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(log_gamma)} states.") last_pred_states = pred_states From 6f66d31e4dfbf2d1aaed3b00710092b83eaf5fe4 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Wilson" Date: Mon, 26 Aug 2024 14:12:49 -0400 Subject: [PATCH 122/125] fix --- src/calicost/hmm_NB_BB_nophasing.py | 2 +- src/calicost/hmm_NB_BB_nophasing_v2.py | 2 +- src/calicost/hmm_NB_BB_phaseswitch.py | 2 +- src/calicost/utils_distribution_fitting.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calicost/hmm_NB_BB_nophasing.py b/src/calicost/hmm_NB_BB_nophasing.py index ce93881..0b182d1 100644 --- a/src/calicost/hmm_NB_BB_nophasing.py +++ b/src/calicost/hmm_NB_BB_nophasing.py @@ -419,7 +419,7 @@ def run_baum_welch_nb_bb( ari = adjusted_rand_score(last_pred_states, pred_states) hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(log_gamma)} states.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(last_pred_states)} states.") last_pred_states = pred_states diff --git a/src/calicost/hmm_NB_BB_nophasing_v2.py b/src/calicost/hmm_NB_BB_nophasing_v2.py index a16b8dc..241e5da 100644 --- a/src/calicost/hmm_NB_BB_nophasing_v2.py +++ b/src/calicost/hmm_NB_BB_nophasing_v2.py @@ -492,7 +492,7 @@ def run_baum_welch_nb_bb( ari = adjusted_rand_score(last_pred_states, pred_states) hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(log_gamma)} states.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(last_pred_states)} states.") last_pred_states = pred_states diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index d38cbb9..f1e25a0 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -498,7 +498,7 @@ def run_baum_welch_nb_bb( ari = adjusted_rand_score(last_pred_states, pred_states) hamm = sum(last_pred_states != pred_states) - logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(log_gamma)} states.") + logger.info(f"Found Hidden States (v2) for iteration {r} with ARI = {ari:.6f} and Hamming = {hamm:.1f} for {len(last_pred_states)} states.") last_pred_states = pred_states diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index 52b4f5c..ccc07f0 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -194,7 +194,7 @@ def fit( runtime = time.time() - start logger.info( - f"Finished {self.__class__.__name__} optimization in {runtime:.2f} seconds, with {niter} iterations." + f"{self.__class__.__name__} optimization in {runtime:.2f}s, with {niter} iterations. Best-fit: {result.params}" ) if write_chain: @@ -216,7 +216,7 @@ def fit( fout.write(line) os.remove(tmp_path) - + return result From 28b0b817a5f2ece6f933dfabccf89ed968200e52 Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 15:22:25 -0400 Subject: [PATCH 123/125] fix increment bug. 
--- src/calicost/utils_distribution_fitting.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index ccc07f0..ade736c 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -80,7 +80,6 @@ class WeightedModel(GenericLikelihoodModel, ABC): def __init__(self, endog, exog, weights, exposure, tumor_prop=None, seed=0, **kwargs): super().__init__(endog, exog, **kwargs) - # NB unpack a single additional positional argument as tumor_proportion. self.tumor_prop = tumor_prop self.weights = weights @@ -273,7 +272,7 @@ def get_ext_param_name(self): def __post_init__(self): assert self.tumor_prop is not None, "Tumor proportion must be defined." - Weighted_NegativeBinomial_mix.ninstance + Weighted_NegativeBinomial_mix.ninstance += 1 class Weighted_BetaBinom(WeightedModel): From 3463cc33852e3fb825b2c92bcbc85787f9c3858f Mon Sep 17 00:00:00 2001 From: "Michael J. Wilson" Date: Mon, 26 Aug 2024 15:25:47 -0400 Subject: [PATCH 124/125] fix --- src/calicost/utils_distribution_fitting.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/calicost/utils_distribution_fitting.py b/src/calicost/utils_distribution_fitting.py index ade736c..8e0cf4f 100644 --- a/src/calicost/utils_distribution_fitting.py +++ b/src/calicost/utils_distribution_fitting.py @@ -339,11 +339,10 @@ class Weighted_BetaBinom_fixdispersion(WeightedModel): ninstance = 0 # NB custom __init__ required to handle tau. - def __init__(self, endog, exog, tau, weights, exposure, *args, seed=0, **kwargs): + def __init__(self, endog, exog, tau, weights, exposure, *args, tumor_prop=None, seed=0, **kwargs): super().__init__(endog, exog, **kwargs) - # NB unpack a single additional positional argument as tumor_proportion. - self.tumor_prop = args if len(args) == 1 else None + self.tumor_prop = tumor_prop self.tau = tau self.weights = weights @@ -377,11 +376,10 @@ def __post_init__(self): class Weighted_BetaBinom_fixdispersion_mix(WeightedModel): # NB custom __init__ required to handle tau. - def __init__(self, endog, exog, tau, weights, exposure, *args, seed=0, **kwargs): + def __init__(self, endog, exog, tau, weights, exposure, *args, tumor_prop=None, seed=0, **kwargs): super().__init__(endog, exog, **kwargs) - # NB unpack a single additional positional argument as tumor_proportion. - self.tumor_prop = args if len(args) == 1 else None + self.tumor_prop = tumor_prop self.tau = tau self.weights = weights From 5c95c473e321bff85290c46a449dc040b123b1dc Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 2 Sep 2024 10:11:59 -0400 Subject: [PATCH 125/125] remove examples
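
Editor's note on the convergence diagnostic added across patches 114, 116, 119, 121 and 122: after each Baum-Welch E-step, the posterior-argmax state assignment is compared with the previous iteration's assignment via the adjusted Rand index and a Hamming count (number of bins whose state changed), with the first iteration treated as burn-in. The following is a minimal, self-contained sketch of that pattern only; the toy `log_gamma`, `n_states`, `n_obs` and `n_iter` values are placeholders, not CaliCoST code.

    # Sketch (not CaliCoST source): track how the argmax hidden-state assignment
    # changes between successive Baum-Welch iterations.
    import logging

    import numpy as np
    from sklearn.metrics import adjusted_rand_score

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    rng = np.random.default_rng(0)
    n_states, n_obs, n_iter = 5, 1_000, 3

    last_pred_states = None

    for r in range(n_iter):
        # Stand-in for the E-step posterior log_gamma of shape (n_states, n_obs);
        # a real run would obtain this from forward-backward.
        log_gamma = np.log(rng.dirichlet(np.ones(n_states), size=n_obs).T)

        pred_states = np.argmax(log_gamma, axis=0)

        if last_pred_states is not None:
            ari = adjusted_rand_score(last_pred_states, pred_states)
            hamm = np.sum(last_pred_states != pred_states)

            logger.info(
                f"Iteration {r}: ARI = {ari:.6f}, Hamming = {hamm} "
                f"over {len(pred_states)} bins and {log_gamma.shape[0]} states."
            )

        last_pred_states = pred_states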
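
Editor's note on the chain-output directory fix in patches 117–118: calling `mkdir` on the output path itself would create a directory named after the chain file, whereas the intent is to create its enclosing `chains/<class_name>/` directory, hence `Path(final_path).parent.mkdir(parents=True, exist_ok=True)`. A small sketch of the corrected behaviour, with `class_name` and `ninst` as hypothetical placeholders:

    # Sketch (assumed file layout): ensure the per-class chains directory exists
    # before writing the chain file.
    from pathlib import Path

    class_name = "weighted_betabinom"   # hypothetical class name
    ninst = 0                           # hypothetical instance counter

    final_path = Path(f"chains/{class_name}/{class_name}_chain_{ninst}.txt.gzip")

    # Create chains/<class_name>/, not a directory named after the file itself.
    final_path.parent.mkdir(parents=True, exist_ok=True)

    final_path.write_text("placeholder chain contents\n")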
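
Editor's note on the constructor and counter fixes in patches 120 and 123–124: an explicit `tumor_prop=None` keyword replaces unpacking a lone positional entry from `*args` (which stored a tuple rather than the value), the bare statement `Weighted_NegativeBinomial_mix.ninstance` is a no-op and must be the augmented assignment `+= 1`, and the convergence-metrics message in patch 121 needed an `f` prefix on its second string for the `{...}` fields to interpolate. A minimal sketch of these three idioms using a hypothetical class, not the CaliCoST models:

    # Sketch (hypothetical class): explicit keyword argument, class-level
    # instance counter, and f-string interpolation.
    import numpy as np


    class WeightedToyModel:
        ninstance = 0

        def __init__(self, endog, exog, weights, exposure, tumor_prop=None, seed=0):
            # Explicit keyword: self.tumor_prop is the value (or None), never a tuple.
            self.endog = np.asarray(endog)
            self.exog = np.asarray(exog)
            self.weights = np.asarray(weights)
            self.exposure = np.asarray(exposure)
            self.tumor_prop = tumor_prop
            self.rng = np.random.default_rng(seed)

            # A bare `WeightedToyModel.ninstance` would evaluate and discard the
            # attribute; the augmented assignment actually increments the counter.
            WeightedToyModel.ninstance += 1


    model = WeightedToyModel([1.0], [[1.0]], [1.0], [1.0], tumor_prop=0.3)

    old_mu, new_mu = 0.10, 0.08

    # Both concatenated pieces need the f prefix, otherwise "{...}" is printed literally.
    print(
        f"instances={WeightedToyModel.ninstance}, "
        + f"mu shift={abs(new_mu - old_mu):.3f}"
    )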