diff --git a/README.md b/README.md
index a0a38ec..7957129 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,7 @@ Replace the following paths in `config.yaml`:
 Then you can run the preprocessing pipeline by
 ```
-snakemake --cores --configfile config.yaml --snakefile calicost.smk all
+snakemake --cores --configfile config.yaml --snakefile calicost.smk --keep-incomplete all
 ```
 
 ### Inferring tumor purity per spot (optional)
@@ -229,4 +229,4 @@ The CalicoST manuscript is available on bioRxiv. If you use CalicoST for your wo
   year={2024},
   publisher={Cold Spring Harbor Laboratory}
 }
-```
\ No newline at end of file
+```
diff --git a/calicost.smk b/calicost.smk
index 0444ec2..5643cdc 100644
--- a/calicost.smk
+++ b/calicost.smk
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import scipy
+from pathlib import Path
 
 import calicost.arg_parse
 import calicost.parse_input
@@ -23,6 +24,13 @@ rule link_or_merge_bam:
        "{outputdir}/logs/link_or_merge_bam.log"
    run:
        if "bamlist" in config:
+           # check that all files exist:
+           df_entries = pd.read_csv(config["bamlist"], sep='\t', index_col=None, header=None)
+           for i in range(df_entries.shape[0]):
+               assert Path(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv.gz").exists() or Path(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv").exists(), f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv(.gz) doesn't exist!"
+               assert Path(f"{df_entries.iloc[i,2]}/possorted_genome_bam.bam").exists(), f"{df_entries.iloc[i,2]}/possorted_genome_bam.bam doesn't exist!"
+               assert Path(f"{df_entries.iloc[i,2]}/possorted_genome_bam.bam.bai").exists(), f"{df_entries.iloc[i,2]}/possorted_genome_bam.bam.bai doesn't exist!"
+
            # merged BAM file
            shell(f"python {config['calicost_dir']}/utils/merge_bamfile.py -b {config['bamlist']} -o {params.outputdir}/ >> {log} 2>&1")
            shell(f"samtools sort -m {params.samtools_sorting_mem} -o {output.bam} {params.outputdir}/unsorted_possorted_genome_bam.bam >> {log} 2>&1")
@@ -30,22 +38,32 @@ rule link_or_merge_bam:
            shell(f"rm -fr {params.outputdir}/unsorted_possorted_genome_bam.bam")
 
            # merged barcodes
-           df_entries = pd.read_csv(config["bamlist"], sep='\t', index_col=None, header=None)
            df_barcodes = []
            for i in range(df_entries.shape[0]):
-               tmpdf = pd.read_csv(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv.gz", header=None, index_col=None)
+               if Path(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv.gz").exists():
+                   tmpdf = pd.read_csv(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv.gz", header=None, index_col=None)
+               else:
+                   tmpdf = pd.read_csv(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv", header=None, index_col=None)
                tmpdf.iloc[:,0] = [f"{x}_{df_entries.iloc[i,1]}" for x in tmpdf.iloc[:,0]]
                df_barcodes.append( tmpdf )
            df_barcodes = pd.concat(df_barcodes, ignore_index=True)
            df_barcodes.to_csv(f"{output.barcodefile}", sep='\t', index=False, header=False)
        else:
+           # check that all files exist:
+           assert Path(f"{config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv.gz").exists() or Path(f"{config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv").exists(), f"{config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv(.gz) doesn't exist!"
+           assert Path(f"{config['spaceranger_dir']}/possorted_genome_bam.bam").exists(), f"{config['spaceranger_dir']}/possorted_genome_bam.bam doesn't exist!"
+           assert Path(f"{config['spaceranger_dir']}/possorted_genome_bam.bam.bai").exists(), f"{config['spaceranger_dir']}/possorted_genome_bam.bam.bai doesn't exist!"
+
            # BAM file
            assert "spaceranger_dir" in config
            print("softlink of possorted_genome_bam.bam")
            shell(f"ln -sf -T {config['spaceranger_dir']}/possorted_genome_bam.bam {output.bam}")
            shell(f"ln -sf -T {config['spaceranger_dir']}/possorted_genome_bam.bam.bai {output.bai}")
 
            # barcodes
-           shell(f"gunzip -c {config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv.gz > {output.barcodefile}")
+           if Path(f"{config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv.gz").exists():
+               shell(f"gunzip -c {config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv.gz > {output.barcodefile}")
+           else:
+               shell(f"cp {config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv {output.barcodefile}")
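Note: the new asserts fail fast when an entry in the bamlist points at an incomplete Space Ranger directory. Below is a minimal standalone sketch of the same validation, assuming the header-less, tab-separated bamlist layout the rule implies (column 1 holds the per-sample barcode suffix, column 2 the Space Ranger output directory); the file name is hypothetical.

```python
# check_bamlist.py -- standalone sketch of the validation added to link_or_merge_bam.
from pathlib import Path
import pandas as pd

def check_bamlist(bamlist_file):
    df_entries = pd.read_csv(bamlist_file, sep="\t", index_col=None, header=None)
    for i in range(df_entries.shape[0]):
        srdir = Path(df_entries.iloc[i, 2])
        # barcodes may be gzipped or plain, matching the fallback in the rule
        assert (srdir / "filtered_feature_bc_matrix/barcodes.tsv.gz").exists() or \
               (srdir / "filtered_feature_bc_matrix/barcodes.tsv").exists(), f"{srdir}: no barcodes.tsv(.gz)"
        assert (srdir / "possorted_genome_bam.bam").exists(), f"{srdir}: no possorted_genome_bam.bam"
        assert (srdir / "possorted_genome_bam.bam.bai").exists(), f"{srdir}: BAM is not indexed"

check_bamlist("bamlist.tsv")  # hypothetical path
```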
diff --git a/src/calicost/arg_parse.py b/src/calicost/arg_parse.py
index 32f5570..6208b70 100644
--- a/src/calicost/arg_parse.py
+++ b/src/calicost/arg_parse.py
@@ -26,10 +26,17 @@ def load_default_config():
        "supervision_clone_file" : None,
        "filtergenelist_file" : None,
        "filterregion_file" : None,
+        "htblock_min_snps" : 1,
+        "initial_min_umi" : 15,
        "secondary_min_umi" : 300,
        "min_snpumi_perspot" : 50,
        'min_percent_expressed_spots' : 0.005,
        "bafonly" : False,
+        # prephasing
+        "PREPHASING_select_tumor" : False,
+        "PREPHASING_spot_cluster_file" : None,
+        "PREPHASING_num_bins" : 3000,
+        "PREPHASING_min_tumor_spots" : 100,
        # phase switch probability
        "nu" : 1.0,
        "logphase_shift" : -2.0,
@@ -86,10 +93,17 @@ def load_default_config():
        "supervision_clone_file" : "str",
        "filtergenelist_file" : "str",
        "filterregion_file" : "str",
+        "htblock_min_snps" : "int",
+        "initial_min_umi" : "int",
        "secondary_min_umi" : "int",
        "min_snpumi_perspot" : "int",
        'min_percent_expressed_spots' : "float",
        "bafonly" : "bool",
+        # prephasing
+        "PREPHASING_select_tumor" : "bool",
+        "PREPHASING_spot_cluster_file" : "str",
+        "PREPHASING_num_bins" : "int",
+        "PREPHASING_min_tumor_spots" : "int",
        # phase switch probability
        "nu" : "float",
        "logphase_shift" : "float",
@@ -130,7 +144,8 @@ def load_default_config():
-    category_names = ["", "# supporting files and preprocessing arguments", "# phase switch probability", "# HMRF configurations", "# HMM configurations", "# integer copy number"]
+    category_names = ["", "# supporting files and preprocessing arguments", "# prephasing", "# phase switch probability", "# HMRF configurations", "# HMM configurations", "# integer copy number"]
    category_elements = [["input_filelist", "spaceranger_dir", "snp_dir", "output_dir"], \
-        ["geneticmap_file", "hgtable_file", "normalidx_file", "tumorprop_file", "alignment_files", "supervision_clone_file", "filtergenelist_file", "filterregion_file", "secondary_min_umi", "min_snpumi_perspot", "min_percent_expressed_spots", "bafonly"], \
+        ["geneticmap_file", "hgtable_file", "normalidx_file", "tumorprop_file", "alignment_files", "supervision_clone_file", "filtergenelist_file", "filterregion_file", "htblock_min_snps", "initial_min_umi", "secondary_min_umi", "min_snpumi_perspot", "min_percent_expressed_spots", "bafonly"], \
+        ["PREPHASING_select_tumor", "PREPHASING_spot_cluster_file", "PREPHASING_num_bins", "PREPHASING_min_tumor_spots"], \
        ["nu", "logphase_shift", "npart_phasing"], \
        ["n_clones", "n_clones_rdr", "min_spots_per_clone", "min_avgumi_per_clone", "maxspots_pooling", "tumorprop_threshold", "max_iter_outer", "nodepotential", "initialization_method", "num_hmrf_initialization_start", "num_hmrf_initialization_end", "spatial_weight", "construct_adjacency_method", "construct_adjacency_w"], \
        ["n_states", "params", "t", "t_phaseing", "fix_NB_dispersion", "shared_NB_dispersion", "fix_BB_dispersion", "shared_BB_dispersion", "max_iter", "tol", "gmm_random_state", "np_threshold", "np_eventminlen"], \
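Note: a `"# prephasing"` entry is added to `category_names` above so it stays aligned with the new `PREPHASING_*` list in `category_elements`; without it the two lists have different lengths. The new thresholds and prephasing options travel through the same defaults dictionary as every other setting, so they can be overridden like any existing key. A minimal sketch, assuming only that `load_default_config()` returns the dictionary shown above (the values are illustrative, not recommendations):

```python
# Sketch: overriding the new options on top of the defaults dictionary.
from calicost.arg_parse import load_default_config

config = load_default_config()
config.update({
    "htblock_min_snps": 2,         # require >= 2 SNPs per initial haplotype block
    "initial_min_umi": 20,         # require >= 20 SNP-covering UMIs per block
    "PREPHASING_select_tumor": True,
    "PREPHASING_num_bins": 2000,
    "PREPHASING_min_tumor_spots": 150,
})
```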
"fix_BB_dispersion", "shared_BB_dispersion", "max_iter", "tol", "gmm_random_state", "np_threshold", "np_eventminlen"], \ diff --git a/src/calicost/calicost_main.py b/src/calicost/calicost_main.py index 8990bc8..2822562 100644 --- a/src/calicost/calicost_main.py +++ b/src/calicost/calicost_main.py @@ -136,8 +136,8 @@ def main(configuration_file): break # filter bins based on normal index_normal = np.where(normal_candidate)[0] - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \ - single_X, single_base_nb_mean, single_total_bb_RD, config['nu'], config['logphase_shift'], index_normal, config['geneticmap_file']) + # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \ + # single_X, single_base_nb_mean, single_total_bb_RD, config['nu'], config['logphase_shift'], index_normal, config['geneticmap_file']) assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0] df_bininfo = genesnp_to_bininfo(df_gene_snp) copy_single_X_rdr = copy.copy(single_X[:,0,:]) diff --git a/src/calicost/hmm_NB_BB_phaseswitch.py b/src/calicost/hmm_NB_BB_phaseswitch.py index 630651f..f451245 100644 --- a/src/calicost/hmm_NB_BB_phaseswitch.py +++ b/src/calicost/hmm_NB_BB_phaseswitch.py @@ -716,7 +716,7 @@ def similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res new_assignment = np.array([map_clone_id[x] for x in res["new_assignment"]]) merged_res = copy.copy(res) merged_res["new_assignment"] = new_assignment - merged_res["total_llf"] = np.NAN + merged_res["total_llf"] = np.nan merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) return merging_groups, merged_res diff --git a/src/calicost/hmrf.py b/src/calicost/hmrf.py index 185ca3a..8f3ca6d 100644 --- a/src/calicost/hmrf.py +++ b/src/calicost/hmrf.py @@ -290,7 +290,7 @@ def merge_by_minspots(assignment, res, single_total_bb_RD, min_spots_thresholds= # unique_assignment = np.unique(new_assignment) merged_res = copy.copy(res) merged_res["new_assignment"] = new_assignment - merged_res["total_llf"] = np.NAN + merged_res["total_llf"] = np.nan merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ]) return merging_groups, merged_res diff --git a/src/calicost/parse_input.py b/src/calicost/parse_input.py index 9bdd862..6ce9aeb 100644 --- a/src/calicost/parse_input.py +++ b/src/calicost/parse_input.py @@ -3,6 +3,7 @@ import scipy import pandas as pd from pathlib import Path +import pickle from sklearn.metrics import adjusted_rand_score import scanpy as sc import anndata @@ -60,8 +61,42 @@ def parse_visium(config): smooth_mat : array, (n_spots, n_spots) KNN smoothing matrix. 
""" - if "input_filelist" in config: - adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, across_slice_adjacency_mat = load_joint_data(config["input_filelist"], config["snp_dir"], config["alignment_files"], config["filtergenelist_file"], config["filterregion_file"], config["normalidx_file"], config['min_snpumi_perspot'], config['min_percent_expressed_spots']) + if not Path( f"{config['output_dir']}/parsed_inputs/exp_adata.h5ad" ).exists(): + if "input_filelist" in config: + adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, across_slice_adjacency_mat = load_joint_data(config["input_filelist"], config["snp_dir"], config["alignment_files"], config["filtergenelist_file"], config["filterregion_file"], config["normalidx_file"], config['min_snpumi_perspot'], config['min_percent_expressed_spots']) + sample_list = [adata.obs["sample"][0]] + for i in range(1, adata.shape[0]): + if adata.obs["sample"][i] != sample_list[-1]: + sample_list.append( adata.obs["sample"][i] ) + # convert sample name to index + sample_ids = np.zeros(adata.shape[0], dtype=int) + for s,sname in enumerate(sample_list): + index = np.where(adata.obs["sample"] == sname)[0] + sample_ids[index] = s + else: + adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids = load_data(config["spaceranger_dir"], config["snp_dir"], config["filtergenelist_file"], config["filterregion_file"], config["normalidx_file"], config['min_snpumi_perspot'], config['min_percent_expressed_spots']) + adata.obs["sample"] = "unique_sample" + sample_list = [adata.obs["sample"][0]] + sample_ids = np.zeros(adata.shape[0], dtype=int) + across_slice_adjacency_mat = None + + adata.write( f"{config['output_dir']}/parsed_inputs/exp_adata.h5ad" ) + scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/filtered_cell_snp_Aallele.npz", cell_snp_Aallele ) + scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/filtered_cell_snp_Ballele.npz", cell_snp_Ballele ) + np.save( f"{config['output_dir']}/parsed_inputs/filtered_unique_snp_ids.npy", unique_snp_ids ) + if not across_slice_adjacency_mat is None: + scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/across_slice_adjacency_mat.npz", across_slice_adjacency_mat ) + coords = adata.obsm["X_pos"] + else: + adata = sc.read_h5ad( f"{config['output_dir']}/parsed_inputs/exp_adata.h5ad" ) + cell_snp_Aallele = scipy.sparse.load_npz( f"{config['output_dir']}/parsed_inputs/filtered_cell_snp_Aallele.npz" ) + cell_snp_Ballele = scipy.sparse.load_npz( f"{config['output_dir']}/parsed_inputs/filtered_cell_snp_Ballele.npz" ) + unique_snp_ids = np.load( f"{config['output_dir']}/parsed_inputs/filtered_unique_snp_ids.npy", allow_pickle=True ) + across_slice_adjacency_mat = None + if Path( f"{config['output_dir']}/parsed_inputs/across_slice_adjacency_mat.npz" ).exists(): + across_slice_adjacency_mat = scipy.sparse.load_npz( f"{config['output_dir']}/parsed_inputs/across_slice_adjacency_mat.npz" ) + coords = adata.obsm["X_pos"] + sample_list = [adata.obs["sample"][0]] for i in range(1, adata.shape[0]): if adata.obs["sample"][i] != sample_list[-1]: @@ -71,14 +106,6 @@ def parse_visium(config): for s,sname in enumerate(sample_list): index = np.where(adata.obs["sample"] == sname)[0] sample_ids[index] = s - else: - adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids = load_data(config["spaceranger_dir"], config["snp_dir"], config["filtergenelist_file"], config["filterregion_file"], config["normalidx_file"], config['min_snpumi_perspot'], config['min_percent_expressed_spots']) - adata.obs["sample"] = 
"unique_sample" - sample_list = [adata.obs["sample"][0]] - sample_ids = np.zeros(adata.shape[0], dtype=int) - across_slice_adjacency_mat = None - - coords = adata.obsm["X_pos"] if not config["tumorprop_file"] is None: df_tumorprop = pd.read_csv(config["tumorprop_file"], sep="\t", header=0, index_col=0) @@ -90,14 +117,51 @@ def parse_visium(config): single_tumor_prop = None # read original data - df_gene_snp = combine_gene_snps(unique_snp_ids, config['hgtable_file'], adata) - df_gene_snp = create_haplotype_block_ranges(df_gene_snp, adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids) - lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_blocks(df_gene_snp, \ - adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) + if not Path(f"{config['output_dir']}/parsed_inputs/df_gene_snp_combinegenesnps.pkl").exists(): + df_gene_snp = combine_gene_snps(unique_snp_ids, config['hgtable_file'], adata) + # save temp output + df_gene_snp.to_pickle(f"{config['output_dir']}/parsed_inputs/df_gene_snp_combinegenesnps.pkl") + else: + df_gene_snp = pd.read_pickle(f"{config['output_dir']}/parsed_inputs/df_gene_snp_combinegenesnps.pkl") + + # # TBD: need to set another user parameters to decide whether only restricted to SNPs within df_hgtable + # idx_snps_within = np.where(df_gene_snp[~df_gene_snp.is_interval].gene.notnull())[0] + # df_gene_snp = df_gene_snp[df_gene_snp.gene.notnull()] + # cell_snp_Aallele = cell_snp_Aallele[:, idx_snps_within] + # cell_snp_Ballele = cell_snp_Ballele[:, idx_snps_within] + # unique_snp_ids = unique_snp_ids[idx_snps_within] + + if not Path(f"{config['output_dir']}/parsed_inputs/df_gene_snp_haplotypeblock.pkl").exists(): + df_gene_snp = create_haplotype_block_ranges(df_gene_snp, adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, htblock_min_snps=config['htblock_min_snps'], htblock_min_umi=config['initial_min_umi']) + # save temp output + df_gene_snp.to_pickle(f"{config['output_dir']}/parsed_inputs/df_gene_snp_haplotypeblock.pkl") + else: + df_gene_snp = pd.read_pickle(f"{config['output_dir']}/parsed_inputs/df_gene_snp_haplotypeblock.pkl") + # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_blocks(df_gene_snp, \ + # adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) + # # infer an initial phase using pseudobulk + # if not Path(f"{config['output_dir']}/initial_phase.npz").exists(): + # initial_clone_for_phasing = perform_partition(coords, sample_ids, x_part=config["npart_phasing"], y_part=config["npart_phasing"], single_tumor_prop=single_tumor_prop, threshold=config["tumorprop_threshold"]) + # phase_indicator, refined_lengths = initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_for_phasing, 5, log_sitewise_transmat, \ + # "sp", config["t_phaseing"], config["gmm_random_state"], config["fix_NB_dispersion"], config["shared_NB_dispersion"], config["fix_BB_dispersion"], config["shared_BB_dispersion"], 30, 1e-3, threshold=config["tumorprop_threshold"]) + # np.savez(f"{config['output_dir']}/initial_phase.npz", phase_indicator=phase_indicator, refined_lengths=refined_lengths) + # # map phase indicator to individual snps + # df_gene_snp['phase'] = np.where(df_gene_snp.snp_id.isnull(), None, 
df_gene_snp.block_id.map({i:x for i,x in enumerate(phase_indicator)}) ) + # else: + # tmp = dict(np.load(f"{config['output_dir']}/initial_phase.npz")) + # phase_indicator, refined_lengths = tmp["phase_indicator"], tmp["refined_lengths"] + + if not Path(f"{config['output_dir']}/parsed_inputs/counts_haplotypeblocks.pkl").exists(): + lengths, sp_single_X_rdr, sp_single_X_b, sp_single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_blocks(df_gene_snp, \ + adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) + pickle.dump( (lengths, sp_single_X_rdr, sp_single_X_b, sp_single_total_bb_RD, log_sitewise_transmat), open(f"{config['output_dir']}/parsed_inputs/counts_haplotypeblocks.pkl", "wb") ) + else: + lengths, sp_single_X_rdr, sp_single_X_b, sp_single_total_bb_RD, log_sitewise_transmat = pickle.load( open(f"{config['output_dir']}/parsed_inputs/counts_haplotypeblocks.pkl", "rb") ) + # infer an initial phase using pseudobulk if not Path(f"{config['output_dir']}/initial_phase.npz").exists(): initial_clone_for_phasing = perform_partition(coords, sample_ids, x_part=config["npart_phasing"], y_part=config["npart_phasing"], single_tumor_prop=single_tumor_prop, threshold=config["tumorprop_threshold"]) - phase_indicator, refined_lengths = initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_for_phasing, 5, log_sitewise_transmat, \ + phase_indicator, refined_lengths = initial_phase_given_partition(sp_single_X_b, lengths, sp_single_total_bb_RD, single_tumor_prop, initial_clone_for_phasing, 5, log_sitewise_transmat, \ "sp", config["t_phaseing"], config["gmm_random_state"], config["fix_NB_dispersion"], config["shared_NB_dispersion"], config["fix_BB_dispersion"], config["shared_BB_dispersion"], 30, 1e-3, threshold=config["tumorprop_threshold"]) np.savez(f"{config['output_dir']}/initial_phase.npz", phase_indicator=phase_indicator, refined_lengths=refined_lengths) # map phase indicator to individual snps @@ -107,9 +171,18 @@ def parse_visium(config): phase_indicator, refined_lengths = tmp["phase_indicator"], tmp["refined_lengths"] # binning - df_gene_snp = create_bin_ranges(df_gene_snp, single_total_bb_RD, refined_lengths, config['secondary_min_umi']) + if not Path(f"{config['output_dir']}/parsed_inputs/df_gene_snp_binned.pkl").exists(): + # df_gene_snp = create_bin_ranges(df_gene_snp, single_total_bb_RD, refined_lengths, config['secondary_min_umi']) + df_gene_snp = create_bin_ranges(df_gene_snp, sp_single_total_bb_RD, refined_lengths, config['secondary_min_umi']) + # save temp output + df_gene_snp.to_pickle(f"{config['output_dir']}/parsed_inputs/df_gene_snp_binned.pkl") + else: + df_gene_snp = pd.read_pickle(f"{config['output_dir']}/parsed_inputs/df_gene_snp_binned.pkl") + # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_bins(df_gene_snp, \ + # adata, single_X, single_total_bb_RD, phase_indicator, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_bins(df_gene_snp, \ - adata, single_X, single_total_bb_RD, phase_indicator, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) + sp_single_X_rdr, sp_single_X_b, sp_single_total_bb_RD, phase_indicator, nu=config['nu'], 
logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) + # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps = perform_binning_new(lengths, single_X, \ # single_base_nb_mean, single_total_bb_RD, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps, phase_indicator, refined_lengths, config["binsize"], config["rdrbinsize"], config["nu"], config["logphase_shift"], secondary_min_umi=secondary_min_umi) @@ -127,7 +200,7 @@ def parse_visium(config): # assert single_X.shape[0] == len(log_sitewise_transmat) # expression count dataframe - exp_counts = pd.DataFrame.sparse.from_spmatrix( scipy.sparse.csc_matrix(adata.layers["count"]), index=adata.obs.index, columns=adata.var.index) + exp_counts = copy.deepcopy(adata.X) # smooth and adjacency matrix for each sample adjacency_mat, smooth_mat = multislice_adjacency(sample_ids, sample_list, coords, single_total_bb_RD, exp_counts, @@ -139,12 +212,13 @@ def parse_visium(config): # If adjacency matrix is only constructed using gene expression similarity (e.g. scRNA-seq data) # Then, directly replace coords by the umap of gene expression, to avoid potential inconsistency in HMRF initialization if config["construct_adjacency_method"] == "KNN" and config["construct_adjacency_w"] == 0: - sc.pp.normalize_total(adata, target_sum=np.median(np.sum(exp_counts.values,axis=1)) ) + sc.pp.normalize_total(adata, target_sum=np.median(np.sum(adata.X,axis=1).A.flatten()) ) sc.pp.log1p(adata) sc.tl.pca(adata) sc.pp.neighbors(adata) sc.tl.umap(adata) coords = adata.obsm["X_umap"] + adata.X = exp_counts # create RDR-BAF table table_bininfo = genesnp_to_bininfo(df_gene_snp) @@ -161,7 +235,7 @@ def parse_visium(config): if not single_tumor_prop is None: table_meta["TUMOR_PROPORTION"] = single_tumor_prop - return table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat, df_gene_snp + return table_bininfo, table_rdrbaf, table_meta, adata, adjacency_mat, smooth_mat, df_gene_snp def load_tables_to_matrices(config): @@ -219,10 +293,10 @@ def load_tables_to_matrices(config): sample_ids[index] = s # expression UMI count matrix - exp_counts = pd.read_pickle( f"{config['output_dir']}/parsed_inputs/exp_counts.pkl" ) + adata = sc.read_h5ad( f"{config['output_dir']}/parsed_inputs/exp_adata.h5ad" ) return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ - barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts + barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, adata def run_parse_n_load(config): @@ -231,20 +305,19 @@ def run_parse_n_load(config): Path(f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz").exists(), \ Path(f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz").exists(), \ Path(f"{config['output_dir']}/parsed_inputs/smooth_mat.npz").exists(), \ - Path(f"{config['output_dir']}/parsed_inputs/exp_counts.pkl").exists() ]) + Path(f"{config['output_dir']}/parsed_inputs/exp_adata.h5ad").exists() ]) if not np.all(file_exists): + # create directory + Path(f"{config['output_dir']}/parsed_inputs").mkdir(parents=True, exist_ok=True) + # process to tables - table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat, df_gene_snp = parse_visium(config) + table_bininfo, table_rdrbaf, table_meta, adata, adjacency_mat, smooth_mat, df_gene_snp = parse_visium(config) # table_bininfo, 
table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat = parse_hatchetblock(config, cellsnplite_dir, bb_file) - - # save file - p = subprocess.Popen(f"mkdir -p {config['output_dir']}/parsed_inputs", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - out,err = p.communicate() table_bininfo.to_csv( f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", header=True, index=False, sep="\t" ) table_rdrbaf.to_csv( f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", header=True, index=False, sep="\t" ) table_meta.to_csv( f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", header=True, index=False, sep="\t" ) - exp_counts.to_pickle( f"{config['output_dir']}/parsed_inputs/exp_counts.pkl" ) + adata.write( f"{config['output_dir']}/parsed_inputs/exp_adata.h5ad" ) scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz", adjacency_mat ) scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/smooth_mat.npz", smooth_mat ) # diff --git a/src/calicost/phasing.py b/src/calicost/phasing.py index d582ec5..13e0f36 100644 --- a/src/calicost/phasing.py +++ b/src/calicost/phasing.py @@ -47,14 +47,37 @@ def infer_initial_phase(single_X, lengths, single_base_nb_mean, single_total_bb_ return phase_indicator, refined_lengths -def initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \ +# def initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \ +# params, t, random_state, fix_NB_dispersion, shared_NB_dispersion, fix_BB_dispersion, shared_BB_dispersion, max_iter, tol, threshold, min_snpumi=2e3): +# EPS_BAF = 0.05 +# if single_tumor_prop is None: +# X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index) +# tumor_prop = None +# else: +# X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=threshold) + +def initial_phase_given_partition(sp_single_X_b, lengths, sp_single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \ params, t, random_state, fix_NB_dispersion, shared_NB_dispersion, fix_BB_dispersion, shared_BB_dispersion, max_iter, tol, threshold, min_snpumi=2e3): EPS_BAF = 0.05 if single_tumor_prop is None: - X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index) + idx_row = np.concatenate(initial_clone_index) + idx_col = np.concatenate([ [i] * len(x) for i,x in enumerate(initial_clone_index) ]) + mul = scipy.sparse.csr_matrix((np.ones(sp_single_X_b.shape[1]), (idx_row, idx_col)), shape=(sp_single_X_b.shape[1], len(initial_clone_index))) + X = np.zeros((sp_single_X_b.shape[0], 2, len(initial_clone_index))) + X[:,1,:] = (sp_single_X_b @ mul).toarray() + total_bb_RD = (sp_single_total_bb_RD @ mul).toarray() + base_nb_mean = np.zeros(total_bb_RD.shape) tumor_prop = None else: - X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=threshold) + filtered_initial_clone_index = [ x[single_tumor_prop[x] > threshold] for x in initial_clone_index ] + idx_row = np.concatenate(filtered_initial_clone_index) + idx_col = 
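Note: every preprocessing stage in `parse_visium` now follows the same compute-or-load shape: if the file under `parsed_inputs/` exists, load it; otherwise compute and cache it. A generic sketch of that pattern (the `cached` helper is hypothetical, not part of CalicoST):

```python
# Sketch of the compute-or-load pattern parse_visium applies at each stage.
from pathlib import Path
import pandas as pd

def cached(path, compute):
    """Load a pickled DataFrame if `path` exists, otherwise compute and cache it."""
    path = Path(path)
    if path.exists():
        return pd.read_pickle(path)
    result = compute()
    result.to_pickle(path)
    return result
```

Deleting one of the `parsed_inputs` files forces just that stage to be recomputed; if configuration values change, the whole `parsed_inputs` directory should be removed, since stale caches are otherwise reused silently.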
diff --git a/src/calicost/phasing.py b/src/calicost/phasing.py
index d582ec5..13e0f36 100644
--- a/src/calicost/phasing.py
+++ b/src/calicost/phasing.py
@@ -47,14 +47,37 @@ def infer_initial_phase(single_X, lengths, single_base_nb_mean, single_total_bb_
    return phase_indicator, refined_lengths
 
 
-def initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \
+# def initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \
+#     params, t, random_state, fix_NB_dispersion, shared_NB_dispersion, fix_BB_dispersion, shared_BB_dispersion, max_iter, tol, threshold, min_snpumi=2e3):
+#     EPS_BAF = 0.05
+#     if single_tumor_prop is None:
+#         X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index)
+#         tumor_prop = None
+#     else:
+#         X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=threshold)
+
+def initial_phase_given_partition(sp_single_X_b, lengths, sp_single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \
    params, t, random_state, fix_NB_dispersion, shared_NB_dispersion, fix_BB_dispersion, shared_BB_dispersion, max_iter, tol, threshold, min_snpumi=2e3):
    EPS_BAF = 0.05
    if single_tumor_prop is None:
-        X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index)
+        # indicator matrix: one column per pseudobulk, a 1 for each member spot
+        idx_row = np.concatenate(initial_clone_index)
+        idx_col = np.concatenate([ [i] * len(x) for i,x in enumerate(initial_clone_index) ])
+        mul = scipy.sparse.csr_matrix((np.ones(len(idx_row)), (idx_row, idx_col)), shape=(sp_single_X_b.shape[1], len(initial_clone_index)))
+        X = np.zeros((sp_single_X_b.shape[0], 2, len(initial_clone_index)))
+        X[:,1,:] = (sp_single_X_b @ mul).toarray()
+        total_bb_RD = (sp_single_total_bb_RD @ mul).toarray()
+        base_nb_mean = np.zeros(total_bb_RD.shape)
        tumor_prop = None
    else:
-        X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=threshold)
+        filtered_initial_clone_index = [ x[single_tumor_prop[x] > threshold] for x in initial_clone_index ]
+        idx_row = np.concatenate(filtered_initial_clone_index)
+        idx_col = np.concatenate([ [i] * len(x) for i,x in enumerate(filtered_initial_clone_index) ])
+        mul = scipy.sparse.csr_matrix((np.ones(len(idx_row)), (idx_row, idx_col)), shape=(sp_single_X_b.shape[1], len(filtered_initial_clone_index)))
+        X = np.zeros((sp_single_X_b.shape[0], 2, len(filtered_initial_clone_index)))
+        X[:,1,:] = (sp_single_X_b @ mul).toarray()
+        total_bb_RD = (sp_single_total_bb_RD @ mul).toarray()
+        base_nb_mean = np.zeros(total_bb_RD.shape)
+        # average (not sum) the spot-level tumor proportions within each pseudobulk
+        tumor_prop = (single_tumor_prop.values @ mul) / np.maximum(mul.sum(axis=0).A.flatten(), 1)
 
    # pseudobulk HMM for phase_prob
    baf_profiles = np.zeros((X.shape[2], X.shape[0]))
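Note: `initial_phase_given_partition` now aggregates spot-level counts into pseudobulks with a single sparse matrix product instead of per-clone Python loops, and `summarize_counts_for_blocks` below uses the same trick per haplotype block. A self-contained toy demonstration (all names and numbers illustrative):

```python
# Toy demonstration of the sparse-indicator aggregation trick:
# summing per-spot counts into per-group pseudobulks via one sparse product.
import numpy as np
import scipy.sparse

counts = scipy.sparse.csr_matrix(np.array([[1, 0, 2, 4],
                                           [0, 3, 1, 0]]))   # (n_blocks=2, n_spots=4)
groups = [np.array([0, 2]), np.array([1, 3])]                 # spot indices per group

idx_row = np.concatenate(groups)
idx_col = np.concatenate([[i] * len(g) for i, g in enumerate(groups)])
mul = scipy.sparse.csr_matrix((np.ones(len(idx_row)), (idx_row, idx_col)),
                              shape=(counts.shape[1], len(groups)))  # (n_spots, n_groups)

pseudobulk = (counts @ mul).toarray()
print(pseudobulk)  # [[3. 4.], [1. 3.]] -- column g sums the spots of group g
```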
diff --git a/src/calicost/plot_spatial_clones.py b/src/calicost/plot_spatial_clones.py
new file mode 100644
index 0000000..d2f4ea1
--- /dev/null
+++ b/src/calicost/plot_spatial_clones.py
@@ -0,0 +1,115 @@
+import sys
+import argparse
+import copy
+
+import numpy as np
+import scipy
+import pandas as pd
+from itertools import cycle
+import matplotlib
+from matplotlib import pyplot as plt
+from matplotlib.colors import LinearSegmentedColormap, ListedColormap
+from matplotlib.gridspec import GridSpec
+import seaborn
+from matplotlib.lines import Line2D
+import matplotlib.patches as mpatches
+
+from calicost.arg_parse import *
+from calicost.parse_input import *
+from calicost.utils_plotting import *
+
+
+def plot_individual_spots_in_space_updated(coords, assignment, single_tumor_prop=None, sample_list=None, sample_ids=None, base_width=4, base_height=3, point_size=10, palette="Set2"):
+    # combine coordinates across samples
+    shifted_coords = copy.copy(coords)
+    if not (sample_ids is None):
+        x_offset = 0
+        for s,sname in enumerate(sample_list):
+            index = np.where(sample_ids == s)[0]
+            shifted_coords[index,0] = shifted_coords[index,0] + x_offset
+            x_offset += np.max(coords[index,0]) + 10
+
+    # number of clones and samples
+    final_clone_ids = np.unique(assignment[~assignment.isnull()].values)
+    n_final_clones = len(final_clone_ids)
+    n_samples = 1 if sample_list is None else len(sample_list)
+
+    # remove nan of single_tumor_prop
+    if not single_tumor_prop is None:
+        copy_single_tumor_prop = copy.copy(single_tumor_prop)
+        copy_single_tumor_prop[np.isnan(copy_single_tumor_prop)] = 0.5
+
+    fig, axes = plt.subplots(1, 1, figsize=(base_width*n_samples, base_height), dpi=200, facecolor="white")
+    if "clone 0" in final_clone_ids:
+        colorlist = ['lightgrey'] + seaborn.color_palette(palette, n_final_clones-1).as_hex()
+    else:
+        colorlist = seaborn.color_palette(palette, n_final_clones).as_hex()
+
+    for c,cid in enumerate(final_clone_ids):
+        idx = np.where( (assignment.values==cid) )[0]
+        if single_tumor_prop is None:
+            seaborn.scatterplot(x=shifted_coords[idx,0], y=-shifted_coords[idx,1], s=point_size, color=colorlist[c], linewidth=0, legend=None, ax=axes)
+        else:
+            # cmap
+            this_full_cmap = seaborn.color_palette(f"blend:lightgrey,{colorlist[c]}", as_cmap=True)
+            quantile_colors = this_full_cmap(np.array([0, np.min(copy_single_tumor_prop[idx]), np.max(copy_single_tumor_prop[idx]), 1]))
+            quantile_colors = [matplotlib.colors.rgb2hex(x) for x in quantile_colors[1:-1]]
+            this_cmap = seaborn.color_palette(f"blend:{quantile_colors[0]},{quantile_colors[-1]}", as_cmap=True)
+            seaborn.scatterplot(x=shifted_coords[idx,0], y=-shifted_coords[idx,1], s=point_size, hue=copy_single_tumor_prop[idx], palette=this_cmap, linewidth=0, legend=None, ax=axes)
+
+    legend_elements = [Line2D([0], [0], marker='o', color="w", markerfacecolor=colorlist[c], label=cid, markersize=10) for c,cid in enumerate(final_clone_ids)]
+    axes.legend(legend_elements, final_clone_ids, handlelength=0.1, loc="upper left", bbox_to_anchor=(1,1))
+    axes.axis("off")
+
+    fig.tight_layout()
+    return fig
+
+
+def main(configuration_file, output_file, r_hmrf_initialization=None, base_width=4, base_height=3, point_size=20, palette='Set2'):
+    """
+    Plot clones in space based on existing CalicoST output (output_dir/clone{n_clones}_rectangle{seed}_w{spatial_weight}/clone_labels.tsv)
+
+    :param configuration_file: Path to the configuration file
+    :param output_file: Path to the output file
+    :param r_hmrf_initialization: Random seed of the HMRF initialization to plot
+    :param base_width: Width of the figure
+    :param base_height: Height of the figure
+    :param point_size: Size of the points
+    :param palette: Color palette
+    """
+    # Load configuration
+    try:
+        config = read_configuration_file(configuration_file)
+    except:
+        config = read_joint_configuration_file(configuration_file)
+
+    # Load data
+    lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \
+        barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config)
+
+    # Load results and plot
+    if r_hmrf_initialization is None:
+        r_hmrf_initialization = config["num_hmrf_initialization_start"]
+
+    outdir = f"{config['output_dir']}/clone{config['n_clones']}_rectangle{r_hmrf_initialization}_w{config['spatial_weight']:.1f}"
+    assert Path(outdir).is_dir(), f"Directory {outdir} does not exist. Did you run CalicoST with the same random seed 'num_hmrf_initialization_start'?"
+
+    df_clone_label = pd.read_csv(f"{outdir}/clone_labels.tsv", header=0, index_col=0, sep="\t")
+    assignment = pd.Series([f"clone {x}" for x in df_clone_label.clone_label.values])
+
+    fig = plot_individual_spots_in_space_updated(coords, assignment, single_tumor_prop, sample_list=sample_list, sample_ids=sample_ids,
+                                                 base_width=base_width, base_height=base_height, point_size=point_size, palette=palette)
+    fig.savefig(output_file, transparent=True, bbox_inches="tight")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Plot clones in space based on existing CalicoST output")
+    parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str)
+    parser.add_argument("--output_file", type=str, help="Path to the output file")
+    parser.add_argument("--r_hmrf_initialization", type=int, default=None, help="Random seed for HMRF initialization")
+    parser.add_argument("--base_width", type=float, default=4, help="Width of the figure")
+    parser.add_argument("--base_height", type=float, default=3, help="Height of the figure")
+    parser.add_argument("--point_size", type=int, default=20, help="Size of the points")
+    parser.add_argument("--palette", type=str, default="Set2", help="Color palette")
+    args = parser.parse_args()
+
+    main(args.configfile, args.output_file, args.r_hmrf_initialization, args.base_width, args.base_height, args.point_size, args.palette)
\ No newline at end of file
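Note: an `import copy` was added above since the module uses `copy.copy` directly. An illustrative way to drive the new script from Python is sketched below (the paths are hypothetical and must point at a finished CalicoST run); the equivalent CLI is `python src/calicost/plot_spatial_clones.py -c config.yaml --output_file clone_map.pdf`.

```python
# Illustrative invocation of the new plotting script; paths are hypothetical.
from calicost.plot_spatial_clones import main

main(
    "config.yaml",             # CalicoST configuration used for the run
    "clone_map.pdf",           # where to save the figure
    r_hmrf_initialization=0,   # which HMRF random-seed subdirectory to plot
    point_size=20,
)
```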
Path(f"{spaceranger_dir}/filtered_feature_bc_matrix.h5ad").exists(): adata = sc.read_h5ad(f"{spaceranger_dir}/filtered_feature_bc_matrix.h5ad") else: logging.error(f"{spaceranger_dir} directory doesn't have a filtered_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5ad file!") - adata.layers["count"] = adata.X.A.astype(int) cell_snp_Aallele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Aallele.npz") cell_snp_Ballele = scipy.sparse.load_npz(f"{snp_dir}/cell_snp_Ballele.npz") unique_snp_ids = np.load(f"{snp_dir}/unique_snp_ids.npy", allow_pickle=True) @@ -53,6 +52,7 @@ def load_data(spaceranger_dir, snp_dir, filtergenelist_file, filterregion_file, df_pos.barcode = pd.Categorical(df_pos.barcode, categories=list(adata.obs.index), ordered=True) df_pos.sort_values(by="barcode", inplace=True) adata.obsm["X_pos"] = np.vstack([df_pos.x, df_pos.y]).T + adata.var_names_make_unique() # shared barcodes between adata and SNPs shared_barcodes = set(list(snp_barcodes.barcodes)) & set(list(adata.obs.index)) @@ -63,7 +63,7 @@ def load_data(spaceranger_dir, snp_dir, filtergenelist_file, filterregion_file, adata = adata[ pd.Categorical(adata.obs.index, categories=list(snp_barcodes.barcodes), ordered=True).argsort(), : ] # filter out spots with too small number of UMIs - indicator = (np.sum(adata.layers["count"], axis=1) > min_snpumis) + indicator = (np.sum(adata.X, axis=1).A.flatten() > min_snpumis) adata = adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] @@ -79,7 +79,7 @@ def load_data(spaceranger_dir, snp_dir, filtergenelist_file, filterregion_file, genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] print(adata) - print("median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print("median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.X, axis=1).A.flatten()) )) # remove genes in filtergenelist_file # ig_gene_list = pd.read_csv("/n/fs/ragr-data/users/congma/references/cellranger_refdata-gex-GRCh38-2020-A/genes/ig_gene_list.txt", header=None) @@ -88,7 +88,7 @@ def load_data(spaceranger_dir, snp_dir, filtergenelist_file, filterregion_file, filter_gene_list = set(list( filter_gene_list.iloc[:,0] )) indicator_filter = np.array([ (not x in filter_gene_list) for x in adata.var.index ]) adata = adata[:, indicator_filter] - print("median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print("median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.X, axis=1).A.flatten()) )) if not filterregion_file is None: regions = pd.read_csv(filterregion_file, header=None, sep="\t", names=["Chrname", "Start", "End"]) @@ -111,8 +111,8 @@ def load_data(spaceranger_dir, snp_dir, filtergenelist_file, filterregion_file, unique_snp_ids = unique_snp_ids[indicator_filter] clf = LocalOutlierFactor(n_neighbors=200) - label = clf.fit_predict( np.sum(adata.layers["count"], axis=0).reshape(-1,1) ) - adata.layers["count"][:, np.where(label==-1)[0]] = 0 + label = clf.fit_predict( np.sum(adata.X, axis=0).A.flatten().reshape(-1,1) ) + adata.X[:, np.where(label==-1)[0]] = 0 print("filter out {} outlier genes.".format( np.sum(label==-1) )) if not normalidx_file is None: @@ -121,7 +121,7 @@ def load_data(spaceranger_dir, snp_dir, filtergenelist_file, filterregion_file, adata.obs["tumor_annotation"][adata.obs.index.isin(normal_barcodes)] = 
"normal" print( adata.obs["tumor_annotation"].value_counts() ) - return adata, cell_snp_Aallele.A, cell_snp_Ballele.A, unique_snp_ids + return adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_file, filterregion_file, normalidx_file, min_snpumis=50, min_percent_expressed_spots=0.005): @@ -151,13 +151,12 @@ def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_fil df_this_barcode.index = df_this_barcode.barcode # read adata count info if Path(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5").exists(): - adatatmp = sc.read_10x_h5(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5") + adatatmp = sc.read_10x_h5(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5", gex_only=False) elif Path(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5ad").exists(): adatatmp = sc.read_h5ad(f"{df_meta['spaceranger_dir'].iloc[i]}/filtered_feature_bc_matrix.h5ad") else: logging.error(f"{df_meta['spaceranger_dir'].iloc[i]} directory doesn't have a filtered_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5ad file!") - adatatmp.layers["count"] = adatatmp.X.A # reorder anndata spots to have the same order as df_this_barcode idx_argsort = pd.Categorical(adatatmp.obs.index, categories=list(df_this_barcode.barcode), ordered=True).argsort() adatatmp = adatatmp[idx_argsort, :] @@ -187,9 +186,9 @@ def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_fil adata = adatatmp else: adata = anndata.concat([adata, adatatmp], join="outer") - # replace nan with 0 - adata.layers["count"][np.isnan(adata.layers["count"])] = 0 - adata.layers["count"] = adata.layers["count"].astype(int) + # # replace nan with 0 + # adata.layers["count"][np.isnan(adata.layers["count"])] = 0 + # adata.layers["count"] = adata.layers["count"].astype(int) # shared barcodes between adata and SNPs shared_barcodes = set(list(snp_barcodes.barcodes)) & set(list(adata.obs.index)) @@ -228,7 +227,7 @@ def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_fil across_slice_adjacency_mat += across_slice_adjacency_mat.T # filter out spots with too small number of UMIs - indicator = (np.sum(adata.layers["count"], axis=1) >= min_snpumis) + indicator = (np.sum(adata.X, axis=1).A.flatten() >= min_snpumis) adata = adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] @@ -248,14 +247,14 @@ def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_fil genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] print(adata) - print("median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print("median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.X, axis=1).A.flatten()) )) if not filtergenelist_file is None: filter_gene_list = pd.read_csv(filtergenelist_file, header=None) filter_gene_list = set(list( filter_gene_list.iloc[:,0] )) indicator_filter = np.array([ (not x in filter_gene_list) for x in adata.var.index ]) adata = adata[:, indicator_filter] - print("median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print("median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.X, axis=1).A.flatten()) )) if not filterregion_file is None: 
regions = pd.read_csv(filterregion_file, header=None, sep="\t", names=["Chrname", "Start", "End"]) @@ -278,8 +277,8 @@ def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_fil unique_snp_ids = unique_snp_ids[indicator_filter] clf = LocalOutlierFactor(n_neighbors=200) - label = clf.fit_predict( np.sum(adata.layers["count"], axis=0).reshape(-1,1) ) - adata.layers["count"][:, np.where(label==-1)[0]] = 0 + label = clf.fit_predict( np.sum(adata.X, axis=0).A.flatten().reshape(-1,1) ) + adata.X[:, np.where(label==-1)[0]] = 0 print("filter out {} outlier genes.".format( np.sum(label==-1) )) if not normalidx_file is None: @@ -288,7 +287,7 @@ def load_joint_data(input_filelist, snp_dir, alignment_files, filtergenelist_fil adata.obs["tumor_annotation"][adata.obs.index.isin(normal_barcodes)] = "normal" print( adata.obs["tumor_annotation"].value_counts() ) - return adata, cell_snp_Aallele.A, cell_snp_Ballele.A, unique_snp_ids, across_slice_adjacency_mat + return adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, across_slice_adjacency_mat def load_slidedna_data(snp_dir, bead_file, filterregion_bedfile): @@ -344,7 +343,7 @@ def taking_shared_barcodes(snp_barcodes, cell_snp_Aallele, cell_snp_Ballele, ada def filter_genes_barcodes_hatchetblock(adata, cell_snp_Aallele, cell_snp_Ballele, snp_barcodes, unique_snp_ids, config, min_umi=100, min_spot_percent=0.005, ordered_chr=[str(c) for c in range(1,23)]): # filter out spots with too small number of UMIs - indicator = (np.sum(adata.layers["count"], axis=1) > min_umi) + indicator = (np.sum(adata.X, axis=1).A.flatten() > min_umi) adata = adata[indicator, :] cell_snp_Aallele = cell_snp_Aallele[indicator, :] cell_snp_Ballele = cell_snp_Ballele[indicator, :] @@ -354,14 +353,14 @@ def filter_genes_barcodes_hatchetblock(adata, cell_snp_Aallele, cell_snp_Ballele genenames = set(list(adata.var.index[indicator])) adata = adata[:, indicator] print(adata) - print("median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print("median UMI after filtering out genes < 0.5% of cells = {}".format( np.median(np.sum(adata.X, axis=1).A.flatten()) )) if not config["filtergenelist_file"] is None: filter_gene_list = pd.read_csv(config["filtergenelist_file"], header=None) filter_gene_list = set(list( filter_gene_list.iloc[:,0] )) indicator_filter = np.array([ (not x in filter_gene_list) for x in adata.var.index ]) adata = adata[:, indicator_filter] - print("median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.layers["count"], axis=1)) )) + print("median UMI after filtering out genes in filtergenelist_file = {}".format( np.median(np.sum(adata.X, axis=1).A.flatten()) )) if not config["filterregion_file"] is None: regions = pd.read_csv(config["filterregion_file"], header=None, sep="\t", names=["Chrname", "Start", "End"]) @@ -517,27 +516,54 @@ def combine_gene_snps(unique_snp_ids, hgtable_file, adata): # check the what gene each SNP belongs to # for each SNP (with not null snp_id), find the previous gene (is_interval == True) such that the SNP start position is within the gene start and end interval - vec_is_interval = df_gene_snp.is_interval.values + # vector of chr, start, and end for genes + gene_names = df_gene_snp[df_gene_snp.is_interval].gene.values + gene_chr = df_gene_snp[df_gene_snp.is_interval].CHR.values + gene_start = df_gene_snp[df_gene_snp.is_interval].START.values + gene_end = df_gene_snp[df_gene_snp.is_interval].END.values + # vector 
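Note: these hunks consistently replace sums over the dense `adata.layers["count"]` with sums over the sparse `adata.X`, which is why `.A.flatten()` appears throughout: summing a scipy sparse matrix along an axis returns a 2-D `np.matrix` rather than a 1-D array. A two-line demonstration:

```python
# Why the `.A.flatten()` idiom is needed after summing a sparse matrix.
import numpy as np
import scipy.sparse

X = scipy.sparse.csr_matrix(np.array([[1, 0, 2], [0, 3, 0]]))
row_sums = np.sum(X, axis=1)            # matrix([[3], [3]]), shape (2, 1)
flat = np.sum(X, axis=1).A.flatten()    # array([3, 3]), shape (2,)
print(row_sums.shape, flat.shape)       # (2, 1) (2,)
```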
@@ -517,27 +516,54 @@ def combine_gene_snps(unique_snp_ids, hgtable_file, adata):
 
    # check what gene each SNP belongs to
    # for each SNP (with not null snp_id), find the previous gene (is_interval == True) such that the SNP start position is within the gene start and end interval
-    vec_is_interval = df_gene_snp.is_interval.values
+    # vectors of chr, start, and end for genes
+    gene_names = df_gene_snp[df_gene_snp.is_interval].gene.values
+    gene_chr = df_gene_snp[df_gene_snp.is_interval].CHR.values
+    gene_start = df_gene_snp[df_gene_snp.is_interval].START.values
+    gene_end = df_gene_snp[df_gene_snp.is_interval].END.values
+    # vectors of chr and start for all entries
    vec_chr = df_gene_snp.CHR.values
    vec_start = df_gene_snp.START.values
-    vec_end = df_gene_snp.END.values
+    vec_genes = df_gene_snp.gene.values
+    last_j = 0
    for i in np.where(df_gene_snp.gene.isnull())[0]:
        if i == 0:
            continue
        this_pos = vec_start[i]
-        j = i - 1
-        while j >= 0 and j >= i-50 and vec_chr[i] == vec_chr[j]:
-            if vec_is_interval[j] and vec_start[j] <= this_pos and vec_end[j] > this_pos:
-                df_gene_snp.iloc[i, 4] = df_gene_snp.iloc[j]["gene"]
+        last_j = next((j for j in range(last_j, len(gene_chr)) if gene_chr[j] > vec_chr[i] or (gene_chr[j] == vec_chr[i] and gene_end[j] > this_pos)), len(gene_chr))
+        j = last_j
+        while j < len(gene_chr) and gene_chr[j] == vec_chr[i] and gene_start[j] <= this_pos:
+            if gene_chr[j] == vec_chr[i] and gene_start[j] <= this_pos and gene_end[j] > this_pos:
+                vec_genes[i] = gene_names[j]
+                last_j = j
                break
-            j -= 1
+            j += 1
+
+    df_gene_snp["gene"] = vec_genes
+
+    # # check what gene each SNP belongs to
+    # # for each SNP (with not null snp_id), find the previous gene (is_interval == True) such that the SNP start position is within the gene start and end interval
+    # vec_is_interval = df_gene_snp.is_interval.values
+    # vec_chr = df_gene_snp.CHR.values
+    # vec_start = df_gene_snp.START.values
+    # vec_end = df_gene_snp.END.values
+    # for i in np.where(df_gene_snp.gene.isnull())[0]:
+    #     if i == 0:
+    #         continue
+    #     this_pos = vec_start[i]
+    #     j = i - 1
+    #     while j >= 0 and j >= i-50 and vec_chr[i] == vec_chr[j]:
+    #         if vec_is_interval[j] and vec_start[j] <= this_pos and vec_end[j] > this_pos:
+    #             df_gene_snp.iloc[i, 4] = df_gene_snp.iloc[j]["gene"]
+    #             break
+    #         j -= 1
 
    # remove SNPs that have no corresponding genes
-    df_gene_snp = df_gene_snp[~df_gene_snp.gene.isnull()]
+    # df_gene_snp = df_gene_snp[~df_gene_snp.gene.isnull()]
 
    return df_gene_snp
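Note: the rewritten assignment loop sweeps the SNPs and the sorted gene intervals in tandem, resuming from `last_j`, instead of scanning up to 50 rows backwards for every SNP. A minimal sketch of the same two-pointer idea on toy, non-overlapping intervals (the real code additionally handles overlapping genes and chromosome boundaries):

```python
# Toy sketch of the two-pointer sweep: assign each sorted position to the first
# sorted interval [start, end) containing it, resuming the interval pointer
# instead of rescanning from the beginning for every position.
starts = [0, 100, 250]                 # sorted gene starts (one chromosome for brevity)
ends   = [50, 200, 400]                # matching gene ends
names  = ["geneA", "geneB", "geneC"]
positions = [10, 120, 199, 260, 500]   # sorted SNP positions

assigned, j = [], 0
for pos in positions:
    while j < len(ends) and ends[j] <= pos:   # skip intervals that end before pos
        j += 1
    if j < len(starts) and starts[j] <= pos:  # pos falls inside interval j
        assigned.append(names[j])
    else:
        assigned.append(None)                 # pos lies between genes
print(assigned)  # ['geneA', 'geneB', 'geneB', 'geneC', None]
```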
""" # first level: partition of genome: by gene regions (if two genes overlap, they are grouped to one region) - tmp_block_genome_intervals = list(zip( df_gene_snp[df_gene_snp.is_interval].CHR.values, df_gene_snp[df_gene_snp.is_interval].START.values, df_gene_snp[df_gene_snp.is_interval].END.values )) + # tmp_block_genome_intervals = list(zip( df_gene_snp[df_gene_snp.is_interval].CHR.values, df_gene_snp[df_gene_snp.is_interval].START.values, df_gene_snp[df_gene_snp.is_interval].END.values )) + tmp_block_genome_intervals = list(zip( df_gene_snp.CHR.values, df_gene_snp.START.values, df_gene_snp.END.values )) block_genome_intervals = [tmp_block_genome_intervals[0]] for x in tmp_block_genome_intervals[1:]: # check whether overlap with previous block @@ -557,57 +584,106 @@ def create_haplotype_block_ranges(df_gene_snp, adata, cell_snp_Aallele, cell_snp block_genome_intervals.append(x) # get block_ranges in the index of df_gene_snp block_ranges = [] + last_s = 0 + chr_start_end = list(zip(df_gene_snp.CHR.values, df_gene_snp.START.values, df_gene_snp.END.values)) for x in block_genome_intervals: - indexes = np.where((df_gene_snp.CHR.values == x[0]) & \ - (np.maximum(df_gene_snp.START.values, x[1]) < np.minimum(df_gene_snp.END.values, x[2])) )[0] - block_ranges.append( (indexes[0], indexes[-1]+1) ) - assert np.all( np.array(np.array([x[1] for x in block_ranges[:-1]])) == np.array(np.array([x[0] for x in block_ranges[1:]])) ) + t = next((i for i in range(last_s, len(chr_start_end)) if chr_start_end[i][0] > x[0] or chr_start_end[i][1] >= x[2]), len(chr_start_end)) + block_ranges.append( (last_s, t) ) + last_s = t + # for x in block_genome_intervals: + # indexes = np.where((df_gene_snp.CHR.values == x[0]) & \ + # (np.maximum(df_gene_snp.START.values, x[1]) < np.minimum(df_gene_snp.END.values, x[2])) )[0] + # block_ranges.append( (indexes[0], indexes[-1]+1) ) + # assert np.all( np.array(np.array([x[1] for x in block_ranges[:-1]])) == np.array(np.array([x[0] for x in block_ranges[1:]])) ) # record the initial block id in df_gene_snps - df_gene_snp["initial_block_id"] = 0 + initial_block_id = np.zeros(df_gene_snp.shape[0], dtype=int) for i,x in enumerate(block_ranges): - df_gene_snp.iloc[x[0]:x[1], -1] = i - - # second level: group the first level blocks into haplotype blocks such that the minimum SNP-covering UMI counts >= initial_min_umi - map_snp_index = {x:i for i,x in enumerate(unique_snp_ids)} - initial_block_chr = df_gene_snp.CHR.values[ np.array([x[0] for x in block_ranges]) ] + initial_block_id[x[0]:x[1]] = i + if not np.all(initial_block_id[:-1] <= initial_block_id[1:]): + i = np.where(initial_block_id[:-1] > initial_block_id[1:])[0][0] + logger.error(f"Not all genes/SNPs are assigned to initial blocks properly. 
{df_gene_snp.iloc[i:(i+2),:]}, {initial_block_id[i:(i+2)]}") + df_gene_snp["initial_block_id"] = initial_block_id + + # second level: group the first level blocks into haplotype blocks such that the minimum SNP-covering UMI counts >= htblock_min_umi + snpumi = np.zeros(df_gene_snp.shape[0], dtype=int) + snpumi[~df_gene_snp.is_interval] = np.sum(cell_snp_Aallele, axis=0).A.flatten() + np.sum(cell_snp_Ballele, axis=0).A.flatten() + cumsum_snpumi = np.append(0, np.cumsum(snpumi)) + count_num_snps = np.append(0, np.cumsum((~df_gene_snp.is_interval).values)) block_ranges_new = [] s = 0 - while s < len(block_ranges): - t = s - while t <= len(block_ranges): - t += 1 - reach_end = (t == len(block_ranges)) - change_chr = (initial_block_chr[s] != initial_block_chr[t-1]) - # count SNP-covering UMI - involved_snps_ids = df_gene_snp[ (df_gene_snp.initial_block_id>=s) & (df_gene_snp.initial_block_id= t_initial_block), df_gene_snp.shape[0]) + while t_initial_block <= len(block_ranges): + # t = s + np.where(initial_block_id[s:] < t_initial_block)[0][-1] + 1 + reach_end = (t_initial_block == len(block_ranges)) + change_chr = (df_gene_snp.CHR.values[s] != df_gene_snp.CHR.values[t-1]) + # check snp UMIs + this_snp_umis = cumsum_snpumi[t] - cumsum_snpumi[s] + this_num_snps = count_num_snps[t] - count_num_snps[s] if reach_end: break if change_chr: - t -= 1 - # re-count SNP-covering UMIs - involved_snps_ids = df_gene_snp.snp_id.iloc[block_ranges[s][0]:block_ranges[t-1][1]] - involved_snps_ids = involved_snps_ids[~involved_snps_ids.isnull()].values - involved_snp_idx = np.array([map_snp_index[x] for x in involved_snps_ids]) - this_snp_umis = 0 if len(involved_snp_idx) == 0 else np.sum( cell_snp_Aallele[:, involved_snp_idx]) + np.sum(cell_snp_Ballele[:, involved_snp_idx]) + t_initial_block -= 1 + # t = np.where(initial_block_id < t_initial_block)[0][-1] + 1 + t = next((s + i for i in range(len(initial_block_id[s:])) if initial_block_id[s+i] >= t_initial_block), df_gene_snp.shape[0]) + this_snp_umis = cumsum_snpumi[t] - cumsum_snpumi[s] + this_num_snps = count_num_snps[t] - count_num_snps[s] break - if this_snp_umis >= initial_min_umi: + if this_snp_umis >= htblock_min_umi and this_num_snps >= htblock_min_snps: break + t_initial_block += 1 + t = next((s + i for i in range(len(initial_block_id[s:])) if initial_block_id[s+i] >= t_initial_block), df_gene_snp.shape[0]) # - if this_snp_umis < initial_min_umi and s > 0 and initial_block_chr[s-1] == initial_block_chr[s]: - indexes = np.where(df_gene_snp.initial_block_id.isin(np.arange(s, t)))[0] - block_ranges_new[-1] = (block_ranges_new[-1][0], indexes[-1]+1) + if this_snp_umis < htblock_min_umi and s > 0 and df_gene_snp.CHR.values[s-1] == df_gene_snp.CHR.values[s]: + block_ranges_new[-1] = (block_ranges_new[-1][0], t) else: - indexes = np.where(df_gene_snp.initial_block_id.isin(np.arange(s, t)))[0] - block_ranges_new.append( (indexes[0], indexes[-1]+1) ) + block_ranges_new.append( (s, t) ) s = t + + # map_snp_index = {x:i for i,x in enumerate(unique_snp_ids)} + # initial_block_chr = df_gene_snp.CHR.values[ np.array([x[0] for x in block_ranges]) ] + # block_ranges_new = [] + # s = 0 + # while s < len(block_ranges): + # t = s + # while t <= len(block_ranges): + # t += 1 + # reach_end = (t == len(block_ranges)) + # change_chr = (initial_block_chr[s] != initial_block_chr[t-1]) + # # count SNP-covering UMI + # involved_snps_ids = df_gene_snp[ (df_gene_snp.initial_block_id>=s) & (df_gene_snp.initial_block_id= htblock_min_umi: + # break + # # + # if this_snp_umis < 
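Note: the rewritten grouping precomputes prefix sums (`cumsum_snpumi`, `count_num_snps`) so the UMI and SNP totals of any candidate block `[s, t)` cost two subtractions, instead of re-slicing the allele-count matrices on every iteration. A toy sketch of the trick:

```python
# Toy sketch of the prefix-sum trick: after one cumulative pass, the total of
# any half-open range [s, t) is cumsum[t] - cumsum[s].
import numpy as np

per_row_umi = np.array([5, 0, 3, 7, 2])            # e.g. SNP-covering UMIs per row
cumsum_umi = np.append(0, np.cumsum(per_row_umi))  # [0, 5, 5, 8, 15, 17]

s, t = 1, 4                                        # candidate block spanning rows 1..3
range_total = cumsum_umi[t] - cumsum_umi[s]        # 0 + 3 + 7 = 10, no slicing required
print(range_total)  # 10
```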
htblock_min_umi and s > 0 and initial_block_chr[s-1] == initial_block_chr[s]: + # indexes = np.where(df_gene_snp.initial_block_id.isin(np.arange(s, t)))[0] + # block_ranges_new[-1] = (block_ranges_new[-1][0], indexes[-1]+1) + # else: + # indexes = np.where(df_gene_snp.initial_block_id.isin(np.arange(s, t)))[0] + # block_ranges_new.append( (indexes[0], indexes[-1]+1) ) + # s = t # record the block id in df_gene_snps - df_gene_snp["block_id"] = 0 + block_id = np.zeros(df_gene_snp.shape[0], dtype=int) for i,x in enumerate(block_ranges_new): - df_gene_snp.iloc[x[0]:x[1], -1] = i + block_id[x[0]:x[1]] = i + df_gene_snp["block_id"] = block_id df_gene_snp = df_gene_snp.drop(columns=["initial_block_id"]) return df_gene_snp @@ -625,36 +701,35 @@ def summarize_counts_for_blocks(df_gene_snp, adata, cell_snp_Aallele, cell_snp_B lengths : array, (n_chromosomes,) Number of blocks per chromosome. - single_X : array, (n_blocks, 2, n_spots) - Transcript counts and B allele count per block per cell. + sp_single_X_rdr : array, (n_blocks, n_spots) + Transcript counts per block per cell in csr matrix. - single_base_nb_mean : array, (n_blocks, n_spots) - Baseline transcript counts in normal diploid per block per cell. + sp_single_X_b : array, (n_blocks, n_spots) + B allele count per block per cell. - single_total_bb_RD : array, (n_blocks, n_spots) + sp_single_total_bb_RD : array, (n_blocks, n_spots) Total allele count per block per cell. log_sitewise_transmat : array, (n_blocks,) Log phase switch probability between each pair of adjacent blocks. """ blocks = df_gene_snp.block_id.unique() - single_X = np.zeros((len(blocks), 2, adata.shape[0]), dtype=int) - single_base_nb_mean = np.zeros((len(blocks), adata.shape[0])) - single_total_bb_RD = np.zeros((len(blocks), adata.shape[0]), dtype=int) - # summarize counts of involved genes and SNPs within each block - map_snp_index = {x:i for i,x in enumerate(unique_snp_ids)} - df_block_contents = df_gene_snp.groupby('block_id').agg({"snp_id":list, "gene":list}) - for b in range(df_block_contents.shape[0]): - # BAF (SNPs) - involved_snps_ids = [x for x in df_block_contents.snp_id.values[b] if not x is None] - involved_snp_idx = np.array([map_snp_index[x] for x in involved_snps_ids]) - if len(involved_snp_idx) > 0: - single_X[b, 1, :] = np.sum( cell_snp_Aallele[:, involved_snp_idx], axis=1 ) - single_total_bb_RD[b, :] = np.sum( cell_snp_Aallele[:, involved_snp_idx], axis=1 ) + np.sum( cell_snp_Ballele[:, involved_snp_idx], axis=1 ) - # RDR (genes) - involved_genes = list(set([x for x in df_block_contents.gene.values[b] if not x is None])) - if len(involved_genes) > 0: - single_X[b, 0, :] = np.sum( adata.layers['count'][:, adata.var.index.isin(involved_genes)], axis=1 ) + + # transcript count + adata.var['row_num'] = np.arange(adata.shape[1]) + idx_row = adata.var.loc[df_gene_snp[df_gene_snp.is_interval].gene.values].row_num + idx_col = df_gene_snp[df_gene_snp.is_interval].block_id.values + rdr_mul = scipy.sparse.csr_matrix(( np.ones(len(idx_row)), (idx_row, idx_col) ), shape=(adata.shape[1], len(blocks))) + sp_single_X_rdr = (adata.X @ rdr_mul).T + adata.var.drop(columns=['row_num'], inplace=True) + + # B count and total count + idx_row = np.arange(cell_snp_Aallele.shape[1]) + idx_col = df_gene_snp[~df_gene_snp.is_interval].block_id.values + assert len(idx_row) == len(idx_col) + baf_mul = scipy.sparse.csr_matrix(( np.ones(len(idx_row)), (idx_row, idx_col) ), shape=(cell_snp_Aallele.shape[1], len(blocks))) + sp_single_X_b = (cell_snp_Aallele @ baf_mul).T + 
@@ -625,36 +701,35 @@ def summarize_counts_for_blocks(df_gene_snp, adata, cell_snp_Aallele, cell_snp_B
     lengths : array, (n_chromosomes,)
         Number of blocks per chromosome.

-    single_X : array, (n_blocks, 2, n_spots)
-        Transcript counts and B allele count per block per cell.
+    sp_single_X_rdr : csr_matrix, (n_blocks, n_spots)
+        Transcript counts per block per cell, as a sparse matrix.

-    single_base_nb_mean : array, (n_blocks, n_spots)
-        Baseline transcript counts in normal diploid per block per cell.
+    sp_single_X_b : csr_matrix, (n_blocks, n_spots)
+        B allele count per block per cell.

-    single_total_bb_RD : array, (n_blocks, n_spots)
+    sp_single_total_bb_RD : csr_matrix, (n_blocks, n_spots)
         Total allele count per block per cell.

     log_sitewise_transmat : array, (n_blocks,)
         Log phase switch probability between each pair of adjacent blocks.
     """
     blocks = df_gene_snp.block_id.unique()
-    single_X = np.zeros((len(blocks), 2, adata.shape[0]), dtype=int)
-    single_base_nb_mean = np.zeros((len(blocks), adata.shape[0]))
-    single_total_bb_RD = np.zeros((len(blocks), adata.shape[0]), dtype=int)
-    # summarize counts of involved genes and SNPs within each block
-    map_snp_index = {x:i for i,x in enumerate(unique_snp_ids)}
-    df_block_contents = df_gene_snp.groupby('block_id').agg({"snp_id":list, "gene":list})
-    for b in range(df_block_contents.shape[0]):
-        # BAF (SNPs)
-        involved_snps_ids = [x for x in df_block_contents.snp_id.values[b] if not x is None]
-        involved_snp_idx = np.array([map_snp_index[x] for x in involved_snps_ids])
-        if len(involved_snp_idx) > 0:
-            single_X[b, 1, :] = np.sum( cell_snp_Aallele[:, involved_snp_idx], axis=1 )
-            single_total_bb_RD[b, :] = np.sum( cell_snp_Aallele[:, involved_snp_idx], axis=1 ) + np.sum( cell_snp_Ballele[:, involved_snp_idx], axis=1 )
-        # RDR (genes)
-        involved_genes = list(set([x for x in df_block_contents.gene.values[b] if not x is None]))
-        if len(involved_genes) > 0:
-            single_X[b, 0, :] = np.sum( adata.layers['count'][:, adata.var.index.isin(involved_genes)], axis=1 )
+
+    # transcript count
+    adata.var['row_num'] = np.arange(adata.shape[1])
+    idx_row = adata.var.loc[df_gene_snp[df_gene_snp.is_interval].gene.values].row_num
+    idx_col = df_gene_snp[df_gene_snp.is_interval].block_id.values
+    rdr_mul = scipy.sparse.csr_matrix(( np.ones(len(idx_row)), (idx_row, idx_col) ), shape=(adata.shape[1], len(blocks)))
+    sp_single_X_rdr = (adata.X @ rdr_mul).T
+    adata.var.drop(columns=['row_num'], inplace=True)
+
+    # B count and total count
+    idx_row = np.arange(cell_snp_Aallele.shape[1])
+    idx_col = df_gene_snp[~df_gene_snp.is_interval].block_id.values
+    assert len(idx_row) == len(idx_col)
+    baf_mul = scipy.sparse.csr_matrix(( np.ones(len(idx_row)), (idx_row, idx_col) ), shape=(cell_snp_Aallele.shape[1], len(blocks)))
+    sp_single_X_b = (cell_snp_Aallele @ baf_mul).T
+    sp_single_total_bb_RD = ((cell_snp_Aallele + cell_snp_Ballele) @ baf_mul).T

     # lengths
     lengths = np.zeros(len(df_gene_snp.CHR.unique()), dtype=int)
@@ -666,7 +741,7 @@ def summarize_counts_for_blocks(df_gene_snp, adata, cell_snp_Aallele, cell_snp_B
     sorted_chr_pos_first = list(zip(sorted_chr_pos_first.CHR.values, sorted_chr_pos_first.START.values))
     sorted_chr_pos_last = df_gene_snp.groupby('block_id').agg({'CHR': 'last', 'END': 'last'})
     sorted_chr_pos_last = list(zip(sorted_chr_pos_last.CHR.values, sorted_chr_pos_last.END.values))
-    #
+
     tmp_sorted_chr_pos = [val for pair in zip(sorted_chr_pos_first, sorted_chr_pos_last) for val in pair]
     position_cM = get_position_cM_table( tmp_sorted_chr_pos, geneticmap_file )
     phase_switch_prob = compute_phase_switch_probability_position(position_cM, tmp_sorted_chr_pos, nu)
@@ -674,7 +749,71 @@
     # log_sitewise_transmat = log_sitewise_transmat[np.arange(0, len(log_sitewise_transmat), 2)]
     log_sitewise_transmat = log_sitewise_transmat[np.arange(1, len(log_sitewise_transmat), 2)]

-    return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat
+    return lengths, sp_single_X_rdr, sp_single_X_b, sp_single_total_bb_RD, log_sitewise_transmat


def choose_umithreshold_given_nbins(single_total_bb_RD, refined_lengths, expected_nbins):
""" - bins = df_gene_snp.bin_id.unique() - bin_single_X = np.zeros((len(bins), 2, adata.shape[0]), dtype=int) - bin_single_base_nb_mean = np.zeros((len(bins), adata.shape[0])) - bin_single_total_bb_RD = np.zeros((len(bins), adata.shape[0]), dtype=int) - # summarize counts of involved genes and SNPs within each block - df_bin_contents = df_gene_snp[~df_gene_snp.bin_id.isnull()].groupby('bin_id').agg({"block_id":set, "gene":set}) - for b in range(df_bin_contents.shape[0]): - # BAF (SNPs) - involved_blocks = [x for x in df_bin_contents.block_id.values[b] if not x is None] - this_phased = np.where(phase_indicator[involved_blocks].reshape(-1,1), single_X[involved_blocks, 1, :], single_total_bb_RD[involved_blocks, :] - single_X[involved_blocks, 1, :]) - bin_single_X[b, 1, :] = np.sum(this_phased, axis=0) - bin_single_total_bb_RD[b, :] = np.sum( single_total_bb_RD[involved_blocks, :], axis=0 ) - # RDR (genes) - involved_genes = [x for x in df_bin_contents.gene.values[b] if not x is None] - bin_single_X[b, 0, :] = np.sum( adata.layers['count'][:, adata.var.index.isin(involved_genes)], axis=1 ) + # bins = df_gene_snp.bin_id.unique() + # bin_single_X = np.zeros((len(bins), 2, adata.shape[0]), dtype=int) + # bin_single_base_nb_mean = np.zeros((len(bins), adata.shape[0])) + # bin_single_total_bb_RD = np.zeros((len(bins), adata.shape[0]), dtype=int) + # # summarize counts of involved genes and SNPs within each block + # df_bin_contents = df_gene_snp[~df_gene_snp.bin_id.isnull()].groupby('bin_id').agg({"block_id":set, "gene":set}) + # for b in range(df_bin_contents.shape[0]): + # # BAF (SNPs) + # involved_blocks = [x for x in df_bin_contents.block_id.values[b] if not x is None] + # this_phased = np.where(phase_indicator[involved_blocks].reshape(-1,1), single_X[involved_blocks, 1, :], single_total_bb_RD[involved_blocks, :] - single_X[involved_blocks, 1, :]) + # bin_single_X[b, 1, :] = np.sum(this_phased, axis=0) + # bin_single_total_bb_RD[b, :] = np.sum( single_total_bb_RD[involved_blocks, :], axis=0 ) + # # RDR (genes) + # involved_genes = [x for x in df_bin_contents.gene.values[b] if not x is None] + # bin_single_X[b, 0, :] = np.sum( adata.layers['count'][:, adata.var.index.isin(involved_genes)], axis=1 ) + + # incorporate phase indicator into B allele count and generate phased_sp_single_X_b + # collect all nonzero entries in sp_single_total_bb_RD, collect the corresponding B count from sp_single_X_b + df_allele_count = pd.DataFrame({'row':sp_single_total_bb_RD.nonzero()[0], + 'col':sp_single_total_bb_RD.nonzero()[1], + 'val':sp_single_total_bb_RD[sp_single_total_bb_RD.nonzero()].A.flatten()}) + df_allele_count['unphased_B'] = sp_single_X_b[ (df_allele_count.row, df_allele_count.col) ].A.flatten() + # add phase indicator information + df_allele_count['phase_indicator'] = phase_indicator[ df_allele_count.row.values ] + # get phased B count + df_allele_count['phased_B'] = np.where(df_allele_count.phase_indicator, df_allele_count.unphased_B, df_allele_count.val - df_allele_count.unphased_B) + # create phased_sp_single_X_b + phased_sp_single_X_b = scipy.sparse.csr_matrix((df_allele_count.phased_B.values, (df_allele_count.row.values, df_allele_count.col.values)), shape=sp_single_X_b.shape) + + n_bins = len(df_gene_snp.bin_id.unique()) + N = sp_single_X_rdr.shape[1] + mul = scipy.sparse.csr_matrix((np.ones(df_gene_snp.shape[0]), (df_gene_snp.bin_id, df_gene_snp.block_id)) ) + bin_single_X = np.zeros((n_bins, 2, N)) + bin_single_X[:,0,:] = (mul @ sp_single_X_rdr).toarray() + bin_single_X[:,1,:] = (mul @ 
phased_sp_single_X_b).toarray() + bin_single_total_bb_RD = (mul @ sp_single_total_bb_RD).toarray() + bin_single_base_nb_mean = np.zeros(bin_single_total_bb_RD.shape) # lengths lengths = np.zeros(len(df_gene_snp.CHR.unique()), dtype=int) @@ -1018,77 +1180,19 @@ def bin_selection_basedon_normal(df_gene_snp, single_X, single_base_nb_mean, sin return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp -def filter_de_genes(exp_counts, x_gene_list, normal_candidate, sample_list=None, sample_ids=None, logfcthreshold=4, quantile_threshold=80): - adata = anndata.AnnData(exp_counts) - adata.layers["count"] = exp_counts.values - adata.obs["normal_candidate"] = normal_candidate - # - map_gene_adatavar = {} - map_gene_umi = {} - list_gene_umi = np.sum(adata.layers["count"], axis=0) - for i,x in enumerate(adata.var.index): - map_gene_adatavar[x] = i - map_gene_umi[x] = list_gene_umi[i] - # - if sample_list is None: - sample_list = [None] - # - filtered_out_set = set() - for s,sname in enumerate(sample_list): - if sname is None: - index = np.arange(adata.shape[0]) - else: - index = np.where(sample_ids == s)[0] - tmpadata = adata[index, :].copy() - # - umi_threshold = np.percentile( np.sum(tmpadata.layers["count"], axis=0), quantile_threshold ) - # - sc.pp.filter_cells(tmpadata, min_genes=200) - sc.pp.filter_genes(tmpadata, min_cells=10) - med = np.median( np.sum(tmpadata.layers["count"], axis=1) ) - # sc.pp.normalize_total(tmpadata, target_sum=1e4) - sc.pp.normalize_total(tmpadata, target_sum=med) - sc.pp.log1p(tmpadata) - # new added - sc.pp.pca(tmpadata, n_comps=4) - kmeans = KMeans(n_clusters=2, random_state=0).fit(tmpadata.obsm["X_pca"]) - kmeans_labels = kmeans.predict(tmpadata.obsm["X_pca"]) - idx_kmeans_label = np.argmax(np.bincount( kmeans_labels[tmpadata.obs["normal_candidate"]], minlength=2 )) - clone = np.array(["normal"] * tmpadata.shape[0]) - clone[ (kmeans_labels != idx_kmeans_label) & (~tmpadata.obs["normal_candidate"]) ] = "tumor" - tmpadata.obs["clone"] = clone - # end added - sc.tl.rank_genes_groups(tmpadata, 'clone', groups=["tumor"], reference="normal", method='wilcoxon') - genenames = np.array([ x[0] for x in tmpadata.uns["rank_genes_groups"]["names"] ]) - logfc = np.array([ x[0] for x in tmpadata.uns["rank_genes_groups"]["logfoldchanges"] ]) - geneumis = np.array([ map_gene_umi[x] for x in genenames]) - this_filtered_out_set = set(list(genenames[ (np.abs(logfc) > logfcthreshold) & (geneumis > umi_threshold) ])) - filtered_out_set = filtered_out_set | this_filtered_out_set - print(f"Filter out {len(filtered_out_set)} DE genes") - # - new_single_X_rdr = np.zeros((len(x_gene_list), adata.shape[0])) - for i,x in enumerate(x_gene_list): - g_list = [z for z in x.split() if z != ""] - idx_genes = np.array([ map_gene_adatavar[g] for g in g_list if (not g in filtered_out_set) and (g in map_gene_adatavar)]) - if len(idx_genes) > 0: - new_single_X_rdr[i, :] = np.sum(adata.layers["count"][:, idx_genes], axis=1) - return new_single_X_rdr, filtered_out_set - -def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=None, sample_ids=None, logfcthreshold_u=2, logfcthreshold_t=4, quantile_threshold=80): +def filter_de_genes_tri(adata, df_bininfo, normal_candidate, sample_list=None, sample_ids=None, logfcthreshold_u=2, logfcthreshold_t=4, quantile_threshold=80): """ Attributes ---------- df_bininfo : pd.DataFrame Contains columns ['CHR', 'START', 'END', 'INCLUDED_GENES', 'INCLUDED_SNP_IDS'], 'INCLUDED_GENES' contains space-delimited gene 
names. """ - adata = anndata.AnnData(exp_counts) - adata.layers["count"] = exp_counts.values adata.obs["normal_candidate"] = normal_candidate # map_gene_adatavar = {} map_gene_umi = {} - list_gene_umi = np.sum(adata.layers["count"], axis=0) + list_gene_umi = np.sum(adata.X, axis=0).A.flatten() for i,x in enumerate(adata.var.index): map_gene_adatavar[x] = i map_gene_umi[x] = list_gene_umi[i] @@ -1103,14 +1207,14 @@ def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=No else: index = np.where(sample_ids == s)[0] tmpadata = adata[index, :].copy() - if np.sum(tmpadata.layers["count"][tmpadata.obs["normal_candidate"], :]) < tmpadata.shape[1] * 10: + if np.sum(tmpadata.X[tmpadata.obs["normal_candidate"], :]) < tmpadata.shape[1] * 10: continue # - umi_threshold = np.percentile( np.sum(tmpadata.layers["count"], axis=0), quantile_threshold ) + umi_threshold = np.percentile( np.sum(tmpadata.X, axis=0).A.flatten(), quantile_threshold ) # # sc.pp.filter_cells(tmpadata, min_genes=200) sc.pp.filter_genes(tmpadata, min_cells=10) - med = np.median( np.sum(tmpadata.layers["count"], axis=1) ) + med = np.median( np.sum(tmpadata.X, axis=1).A.flatten() ) # sc.pp.normalize_total(tmpadata, target_sum=1e4) sc.pp.normalize_total(tmpadata, target_sum=med) sc.pp.log1p(tmpadata) @@ -1136,7 +1240,7 @@ def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=No # geneumis_u = np.array([ map_gene_umi[x] for x in genenames_u]) # this_filtered_out_set = set(list(genenames_t[ (np.abs(logfc_t) > logfcthreshold) & (geneumis_t > umi_threshold) ])) | set(list(genenames_u[ (np.abs(logfc_u) > logfcthreshold) & (geneumis_u > umi_threshold) ])) # - agg_counts = np.vstack([ np.sum(tmpadata.layers["count"][tmpadata.obs['clone']==c,:], axis=0) for c in ['normal', 'unsure', 'tumor'] ]) + agg_counts = np.vstack([ np.sum(tmpadata.X[tmpadata.obs['clone']==c,:], axis=0).A.flatten() for c in ['normal', 'unsure', 'tumor'] ]) agg_counts = agg_counts / np.sum(agg_counts, axis=1, keepdims=True) * 1e6 geneumis = np.array([ map_gene_umi[x] for x in tmpadata.var.index]) logfc_u = np.where( ((agg_counts[1,:]==0) | (agg_counts[0,:]==0)), 10, np.log2(agg_counts[1,:] / agg_counts[0,:]) ) @@ -1150,7 +1254,7 @@ def filter_de_genes_tri(exp_counts, df_bininfo, normal_candidate, sample_list=No for b,genestr in enumerate(df_bininfo.INCLUDED_GENES.values): # RDR (genes) involved_genes = set(genestr.split(" ")) - filtered_out_set - new_single_X_rdr[b, :] = np.sum( adata.layers['count'][:, adata.var.index.isin(involved_genes)], axis=1 ) + new_single_X_rdr[b, :] = np.sum( adata.X[:, adata.var.index.isin(involved_genes)], axis=1 ).A.flatten() return new_single_X_rdr, filtered_out_set diff --git a/src/calicost/utils_hmrf.py b/src/calicost/utils_hmrf.py index bee9f42..e044e1f 100644 --- a/src/calicost/utils_hmrf.py +++ b/src/calicost/utils_hmrf.py @@ -99,7 +99,7 @@ def choose_adjacency_by_KNN(coords, exp_counts=None, w=1, maxspots_pooling=7): coords : array, shape (n_spots, 2) Spatial coordinates of spots. - exp_counts : None or array, shape (n_spots, n_genes) + exp_counts : sparse matrix, shape (n_spots, n_genes) Expression counts of spots. 
w : float
@@ -114,8 +114,8 @@ def choose_adjacency_by_KNN(coords, exp_counts=None, w=1, maxspots_pooling=7):
     pair_exp_dist = scipy.sparse.csr_matrix( np.zeros((n_spots,n_spots)) )
     scaling_factor = 1
     if not exp_counts is None:
-        adata = anndata.AnnData( pd.DataFrame(exp_counts) )
-        sc.pp.normalize_total(adata, target_sum=np.median(np.sum(exp_counts.values,axis=1)) )
+        adata = anndata.AnnData( exp_counts )
+        sc.pp.normalize_total(adata, target_sum=np.median(np.sum(exp_counts,axis=1).A.flatten()) )
         sc.pp.log1p(adata)
         sc.tl.pca(adata)
         pair_exp_dist = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(adata.obsm["X_pca"]))
@@ -140,6 +140,55 @@ def choose_adjacency_by_KNN(coords, exp_counts=None, w=1, maxspots_pooling=7):
     return smooth_mat, adjacency_mat


+def choose_adjacency_by_triangulation(coords, maxspots_pooling=7):
+    """
+    Create an adjacency matrix by Delaunay triangulation, then expand it by allowing multi-step edges until the median number of adjacent spots gets close to maxspots_pooling.
+
+    Attributes
+    ----------
+    coords : array, shape (n_spots, 2)
+        Spatial coordinates of spots.
+
+    maxspots_pooling : int
+        Maximum number of adjacent spots.
+    """
+    # Delaunay triangulation
+    delaunay = scipy.spatial.Delaunay(coords)
+    num_points = coords.shape[0]
+
+    # Use a set to store edges to avoid duplicates
+    edges_set = set()
+    for simplex in delaunay.simplices:
+        for i in range(len(simplex)):
+            for j in range(i + 1, len(simplex)):
+                edges_set.add(tuple(sorted((simplex[i], simplex[j]))))
+
+    # Convert the set of edges to a list of tuples
+    edges = list(edges_set)
+
+    # Create row and column indices for the adjacency matrix
+    row_ind = [edge[0] for edge in edges]
+    col_ind = [edge[1] for edge in edges]
+
+    # Create the adjacency matrix in CSR format
+    adjacency_matrix = scipy.sparse.csr_matrix((np.ones(len(edges)), (row_ind, col_ind)), shape=(num_points, num_points))
+
+    # make the adjacency matrix symmetric (undirected graph), with values 0 or 1
+    adjacency_matrix = ((adjacency_matrix + adjacency_matrix.T) > 0).astype(int)
+
+    # Expand the adjacency matrix by repeated sparse multiplication (2-step, 4-step, ... edges);
+    # integer dtype keeps the final subtraction below well-defined
+    expand_adjacency_matrix = ((adjacency_matrix @ adjacency_matrix) > 0).astype(int)
+    while np.median(np.sum(expand_adjacency_matrix, axis=0).A1) < maxspots_pooling:
+        adjacency_matrix = expand_adjacency_matrix
+        expand_adjacency_matrix = ((expand_adjacency_matrix @ adjacency_matrix) > 0).astype(int)
+
+    return adjacency_matrix, expand_adjacency_matrix - adjacency_matrix


 def choose_adjacency_by_readcounts_slidedna(coords, maxspots_pooling=30):
     """
     Merge spots such that 95% quantile of read count per SNP per spot exceed count_threshold.
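Note: the Delaunay fallback builds the spot graph without materializing an n_spots x n_spots dense distance matrix, which is what triggers the MemoryError path below. A standalone sketch of the construction on random toy coordinates:

```
import numpy as np
import scipy.spatial
import scipy.sparse

coords = np.random.default_rng(0).random((50, 2))   # toy spot coordinates
tri = scipy.spatial.Delaunay(coords)
edges = {tuple(sorted((s[i], s[j]))) for s in tri.simplices for i in range(3) for j in range(i + 1, 3)}
row, col = zip(*edges)
A = scipy.sparse.csr_matrix((np.ones(len(edges)), (row, col)), shape=(50, 50))
A = ((A + A.T) > 0).astype(int)
# interior points of a 2D Delaunay graph have about 6 neighbors on average
print(np.median(A.sum(axis=0).A1))
```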
@@ -157,24 +206,27 @@ def multislice_adjacency(sample_ids, sample_list, coords, single_total_bb_RD, ex index = np.where(sample_ids == i)[0] this_coords = np.array(coords[index,:]) if construct_adjacency_method == "hexagon": - tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_readcounts(this_coords, single_total_bb_RD[:,index], maxspots_pooling=maxspots_pooling) + try: + tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_readcounts(this_coords, single_total_bb_RD[:,index], maxspots_pooling=maxspots_pooling) + # catch memory error, and then use choose_adjacency_by_triangulation + except MemoryError: + tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_triangulation(this_coords, maxspots_pooling=maxspots_pooling) elif construct_adjacency_method == "KNN": - tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_KNN(this_coords, exp_counts.iloc[index,:], w=construct_adjacency_w, maxspots_pooling=maxspots_pooling) + # tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_KNN(this_coords, exp_counts.iloc[index,:], w=construct_adjacency_w, maxspots_pooling=maxspots_pooling) + tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_KNN(this_coords, exp_counts[index,:], w=construct_adjacency_w, maxspots_pooling=maxspots_pooling) else: raise("Unknown adjacency construction method") # tmpsmooth_mat, tmpadjacency_mat = choose_adjacency_by_readcounts_slidedna(this_coords, maxspots_pooling=config["maxspots_pooling"]) - adjacency_mat.append( tmpadjacency_mat.A ) - smooth_mat.append( tmpsmooth_mat.A ) - adjacency_mat = scipy.linalg.block_diag(*adjacency_mat) - adjacency_mat = scipy.sparse.csr_matrix( adjacency_mat ) + adjacency_mat.append( tmpadjacency_mat ) + smooth_mat.append( tmpsmooth_mat ) + adjacency_mat = scipy.sparse.block_diag(adjacency_mat, format='csr') if not across_slice_adjacency_mat is None: adjacency_mat += across_slice_adjacency_mat - smooth_mat = scipy.linalg.block_diag(*smooth_mat) - smooth_mat = scipy.sparse.csr_matrix( smooth_mat ) + smooth_mat = scipy.sparse.block_diag(smooth_mat, format='csr') return adjacency_mat, smooth_mat -def rectangle_initialize_initial_clone(coords, n_clones, random_state=0): +def rectangle_initialize_initial_clone(coords, n_clones, random_state=0, EPS=1e-8): """ Initialize clone assignment by partition space into p * p blocks (s.t. p * p >= n_clones), and assign each block a clone id. 
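Note: in `multislice_adjacency` above, replacing `scipy.linalg.block_diag` on densified `.A` arrays with `scipy.sparse.block_diag` keeps the per-slice adjacency sparse end to end. Equivalence check on a toy input:

```
import numpy as np
import scipy.linalg
import scipy.sparse

blocks = [scipy.sparse.identity(3, format='csr'), scipy.sparse.identity(2, format='csr')]
sp = scipy.sparse.block_diag(blocks, format='csr')               # stays sparse throughout
dense = scipy.linalg.block_diag(*[b.toarray() for b in blocks])  # the old route: densify first
assert np.array_equal(sp.toarray(), dense)
```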
@@ -194,18 +246,19 @@ def rectangle_initialize_initial_clone(coords, n_clones, random_state=0): np.random.seed(random_state) p = int(np.ceil(np.sqrt(n_clones))) # partition the range of x and y axes - px = np.random.dirichlet( np.ones(p) * 10 ) - px[-1] += 1e-4 - xrange = [np.percentile(coords[:,0], 5), np.percentile(coords[:,0], 95)] - xboundary = xrange[0] + (xrange[1] - xrange[0]) * np.cumsum(px) - xboundary[-1] = np.max(coords[:,0]) + 1 - xdigit = np.digitize(coords[:,0], xboundary, right=True) - py = np.random.dirichlet( np.ones(p) * 10 ) - py[-1] += 1e-4 - yrange = [np.percentile(coords[:,1], 5), np.percentile(coords[:,1], 95)] - yboundary = yrange[0] + (yrange[1] - yrange[0]) * np.cumsum(py) - yboundary[-1] = np.max(coords[:,1]) + 1 - ydigit = np.digitize(coords[:,1], yboundary, right=True) + px = np.random.dirichlet(np.ones(p) * 10) + px[-1] -= EPS + xboundary = np.percentile(coords[:, 0], 100 * np.cumsum(px)) + xboundary[-1] = np.max(coords[:, 0]) + 1 + xdigit = np.digitize(coords[:, 0], xboundary, right=True) + ydigit = np.zeros(coords.shape[0], dtype=int) + for x in range(p): + idx_xbin = np.where(xdigit == x)[0] + py = np.random.dirichlet(np.ones(p) * 10) + py[-1] -= EPS + yboundary = np.percentile(coords[idx_xbin, 1], 100 * np.cumsum(py)) + yboundary[-1] = np.max(coords[:, 1]) + 1 + ydigit[idx_xbin] = np.digitize(coords[idx_xbin, 1], yboundary, right=True) block_id = xdigit * p + ydigit # assigning blocks to clone (note that if sqrt(n_clone) is not an integer, multiple blocks can be assigneed to one clone) # block_clone_map = np.random.randint(low=0, high=n_clones, size=p**2) @@ -646,15 +699,15 @@ def estimator_tumor_proportion(single_X, single_total_bb_RD, assignments, pred_c ---------- 0.5 ( 1-theta ) / (theta * RDR + 1 - theta) = B_count / Total_count for each LOH state. 
""" - # def estimate_purity(T_loh, B_loh, rdr_values): - # features =(T_loh / 2.0 + rdr_values * B_loh - B_loh)[T_loh>0].reshape(-1,1) - # y = (T_loh / 2.0 - B_loh)[T_loh>0] - # return np.linalg.lstsq(features, y, rcond=None)[0] def estimate_purity(T_loh, B_loh, rdr_values): - idx = np.where(T_loh > 0)[0] - model = BAF_Binom(endog=B_loh[idx], exog=np.ones((len(idx),1)), weights=np.ones(len(idx)), exposure=T_loh[idx], offset=np.log(rdr_values[idx]), scaling=0.5) - res = model.fit(disp=False) - return 1.0 / (1.0 + np.exp(res.params)) + features =(T_loh / 2.0 + rdr_values * B_loh - B_loh)[T_loh>0].reshape(-1,1) + y = (T_loh / 2.0 - B_loh)[T_loh>0] + return np.linalg.lstsq(features, y, rcond=None)[0] + # def estimate_purity(T_loh, B_loh, rdr_values): + # idx = np.where(T_loh > 0)[0] + # model = BAF_Binom(endog=B_loh[idx], exog=np.ones((len(idx),1)), weights=np.ones(len(idx)), exposure=T_loh[idx], offset=np.log(rdr_values[idx]), scaling=0.5) + # res = model.fit(disp=False) + # return 1.0 / (1.0 + np.exp(res.params)) # n_obs = single_X.shape[0] n_spots = single_X.shape[2] diff --git a/utils/get_snp_matrix.py b/utils/get_snp_matrix.py index ec9eb09..c791070 100644 --- a/utils/get_snp_matrix.py +++ b/utils/get_snp_matrix.py @@ -93,14 +93,32 @@ def cell_by_gene_lefthap_counts(cellsnp_folder, eagle_folder, barcode_list): DP = DP[is_phased,:] AD = AD[is_phased,:] - # phasing - phased_AD = np.where( (df_snp.GT.values == "0|1").reshape(-1,1), AD.A, (DP-AD).A ) - phased_AD = scipy.sparse.csr_matrix(phased_AD) + # phasing (keep sparse matrix) + # make a dataframe with row, col, value corresponding to the nonzero values in DP + row, col = DP.nonzero() + df_sparse_dp = pd.DataFrame({"row":row, "col":col, "DP":DP.data}) + df_sparse_dp.index = df_sparse_dp.row.astype(str) + "_" + df_sparse_dp.col.astype(str) + # make a dataframe with row, col, value corresponding to the nonzero values in AD + row, col = AD.nonzero() + df_sparse_ad = pd.DataFrame({"row":row, "col":col, "AD":AD.data}) + df_sparse_ad.index = df_sparse_ad.row.astype(str) + "_" + df_sparse_ad.col.astype(str) + df_sparse_dp = df_sparse_dp.join(df_sparse_ad[['AD']], how="outer") + # replace the NULL values in AD with 0 + df_sparse_dp.AD = df_sparse_dp.AD.fillna(0) + df_sparse_dp.AD = df_sparse_dp.AD.astype(int) + # adjust for phasing: if df_sparse_dp.row in np.where(df_snp.GT.values != "0|1")[0], then set AD = DP - AD + df_sparse_dp['AD'] = np.where( (df_snp.GT.values != "0|1")[df_sparse_dp.row.values], df_sparse_dp.DP.values - df_sparse_dp.AD.values, df_sparse_dp.AD.values) + # re-construct the sparse matrix + phased_AD = scipy.sparse.csr_matrix((df_sparse_dp.AD.values, (df_sparse_dp.row.values, df_sparse_dp.col.values)), shape=DP.shape) + + # # phasing + # phased_AD = np.where( (df_snp.GT.values == "0|1").reshape(-1,1), AD.A, (DP-AD).A ) + # phased_AD = scipy.sparse.csr_matrix(phased_AD) # re-order based on barcode_list index = np.array([barcode_mapper[x] for x in barcode_list if x in barcode_mapper]) DP = DP[:, index] - phased_AD = phased_AD[:, index] + phased_AD = phased_AD[:, index] # returned matrix has shape (N_cells, N_snps), which is the transpose of the original matrix return (DP-phased_AD).T, phased_AD.T, df_snp.snp_id.values diff --git a/utils/subsample.py b/utils/subsample.py new file mode 100644 index 0000000..d495e25 --- /dev/null +++ b/utils/subsample.py @@ -0,0 +1,201 @@ +import sys +import numpy as np +import scipy +import pandas as pd +from pathlib import Path +import scanpy as sc +import argparse + + +def 
subsample_cells_multi(input_filelist, outputdir, n_cells, random_seed=0):
+    """
+    Sub-sample cells from a list of spaceranger output directories.
+
+    Parameters
+    ----------
+    input_filelist : str
+        Path to a file containing a list of spaceranger output directories. Columns: bam, sample_id, spaceranger_dir
+
+    outputdir : str
+        Directory to save the subsampled h5ad files.
+
+    n_cells : int
+        Number of cells to subsample.
+    """
+    df_meta = pd.read_csv(input_filelist, sep="\t", header=None)
+    df_meta.rename(columns=dict(zip( df_meta.columns[:3], ["bam", "sample_id", "spaceranger_dir"] )), inplace=True)
+
+    # get the full list of barcodes by reading the tissue_positions.csv or tissue_positions_list.csv file from each spaceranger directory
+    df_pos = []
+    for i, spaceranger in enumerate(df_meta.spaceranger_dir.values):
+        pos_file = Path(spaceranger) / "spatial" / "tissue_positions.csv"
+        if not pos_file.exists():
+            pos_file = Path(spaceranger) / "spatial" / "tissue_positions_list.csv"
+            this_pos = pd.read_csv(pos_file, sep=",", header=None, names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"])
+        else:
+            this_pos = pd.read_csv(pos_file, sep=",", header=0, names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"])
+
+        this_pos = this_pos[this_pos.in_tissue == True]
+        this_pos['sampleid'] = df_meta.sample_id.values[i]
+
+        this_pos.barcode = this_pos.barcode + '_' + this_pos['sampleid']
+        df_pos.append(this_pos)
+
+    df_pos = pd.concat(df_pos, ignore_index=True)
+
+    # compute the number of cells to subsample from each sample, proportional to the number of cells in each sample
+    n_cells_per_sample = np.round(n_cells * df_pos.groupby('sampleid').size() / df_pos.shape[0]).astype(int)
+
+    # subsample cells uniformly at random from each sample
+    np.random.seed(random_seed)
+    df_sampled = []
+    for i, sampleid in enumerate(df_meta.sample_id.values):
+        # index by sample id: groupby() sorts its keys, which may not match the order in df_meta
+        this_sample = df_pos[df_pos.sampleid == sampleid].sample(n=n_cells_per_sample[sampleid], replace=False)
+        this_sample.sort_index(inplace=True)
+        df_sampled.append(this_sample)
+    df_sampled = pd.concat(df_sampled, ignore_index=True)
+
+    # write a new h5ad file with the subsampled cells and the corresponding tissue_positions.csv file
+    for i, spaceranger in enumerate(df_meta.spaceranger_dir.values):
+        if Path(f'{spaceranger}/filtered_feature_bc_matrix.h5').exists():
+            adata = sc.read_10x_h5(f'{spaceranger}/filtered_feature_bc_matrix.h5', gex_only=False)
+        else:
+            adata = sc.read_h5ad(f'{spaceranger}/filtered_feature_bc_matrix.h5ad')
+        this_pos = df_sampled[df_sampled.sampleid == df_meta.sample_id.values[i]].drop(columns='sampleid')
+        this_pos.barcode = this_pos.barcode.str.split('_').str[0]
+        adata = adata[this_pos.barcode.values, :]
+        # write the subsampled h5ad file
+        sample_outdir = Path(outputdir) / df_meta.sample_id.values[i]
+        sample_outdir.mkdir(parents=True, exist_ok=True)
+        adata.write(sample_outdir / "filtered_feature_bc_matrix.h5ad")
+        # write the subsampled tissue_positions.csv file
+        sample_outdir2 = sample_outdir / 'spatial'
+        sample_outdir2.mkdir(parents=True, exist_ok=True)
+        this_pos.to_csv(sample_outdir2 / "tissue_positions.csv", sep=",", index=False, header=True)
+
+    return df_sampled
+
+
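Note: per the docstring above, `--input_filelist` is a headerless tab-separated file whose first three columns are bam, sample_id, spaceranger_dir. A made-up two-sample example (paths are placeholders):

```
/path/to/sampleA/possorted_genome_bam.bam	sampleA	/path/to/sampleA/outs
/path/to/sampleB/possorted_genome_bam.bam	sampleB	/path/to/sampleB/outs
```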
+def subsample_cells_single(spaceranger_dir, outputdir, n_cells, random_seed=0):
+    """
+    Sub-sample cells from a single spaceranger output directory.
+
+    Parameters
+    ----------
+    spaceranger_dir : str
+        Path to the spaceranger output directory.
+
+    outputdir : str
+        Directory to save the subsampled h5ad files.
+
+    n_cells : int
+        Number of cells to subsample.
+    """
+    # get the list of spot barcodes by reading the tissue_positions.csv or tissue_positions_list.csv file
+    pos_file = Path(spaceranger_dir) / "spatial" / "tissue_positions.csv"
+    if not pos_file.exists():
+        pos_file = Path(spaceranger_dir) / "spatial" / "tissue_positions_list.csv"
+        df_pos = pd.read_csv(pos_file, sep=",", header=None, names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"])
+    else:
+        df_pos = pd.read_csv(pos_file, sep=",", header=0, names=["barcode", "in_tissue", "x", "y", "pixel_row", "pixel_col"])
+    df_pos = df_pos[df_pos.in_tissue == True]
+
+    # subsample spots uniformly at random
+    np.random.seed(random_seed)
+    df_sampled = df_pos.sample(n=n_cells, replace=False)
+    df_sampled.sort_index(inplace=True)
+
+    # write a new h5ad file with the subsampled cells and the corresponding tissue_positions.csv file
+    if Path(f'{spaceranger_dir}/filtered_feature_bc_matrix.h5').exists():
+        adata = sc.read_10x_h5(f'{spaceranger_dir}/filtered_feature_bc_matrix.h5', gex_only=False)
+    else:
+        adata = sc.read_h5ad(f'{spaceranger_dir}/filtered_feature_bc_matrix.h5ad')
+    adata = adata[df_sampled.barcode.values, :]
+    # write the subsampled h5ad file
+    outputdir = Path(outputdir)
+    outputdir.mkdir(parents=True, exist_ok=True)
+    adata.write(outputdir / "filtered_feature_bc_matrix.h5ad")
+    # write the subsampled tissue_positions.csv file
+    outputdir2 = outputdir / 'spatial'
+    outputdir2.mkdir(parents=True, exist_ok=True)
+    df_sampled.to_csv(outputdir2 / "tissue_positions.csv", sep=",", index=False, header=True)
+
+    return df_sampled
+
+
+def subsample_snps(snp_dir, output_snp_dir, df_sampled, n_snps, random_seed=0):
+    """
+    Sub-sample SNPs from a pileup (SNP) directory, keeping only the subsampled barcodes.
+
+    Parameters
+    ----------
+    snp_dir : str
+        Path to the directory containing the SNP files.
+
+    output_snp_dir : str
+        Directory to save the subsampled SNP files.
+
+    df_sampled : pd.DataFrame
+        Dataframe containing the subsampled barcodes.
+
+    n_snps : int
+        Number of SNPs to subsample.
+    """
+    snp_dir = Path(snp_dir)
+    cell_snp_Aallele = scipy.sparse.load_npz(snp_dir / "cell_snp_Aallele.npz")
+    cell_snp_Ballele = scipy.sparse.load_npz(snp_dir / "cell_snp_Ballele.npz")
+    barcodes = np.loadtxt(snp_dir / "barcodes.txt", dtype=str)
+    unique_snp_ids = np.load(snp_dir / "unique_snp_ids.npy", allow_pickle=True)
+
+    # first get the indices of the barcodes to keep; the order of barcodes in df_sampled is retained
+    map_barcodes_index = {x:i for i, x in enumerate(barcodes)}
+    idx = np.array([map_barcodes_index[x] for x in df_sampled.barcode.values if x in map_barcodes_index])
+    cell_snp_Aallele = cell_snp_Aallele[idx, :]
+    cell_snp_Ballele = cell_snp_Ballele[idx, :]
+    barcodes = barcodes[idx]
+
+    # subsample SNPs
+    np.random.seed(random_seed)
+    idx = np.random.choice(cell_snp_Aallele.shape[1], n_snps, replace=False)
+    idx = np.sort(idx)
+    cell_snp_Aallele = cell_snp_Aallele[:, idx]
+    cell_snp_Ballele = cell_snp_Ballele[:, idx]
+    unique_snp_ids = unique_snp_ids[idx]
+
+    # write the subsampled SNP files
+    output_snp_dir = Path(output_snp_dir)
+    output_snp_dir.mkdir(parents=True, exist_ok=True)
+    scipy.sparse.save_npz(output_snp_dir / "cell_snp_Aallele.npz", cell_snp_Aallele)
+    scipy.sparse.save_npz(output_snp_dir / "cell_snp_Ballele.npz", cell_snp_Ballele)
+    np.savetxt(output_snp_dir / "barcodes.txt", barcodes, fmt='%s')
+    np.save(output_snp_dir / "unique_snp_ids.npy", unique_snp_ids)
+
+    return
+
+
+def main(args):
+    # subsample cells: call either subsample_cells_multi or subsample_cells_single depending on whether input_filelist or spaceranger_dir is provided
+    if args.input_filelist is not None:
+        df_sampled = subsample_cells_multi(args.input_filelist, args.output_gex_dir, args.n_cells)
+    else:
+        df_sampled = subsample_cells_single(args.spaceranger_dir, args.output_gex_dir, args.n_cells)
+
+    # subsample SNPs
+    subsample_snps(args.snp_dir, args.output_snp_dir, df_sampled, args.n_snps)
+
+    return
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Subsample cells and SNPs from spaceranger and SNP-pileup outputs.')
+
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument('--input_filelist', type=str, help='Path to a file containing a list of spaceranger output directories. Columns: bam, sample_id, spaceranger_dir')
+    group.add_argument('--spaceranger_dir', type=str, help='Path to the spaceranger output directory.')
+
+    parser.add_argument('--snp_dir', type=str, required=True, help='Path to the directory containing the SNP files.')
+    parser.add_argument('--output_gex_dir', type=str, required=True, help='Directory to save the subsampled h5ad files.')
+    parser.add_argument('--output_snp_dir', type=str, required=True, help='Directory to save the subsampled SNP files.')
+    parser.add_argument('--n_cells', type=int, required=True, help='Number of cells to subsample.')
+    parser.add_argument('--n_snps', type=int, required=True, help='Number of SNPs to subsample.')
+
+    args = parser.parse_args()
+
+    main(args)
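Note: an illustrative single-sample invocation of the new utility (all paths and sizes are placeholders):

```
python utils/subsample.py \
    --spaceranger_dir /path/to/spaceranger/outs \
    --snp_dir /path/to/pileup_snps \
    --output_gex_dir /path/to/subsampled_gex \
    --output_snp_dir /path/to/subsampled_snps \
    --n_cells 2000 --n_snps 100000
```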