Skip to content
Open
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ Replace the following paths `config.yaml`:

Then you can run preprocessing pipeline by
```
snakemake --cores <number threads> --configfile config.yaml --snakefile calicost.smk all
snakemake --cores <number threads> --configfile config.yaml --snakefile calicost.smk --keep-incomplete all
```

### Inferring tumor purity per spot (optional)
Expand Down Expand Up @@ -229,4 +229,4 @@ The CalicoST manuscript is available on bioRxiv. If you use CalicoST for your wo
year={2024},
publisher={Cold Spring Harbor Laboratory}
}
```
```
24 changes: 21 additions & 3 deletions calicost.smk
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import scipy
from pathlib import Path
import calicost.arg_parse
import calicost.parse_input

Expand All @@ -23,29 +24,46 @@ rule link_or_merge_bam:
"{outputdir}/logs/link_or_merge_bam.log"
run:
if "bamlist" in config:
# check all file exists:
df_entries = pd.read_csv(config["bamlist"], sep='\t', index_col=None, header=None)
for i in range(df_entries.shape[0]):
assert Path(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv.gz").exists() or Path(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv").exists(), f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv(.gz) doesn't exist!"
assert Path(f"{df_entries.iloc[i,2]}/possorted_genome_bam.bam").exists(), f"{df_entries.iloc[i,2]}/possorted_genome_bam.bam doesn't exist!"
assert Path(f"{df_entries.iloc[i,2]}/possorted_genome_bam.bam.bai").exists(), f"{df_entries.iloc[i,2]}/possorted_genome_bam.bam.bai doesn't exist!"

# merged BAM file
shell(f"python {config['calicost_dir']}/utils/merge_bamfile.py -b {config['bamlist']} -o {params.outputdir}/ >> {log} 2>&1")
shell(f"samtools sort -m {params.samtools_sorting_mem} -o {output.bam} {params.outputdir}/unsorted_possorted_genome_bam.bam >> {log} 2>&1")
shell(f"samtools index {output.bam}")
shell(f"rm -fr {params.outputdir}/unsorted_possorted_genome_bam.bam")

# merged barcodes
df_entries = pd.read_csv(config["bamlist"], sep='\t', index_col=None, header=None)
df_barcodes = []
for i in range(df_entries.shape[0]):
tmpdf = pd.read_csv(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv.gz", header=None, index_col=None)
if Path(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv.gz").exists():
tmpdf = pd.read_csv(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv.gz", header=None, index_col=None)
else:
tmpdf = pd.read_csv(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv", header=None, index_col=None)
tmpdf.iloc[:,0] = [f"{x}_{df_entries.iloc[i,1]}" for x in tmpdf.iloc[:,0]]
df_barcodes.append( tmpdf )
df_barcodes = pd.concat(df_barcodes, ignore_index=True)
df_barcodes.to_csv(f"{output.barcodefile}", sep='\t', index=False, header=False)
else:
# check all file exists:
assert Path(f"{config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv.gz").exists() or Path(f"{config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv").exists(), f"{config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv(.gz) doesn't exist!"
assert Path(f"{config['spaceranger_dir']}/possorted_genome_bam.bam").exists(), f"{config['spaceranger_dir']}/possorted_genome_bam.bam doesn't exist!"
assert Path(f"{config['spaceranger_dir']}/possorted_genome_bam.bam.bai").exists(), f"{config['spaceranger_dir']}/possorted_genome_bam.bam.bai doesn't exist!"

# BAM file
assert "spaceranger_dir" in config
print("softlink of possorted_genome_bam.bam")
shell(f"ln -sf -T {config['spaceranger_dir']}/possorted_genome_bam.bam {output.bam}")
shell(f"ln -sf -T {config['spaceranger_dir']}/possorted_genome_bam.bam.bai {output.bai}")
# barcodes
shell(f"gunzip -c {config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv.gz > {output.barcodefile}")
if Path(f"{config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv.gz").exists():
shell(f"gunzip -c {config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv.gz > {output.barcodefile}")
else:
shell(f"cp {config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv {output.barcodefile}")



Expand Down
17 changes: 16 additions & 1 deletion src/calicost/arg_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,17 @@ def load_default_config():
"supervision_clone_file" : None,
"filtergenelist_file" : None,
"filterregion_file" : None,
"htblock_min_snps" : 1,
"initial_min_umi" : 15,
"secondary_min_umi" : 300,
"min_snpumi_perspot" : 50,
'min_percent_expressed_spots' : 0.005,
"bafonly" : False,
# prephasing
"PREPHASING_select_tumor" : False,
"PREPHASING_spot_cluster_file" : None,
"PREPHASING_num_bins" : 3000,
"PREPHASING_min_tumor_spots" : 100,
# phase switch probability
"nu" : 1.0,
"logphase_shift" : -2.0,
Expand Down Expand Up @@ -86,10 +93,17 @@ def load_default_config():
"supervision_clone_file" : "str",
"filtergenelist_file" : "str",
"filterregion_file" : "str",
"htblock_min_snps" : "int",
"initial_min_umi" : "int",
"secondary_min_umi" : "int",
"min_snpumi_perspot" : "int",
'min_percent_expressed_spots' : "float",
"bafonly" : "bool",
# prephasing
"PREPHASING_select_tumor" : "bool",
"PREPHASING_spot_cluster_file" : "str",
"PREPHASING_num_bins" : "int",
"PREPHASING_min_tumor_spots" : "int",
# phase switch probability
"nu" : "float",
"logphase_shift" : "float",
Expand Down Expand Up @@ -130,7 +144,8 @@ def load_default_config():

category_names = ["", "# supporting files and preprocessing arguments", "# phase switch probability", "# HMRF configurations", "# HMM configurations", "# integer copy number"]
category_elements = [["input_filelist", "spaceranger_dir", "snp_dir", "output_dir"], \
["geneticmap_file", "hgtable_file", "normalidx_file", "tumorprop_file", "alignment_files", "supervision_clone_file", "filtergenelist_file", "filterregion_file", "secondary_min_umi", "min_snpumi_perspot", "min_percent_expressed_spots", "bafonly"], \
["geneticmap_file", "hgtable_file", "normalidx_file", "tumorprop_file", "alignment_files", "supervision_clone_file", "filtergenelist_file", "filterregion_file", "htblock_min_snps", "initial_min_umi", "secondary_min_umi", "min_snpumi_perspot", "min_percent_expressed_spots", "bafonly"], \
["PREPHASING_select_tumor", "PREPHASING_spot_cluster_file", "PREPHASING_num_bins", "PREPHASING_min_tumor_spots"], \
["nu", "logphase_shift", "npart_phasing"], \
["n_clones", "n_clones_rdr", "min_spots_per_clone", "min_avgumi_per_clone", "maxspots_pooling", "tumorprop_threshold", "max_iter_outer", "nodepotential", "initialization_method", "num_hmrf_initialization_start", "num_hmrf_initialization_end", "spatial_weight", "construct_adjacency_method", "construct_adjacency_w"], \
["n_states", "params", "t", "t_phaseing", "fix_NB_dispersion", "shared_NB_dispersion", "fix_BB_dispersion", "shared_BB_dispersion", "max_iter", "tol", "gmm_random_state", "np_threshold", "np_eventminlen"], \
Expand Down
4 changes: 2 additions & 2 deletions src/calicost/calicost_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ def main(configuration_file):
break
# filter bins based on normal
index_normal = np.where(normal_candidate)[0]
lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \
single_X, single_base_nb_mean, single_total_bb_RD, config['nu'], config['logphase_shift'], index_normal, config['geneticmap_file'])
# lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \
# single_X, single_base_nb_mean, single_total_bb_RD, config['nu'], config['logphase_shift'], index_normal, config['geneticmap_file'])
assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0]
df_bininfo = genesnp_to_bininfo(df_gene_snp)
copy_single_X_rdr = copy.copy(single_X[:,0,:])
Expand Down
2 changes: 1 addition & 1 deletion src/calicost/hmm_NB_BB_phaseswitch.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,7 @@ def similarity_components_rdrbaf_neymanpearson(X, base_nb_mean, total_bb_RD, res
new_assignment = np.array([map_clone_id[x] for x in res["new_assignment"]])
merged_res = copy.copy(res)
merged_res["new_assignment"] = new_assignment
merged_res["total_llf"] = np.NAN
merged_res["total_llf"] = np.nan
merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ])
merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ])
return merging_groups, merged_res
Expand Down
2 changes: 1 addition & 1 deletion src/calicost/hmrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def merge_by_minspots(assignment, res, single_total_bb_RD, min_spots_thresholds=
# unique_assignment = np.unique(new_assignment)
merged_res = copy.copy(res)
merged_res["new_assignment"] = new_assignment
merged_res["total_llf"] = np.NAN
merged_res["total_llf"] = np.nan
merged_res["pred_cnv"] = np.concatenate([ res["pred_cnv"][(c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ])
merged_res["log_gamma"] = np.hstack([ res["log_gamma"][:, (c[0]*n_obs):(c[0]*n_obs+n_obs)] for c in merging_groups ])
return merging_groups, merged_res
Expand Down
Loading