Added smcsmc to the analysis pipeline.

Chris1221 · Chris1221 · commit eb72f0dbd829 · 2019-08-01T23:33:39.000+01:00
diff --git a/README.md b/README.md
@@ -35,4 +35,10 @@ cat msmc_makefile_stdpopsim_patch > msmc/Makefile && cd msmc && make
 cd ../../
 ```
 
+`smcsmc` can be [installed manually](https://github.com/luntergroup/smcsmc) or through `conda` on linux. 
+
+```sh
+conda install -c luntergroup smcsmc 
+```
+
 Further instructions can be currently found in each task directory
diff --git a/n_t/Snakefile b/n_t/Snakefile
@@ -24,13 +24,14 @@ import smc
 import msmc
 import simulations
 import plots
+import smcsmc
+import smcsmc.popsim
 
 
 # ###############################################################################
 # KNOBS - 
 # ###############################################################################
 
-
 # A seed to replicate results
 # TODO mutation rates
 
@@ -49,11 +50,26 @@ num_sampled_genomes_per_replicate = config["num_sampled_genomes_per_replicate"]
 # Here is a list of sample sizes to run msmc on. 
 # Each element counts as its own analysis
 # so there will be "replicates" runs for each size
+#
+# and a similar setup for SMCSMC
 num_sampled_genomes_msmc = [int(x) for x in config["num_sampled_genomes_msmc"].split(",")]
+num_sampled_genomes_smcsmc = [int(x) for x in config["num_sampled_genomes_smcsmc"].split(",")]
 
 # The number of msmc Baumwelch(?) iterations to run,
 # typically 20
+#
+# And again, similar for SMCSMC. Number of stochastic EM iterations.
+# 15 is typical, but more is good too. Assess for convergence based on 
+# rainbow plots.
 num_msmc_iterations = config["num_msmc_iterations"]
+num_smcsmc_iterations = config["num_smcsmc_iterations"]
+
+# Number of particles to use for SMCSMC
+# 
+# A good starting point is 50k, and see if reducing 
+# significantly impacts the estimates that you are
+# recieveing. 
+num_smcsmc_particles = config["num_smcsmc_particles"]
 
 # The number of replicates of each analysis you would like to run
 replicates = config["replicates"]
@@ -299,6 +315,72 @@ rule compound_msmc:
     run: plots.plot_compound_msmc(model, input, output[0])
 
 
+# ###############################################################################
+#  SMCSMC
+# ###############################################################################
+
+rule ts_to_seg:
+	input: rules.simulation.output
+	output: output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg"
+	run: smcsmc.utils.ts_to_seg(input[0], num_sampled_genomes_smcsmc)
+
+rule run_smcsmc:
+	input:	
+		expand(output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg",
+        		chrms=chrm_list, seeds=seed_array, samps=num_sampled_genomes_smcsmc)
+	output: 
+		output_dir + "/Intermediate/{seeds}/{samps}.run/result.out"
+	run: 
+		inputs = expand(output_dir+"/Intermediate/{seeds}/{samps}.{chrms}.trees.seg", 
+			seeds=wildcards.seeds, samps=wildcards.samps, chrms=chrm_list)
+		
+		input_file_string = " ".join(inputs)
+		args = {	
+			'EM': str(num_smcsmc_iterations),
+			'Np': str(num_smcsmc_particles),	
+			# Submission Parameters
+			'chunks': '100',	
+			'no_infer_recomb': '',
+			# Other inference parameters
+			'mu': str(species.genome.mean_mutation_rate),
+			'N0': '14312',
+			'rho': '3e-9',
+			'calibrate_lag': '1.0',
+			'tmax':	'3.5',
+			'alpha': '0',
+			'apf': '2',
+			'P': '133 133016 31*1',
+			'VB': '',
+			'nsam':	str(wildcards.samps),		
+			# This should be in the conda bin
+			'smcsmcpath': os.path.expandvars('${CONDA_PREFIX}/bin/smcsmc')
+		}
+		args['o'] = output_dir + f"/Intermediate/{wildcards.seeds}/{wildcards.samps}.run"
+		args['segs'] = input_file_string
+		
+		smcsmc.run_smcsmc(args)	
+
+rule convert_smcsmc:
+    input: rules.run_smcsmc.output
+    output: output_dir + "/Results/{seeds}/{samps}.run/results.out.csv"
+    run: smcsmc.popsim.convert_smcsmc_output(input[0], output[0], generation_time, num_smcsmc_iterations)
+
+
+def ne_files_smcsmc(wildcards):
+    return expand(output_dir + "/Results/{seeds}/{samps}.run/results.out.csv",
+            seeds=seed_array, samps=num_sampled_genomes_smcsmc)
+
+rule plot_by_sample:
+    input: expand(output_dir + "/Results/{seeds}/{{samps}}.run/results.out.csv", seeds=seed_array)
+    output: output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png"
+    run: 
+    	plots.plot_compound_smcsmc_with_guide(input, output[0], 30, 1, nhaps ={wildcards.samps}, model = model)
+
+rule compound_smcsmc:
+    input: expand(output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png", samps = num_sampled_genomes_smcsmc)
+
+
+
 # ###############################################################################
 # 
 # ###############################################################################
@@ -309,10 +391,11 @@ rule all_plot:
         f1 = ne_files,
         f2 = ne_files_smcpp,
         f3 = ne_files_msmc,
+	f4 = ne_files_smcsmc,
     output:
         output_dir + "/Results/all_estimated_Ne.pdf"
-    run:
-        plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, output[0],
+    run: 
+        plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, input.f4, output[0],
                     model=model, n_samp=num_sampled_genomes_per_replicate,
                     generation_time=generation_time, species=config["species"],
                     pop_id=population_id)
diff --git a/n_t/plots.py b/n_t/plots.py
@@ -8,6 +8,7 @@
 import os
 import matplotlib.patches as mpatches
 from matplotlib import pyplot as plt
+import stdpopsim
 import numpy as np
 # Force matplotlib to not use any Xwindows backend.
 matplotlib.use('Agg')
@@ -61,52 +62,99 @@ def plot_compound_msmc(infiles, outfile):
         ax.plot(nt['x'], nt['y'], c="red")
     f.savefig(outfile, bbox_inches='tight')
 
+def plot_compound_smcsmc_with_guide(infiles, outfile, generation_time, pop_id = 0, nhaps = 1, model = None, steps = None):         
+    f, ax = plt.subplots(figsize=(7, 7))
+    ax.set(xscale="log", yscale="log")
 
-def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
-                          model, n_samp, generation_time, species,
-                          pop_id=0, steps=None):
+    if model is not None: 
+        ddb = msprime.DemographyDebugger(**model.asdict())
+        if steps is None:
+            end_time = ddb.epochs[-2].end_time + 10000
+            steps = np.exp(np.linspace(1,np.log(end_time),31))
+        num_samples = [0 for _ in range(ddb.num_populations)]
+        num_samples[pop_id] = 20
+        coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps,
+            num_samples=num_samples, double_step_validation=False)
+        steps = steps * generation_time
+        ax.plot(steps, 1/(2*coal_rate), c="black", drawstyle = 'steps-pre')
 
-    ddb = msprime.DemographyDebugger(**model.asdict())
-    if steps is None:
-        end_time = ddb.epochs[-2].end_time + 10000
-        steps = np.linspace(1, end_time, end_time+1)
-    num_samples = [0 for _ in range(ddb.num_populations)]
-    num_samples[pop_id] = n_samp
-    coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps,
-                                                   num_samples=num_samples,
-                                                   double_step_validation=False)
-    steps = steps * generation_time
-    num_msmc = set([os.path.basename(infile).split(".")[0] for infile in msmc_infiles])
-    num_msmc = sorted([int(x) for x in num_msmc])
-    f, ax = plt.subplots(1, 2+len(num_msmc), sharex=True, sharey=True, figsize=(14, 7))
-    for infile in smcpp_infiles:
+
+    for infile in infiles:
         nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
-        line1, = ax[0].plot(nt['x'], nt['y'], alpha=0.8)
-    ax[0].plot(steps, 1/(2*coal_rate), c="black")
-    ax[0].set_title("smc++")
-    for infile in sp_infiles:
-        nt = pandas.read_csv(infile, sep="\t", skiprows=5)
-        line2, = ax[1].plot(nt['year'], nt['Ne_median'], alpha=0.8)
-    ax[1].plot(steps, 1/(2*coal_rate), c="black")
-    ax[1].set_title("stairwayplot")
-    for i, sample_size in enumerate(num_msmc):
-        for infile in msmc_infiles:
-            fn = os.path.basename(infile)
-            samp = fn.split(".")[0]
-            if(int(samp) == sample_size):
-                nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
-                line3, = ax[2+i].plot(nt['x'], nt['y'], alpha=0.8)
-        ax[2+i].plot(steps, 1/(2*coal_rate), c="black")
-        ax[2+i].set_title(f"msmc, ({sample_size} samples)")
-    plt.suptitle(f"{species}, population id {pop_id}", fontsize=16)
-    for i in range(2+len(num_msmc)):
-        ax[i].set(xscale="log", yscale="log")
-        ax[i].set_xlabel("time (years ago)")
-    red_patch = mpatches.Patch(color='black', label='Coalescence rate derived Ne')
-    ax[0].legend(frameon=False, fontsize=10, handles=[red_patch])
-    ax[0].set_ylabel("population size")
-    f.savefig(outfile, bbox_inches='tight', alpha=0.8)
+        ax.step(nt['x'], nt['y'], c="red")
+
+    ax.set_ylim([1e3,1e6])
+    ax.set_xlabel('Years before present')
+    ax.set_ylabel('Effective population size')
+    h_string = "".join(nhaps)
+    ax.set_title(f"SMCSMC Estimated Ne ({h_string} samples)")
+
+    f.savefig(outfile, bbox_inches='tight')
+
 
+def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, smcsmc_infiles, outfile,
+                             model, n_samp, generation_time, species,
+                             pop_id = 0, steps=None):
+ 
+     ddb = msprime.DemographyDebugger(**model.asdict())
+     if steps is None:
+         end_time = ddb.epochs[-2].end_time + 10000
+         steps = np.linspace(1,end_time,end_time+1)
+     num_samples = [0 for _ in range(ddb.num_populations)]
+     num_samples[pop_id] = n_samp
+     coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps,
+         num_samples=num_samples, double_step_validation=False)
+     steps = steps * generation_time
+ 
+     num_msmc = set([os.path.basename(infile).split(".")[0] for infile in msmc_infiles])
+     num_smcsmc = set([infile.split("/")[-2].split(".")[0] for infile in smcsmc_infiles])
+ 
+     num_msmc = sorted([int(x) for x in num_msmc])
+     num_smcsmc = sorted([int(x) for x in num_smcsmc])
+ 
+     f, ax = plt.subplots(1,2+len(num_msmc) + len(num_smcsmc), sharex=True,sharey=True,figsize=(14, 7))
+     for infile in smcpp_infiles:
+         nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
+         line1, = ax[0].plot(nt['x'], nt['y'], alpha=0.8)
+     ax[0].plot(steps, 1/(2*coal_rate), c="black")
+     ax[0].set_title("smc++")
+     for infile in sp_infiles:
+         nt = pandas.read_csv(infile, sep="\t", skiprows=5)
+         line2, = ax[1].plot(nt['year'], nt['Ne_median'],alpha=0.8)
+     ax[1].plot(steps, 1/(2*coal_rate), c="black")
+     ax[1].set_title("stairwayplot")
+ 
+     plot_counter=2
+     for i,sample_size in enumerate(num_msmc):
+         for infile in msmc_infiles:
+             fn = os.path.basename(infile)
+             samp = fn.split(".")[0]
+             if(int(samp) == sample_size):
+                 nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
+                 line3, = ax[plot_counter].plot(nt['x'], nt['y'],alpha=0.8)
+         ax[plot_counter].plot(steps, 1/(2*coal_rate), c="black")
+         ax[plot_counter].set_title(f"msmc, ({sample_size} samples)")
+         plot_counter+=1
+ 
+     for i,sample_size in enumerate(num_smcsmc):
+         for infile in smcsmc_infiles:
+             samp = infile.split("/")[-2].split(".")[0]
+             if(int(samp) == sample_size):
+                 nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
+                 line3, = ax[plot_counter].plot(nt['x'], nt['y'],alpha=0.8)
+         ax[plot_counter].plot(steps, 1/(2*coal_rate), c="black")
+         ax[plot_counter].set_title(f"smcsmc, ({sample_size} samples)")
+         plot_counter+=1
+     plt.suptitle(f"{species}, population id {pop_id}", fontsize = 16)
+     for i in range(2+len(num_msmc)+len(num_smcsmc)):
+         ax[i].set(xscale="log", yscale="log")
+         ax[i].set_xlabel("time (years ago)")
+ 
+ 
+     red_patch = mpatches.Patch(color='black', label='Coalescence rate derived Ne')
+     ax[0].legend(frameon=False, fontsize=10, handles=[red_patch])
+     ax[0].set_ylabel("population size")
+     f.savefig(outfile, bbox_inches='tight', alpha=0.8)
 
 def plot_stairwayplot_coalrate(sp_infiles, outfile,
                                model, n_samp, generation_time, species,
diff --git a/n_t/readme.md b/n_t/readme.md
@@ -18,15 +18,16 @@ moment.
 
 ## Workflow
 
-The analysis includes three programs for predicting effective population 
+The analysis includes four programs for predicting effective population 
 size through time(`n_t`): 
 [msmc](https://github.com/stschiff/msmc/issues/23),
-[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), and
+[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), [smcsmc](https://github.com/luntergroup/smcsmc), and
 [smc++](https://github.com/popgenmethods/smcpp).
-There are four target rules that can be executed with the given parameters: 
+There are five target rules that can be executed with the given parameters: 
 `compound_msmc`,
 `compound_smcpp`,
 `compound_stairwayplot`, 
+`compound_smcsmc`,
 or you can run all three on the same simulated data with rule `all`.
 
 To run an analysis, create a directory (wherever you want)
@@ -37,18 +38,21 @@ might look like this:
 
 ```json
 {
-    "seed" : 12345,
-    "population_id" : 0,
-    "num_sampled_genomes_per_replicate" : 20,  
-    "num_sampled_genomes_msmc" : "2,8",
-    "num_msmc_iterations" : 20,
-    "replicates" : 10,
-    "species" : "homo_sapiens",
-    "model" : "GutenkunstThreePopOutOfAfrica",
-    "genetic_map" : "HapmapII_GRCh37",
-    "chrm_list" : "chr22,chrX",
-    "mask_file" : "masks/HapmapII_GRCh37.mask.bed",
+	"seed" : 12345,
+	"population_id" : 0,
+	"num_sampled_genomes_per_replicate" : 20,
+	"num_sampled_genomes_msmc" : "2 8",
+	"num_sampled_genomes_smcsmc" : "4",
+	"num_smcsmc_particles": 10000,
+	"num_msmc_iterations" : 20,
+	"num_smcsmc_iterations": 15,
+	"replicates" : 1,
+	"species" : "homo_sapiens",
+	"model" : "GutenkunstThreePopOutOfAfrica",
+	"genetic_map" : "HapmapII_GRCh37",
+	"chrm_list" : "chr22,chrX"	
 }
+
 ```
 
 Once you have creates a directory which contains the config file