Skip to content

Commit eb72f0d

Browse files
committed
Added smcsmc to the analysis pipeline.
1 parent 6077725 commit eb72f0d

File tree

4 files changed

+200
-59
lines changed

4 files changed

+200
-59
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,10 @@ cat msmc_makefile_stdpopsim_patch > msmc/Makefile && cd msmc && make
3535
cd ../../
3636
```
3737
38+
`smcsmc` can be [installed manually](https://github.com/luntergroup/smcsmc) or through `conda` on linux.
39+
40+
```sh
41+
conda install -c luntergroup smcsmc
42+
```
43+
3844
Further instructions can be currently found in each task directory

n_t/Snakefile

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,14 @@ import smc
2424
import msmc
2525
import simulations
2626
import plots
27+
import smcsmc
28+
import smcsmc.popsim
2729

2830

2931
# ###############################################################################
3032
# KNOBS -
3133
# ###############################################################################
3234

33-
3435
# A seed to replicate results
3536
# TODO mutation rates
3637

@@ -49,11 +50,26 @@ num_sampled_genomes_per_replicate = config["num_sampled_genomes_per_replicate"]
4950
# Here is a list of sample sizes to run msmc on.
5051
# Each element counts as its own analysis
5152
# so there will be "replicates" runs for each size
53+
#
54+
# and a similar setup for SMCSMC
5255
num_sampled_genomes_msmc = [int(x) for x in config["num_sampled_genomes_msmc"].split(",")]
56+
num_sampled_genomes_smcsmc = [int(x) for x in config["num_sampled_genomes_smcsmc"].split(",")]
5357

5458
# The number of msmc Baumwelch(?) iterations to run,
5559
# typically 20
60+
#
61+
# And again, similar for SMCSMC. Number of stochastic EM iterations.
62+
# 15 is typical, but more is good too. Assess for convergence based on
63+
# rainbow plots.
5664
num_msmc_iterations = config["num_msmc_iterations"]
65+
num_smcsmc_iterations = config["num_smcsmc_iterations"]
66+
67+
# Number of particles to use for SMCSMC
68+
#
69+
# A good starting point is 50k, and see if reducing
70+
# significantly impacts the estimates that you are
71+
# recieveing.
72+
num_smcsmc_particles = config["num_smcsmc_particles"]
5773

5874
# The number of replicates of each analysis you would like to run
5975
replicates = config["replicates"]
@@ -299,6 +315,72 @@ rule compound_msmc:
299315
run: plots.plot_compound_msmc(model, input, output[0])
300316

301317

318+
# ###############################################################################
319+
# SMCSMC
320+
# ###############################################################################
321+
322+
rule ts_to_seg:
323+
input: rules.simulation.output
324+
output: output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg"
325+
run: smcsmc.utils.ts_to_seg(input[0], num_sampled_genomes_smcsmc)
326+
327+
rule run_smcsmc:
328+
input:
329+
expand(output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg",
330+
chrms=chrm_list, seeds=seed_array, samps=num_sampled_genomes_smcsmc)
331+
output:
332+
output_dir + "/Intermediate/{seeds}/{samps}.run/result.out"
333+
run:
334+
inputs = expand(output_dir+"/Intermediate/{seeds}/{samps}.{chrms}.trees.seg",
335+
seeds=wildcards.seeds, samps=wildcards.samps, chrms=chrm_list)
336+
337+
input_file_string = " ".join(inputs)
338+
args = {
339+
'EM': str(num_smcsmc_iterations),
340+
'Np': str(num_smcsmc_particles),
341+
# Submission Parameters
342+
'chunks': '100',
343+
'no_infer_recomb': '',
344+
# Other inference parameters
345+
'mu': str(species.genome.mean_mutation_rate),
346+
'N0': '14312',
347+
'rho': '3e-9',
348+
'calibrate_lag': '1.0',
349+
'tmax': '3.5',
350+
'alpha': '0',
351+
'apf': '2',
352+
'P': '133 133016 31*1',
353+
'VB': '',
354+
'nsam': str(wildcards.samps),
355+
# This should be in the conda bin
356+
'smcsmcpath': os.path.expandvars('${CONDA_PREFIX}/bin/smcsmc')
357+
}
358+
args['o'] = output_dir + f"/Intermediate/{wildcards.seeds}/{wildcards.samps}.run"
359+
args['segs'] = input_file_string
360+
361+
smcsmc.run_smcsmc(args)
362+
363+
rule convert_smcsmc:
364+
input: rules.run_smcsmc.output
365+
output: output_dir + "/Results/{seeds}/{samps}.run/results.out.csv"
366+
run: smcsmc.popsim.convert_smcsmc_output(input[0], output[0], generation_time, num_smcsmc_iterations)
367+
368+
369+
def ne_files_smcsmc(wildcards):
370+
return expand(output_dir + "/Results/{seeds}/{samps}.run/results.out.csv",
371+
seeds=seed_array, samps=num_sampled_genomes_smcsmc)
372+
373+
rule plot_by_sample:
374+
input: expand(output_dir + "/Results/{seeds}/{{samps}}.run/results.out.csv", seeds=seed_array)
375+
output: output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png"
376+
run:
377+
plots.plot_compound_smcsmc_with_guide(input, output[0], 30, 1, nhaps ={wildcards.samps}, model = model)
378+
379+
rule compound_smcsmc:
380+
input: expand(output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png", samps = num_sampled_genomes_smcsmc)
381+
382+
383+
302384
# ###############################################################################
303385
#
304386
# ###############################################################################
@@ -309,10 +391,11 @@ rule all_plot:
309391
f1 = ne_files,
310392
f2 = ne_files_smcpp,
311393
f3 = ne_files_msmc,
394+
f4 = ne_files_smcsmc,
312395
output:
313396
output_dir + "/Results/all_estimated_Ne.pdf"
314-
run:
315-
plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, output[0],
397+
run:
398+
plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, input.f4, output[0],
316399
model=model, n_samp=num_sampled_genomes_per_replicate,
317400
generation_time=generation_time, species=config["species"],
318401
pop_id=population_id)

n_t/plots.py

Lines changed: 90 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import os
99
import matplotlib.patches as mpatches
1010
from matplotlib import pyplot as plt
11+
import stdpopsim
1112
import numpy as np
1213
# Force matplotlib to not use any Xwindows backend.
1314
matplotlib.use('Agg')
@@ -61,52 +62,99 @@ def plot_compound_msmc(infiles, outfile):
6162
ax.plot(nt['x'], nt['y'], c="red")
6263
f.savefig(outfile, bbox_inches='tight')
6364

65+
def plot_compound_smcsmc_with_guide(infiles, outfile, generation_time, pop_id = 0, nhaps = 1, model = None, steps = None):
66+
f, ax = plt.subplots(figsize=(7, 7))
67+
ax.set(xscale="log", yscale="log")
6468

65-
def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
66-
model, n_samp, generation_time, species,
67-
pop_id=0, steps=None):
69+
if model is not None:
70+
ddb = msprime.DemographyDebugger(**model.asdict())
71+
if steps is None:
72+
end_time = ddb.epochs[-2].end_time + 10000
73+
steps = np.exp(np.linspace(1,np.log(end_time),31))
74+
num_samples = [0 for _ in range(ddb.num_populations)]
75+
num_samples[pop_id] = 20
76+
coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps,
77+
num_samples=num_samples, double_step_validation=False)
78+
steps = steps * generation_time
79+
ax.plot(steps, 1/(2*coal_rate), c="black", drawstyle = 'steps-pre')
6880

69-
ddb = msprime.DemographyDebugger(**model.asdict())
70-
if steps is None:
71-
end_time = ddb.epochs[-2].end_time + 10000
72-
steps = np.linspace(1, end_time, end_time+1)
73-
num_samples = [0 for _ in range(ddb.num_populations)]
74-
num_samples[pop_id] = n_samp
75-
coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps,
76-
num_samples=num_samples,
77-
double_step_validation=False)
78-
steps = steps * generation_time
79-
num_msmc = set([os.path.basename(infile).split(".")[0] for infile in msmc_infiles])
80-
num_msmc = sorted([int(x) for x in num_msmc])
81-
f, ax = plt.subplots(1, 2+len(num_msmc), sharex=True, sharey=True, figsize=(14, 7))
82-
for infile in smcpp_infiles:
81+
82+
for infile in infiles:
8383
nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
84-
line1, = ax[0].plot(nt['x'], nt['y'], alpha=0.8)
85-
ax[0].plot(steps, 1/(2*coal_rate), c="black")
86-
ax[0].set_title("smc++")
87-
for infile in sp_infiles:
88-
nt = pandas.read_csv(infile, sep="\t", skiprows=5)
89-
line2, = ax[1].plot(nt['year'], nt['Ne_median'], alpha=0.8)
90-
ax[1].plot(steps, 1/(2*coal_rate), c="black")
91-
ax[1].set_title("stairwayplot")
92-
for i, sample_size in enumerate(num_msmc):
93-
for infile in msmc_infiles:
94-
fn = os.path.basename(infile)
95-
samp = fn.split(".")[0]
96-
if(int(samp) == sample_size):
97-
nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
98-
line3, = ax[2+i].plot(nt['x'], nt['y'], alpha=0.8)
99-
ax[2+i].plot(steps, 1/(2*coal_rate), c="black")
100-
ax[2+i].set_title(f"msmc, ({sample_size} samples)")
101-
plt.suptitle(f"{species}, population id {pop_id}", fontsize=16)
102-
for i in range(2+len(num_msmc)):
103-
ax[i].set(xscale="log", yscale="log")
104-
ax[i].set_xlabel("time (years ago)")
105-
red_patch = mpatches.Patch(color='black', label='Coalescence rate derived Ne')
106-
ax[0].legend(frameon=False, fontsize=10, handles=[red_patch])
107-
ax[0].set_ylabel("population size")
108-
f.savefig(outfile, bbox_inches='tight', alpha=0.8)
84+
ax.step(nt['x'], nt['y'], c="red")
85+
86+
ax.set_ylim([1e3,1e6])
87+
ax.set_xlabel('Years before present')
88+
ax.set_ylabel('Effective population size')
89+
h_string = "".join(nhaps)
90+
ax.set_title(f"SMCSMC Estimated Ne ({h_string} samples)")
91+
92+
f.savefig(outfile, bbox_inches='tight')
93+
10994

95+
def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, smcsmc_infiles, outfile,
96+
model, n_samp, generation_time, species,
97+
pop_id = 0, steps=None):
98+
99+
ddb = msprime.DemographyDebugger(**model.asdict())
100+
if steps is None:
101+
end_time = ddb.epochs[-2].end_time + 10000
102+
steps = np.linspace(1,end_time,end_time+1)
103+
num_samples = [0 for _ in range(ddb.num_populations)]
104+
num_samples[pop_id] = n_samp
105+
coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps,
106+
num_samples=num_samples, double_step_validation=False)
107+
steps = steps * generation_time
108+
109+
num_msmc = set([os.path.basename(infile).split(".")[0] for infile in msmc_infiles])
110+
num_smcsmc = set([infile.split("/")[-2].split(".")[0] for infile in smcsmc_infiles])
111+
112+
num_msmc = sorted([int(x) for x in num_msmc])
113+
num_smcsmc = sorted([int(x) for x in num_smcsmc])
114+
115+
f, ax = plt.subplots(1,2+len(num_msmc) + len(num_smcsmc), sharex=True,sharey=True,figsize=(14, 7))
116+
for infile in smcpp_infiles:
117+
nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
118+
line1, = ax[0].plot(nt['x'], nt['y'], alpha=0.8)
119+
ax[0].plot(steps, 1/(2*coal_rate), c="black")
120+
ax[0].set_title("smc++")
121+
for infile in sp_infiles:
122+
nt = pandas.read_csv(infile, sep="\t", skiprows=5)
123+
line2, = ax[1].plot(nt['year'], nt['Ne_median'],alpha=0.8)
124+
ax[1].plot(steps, 1/(2*coal_rate), c="black")
125+
ax[1].set_title("stairwayplot")
126+
127+
plot_counter=2
128+
for i,sample_size in enumerate(num_msmc):
129+
for infile in msmc_infiles:
130+
fn = os.path.basename(infile)
131+
samp = fn.split(".")[0]
132+
if(int(samp) == sample_size):
133+
nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
134+
line3, = ax[plot_counter].plot(nt['x'], nt['y'],alpha=0.8)
135+
ax[plot_counter].plot(steps, 1/(2*coal_rate), c="black")
136+
ax[plot_counter].set_title(f"msmc, ({sample_size} samples)")
137+
plot_counter+=1
138+
139+
for i,sample_size in enumerate(num_smcsmc):
140+
for infile in smcsmc_infiles:
141+
samp = infile.split("/")[-2].split(".")[0]
142+
if(int(samp) == sample_size):
143+
nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
144+
line3, = ax[plot_counter].plot(nt['x'], nt['y'],alpha=0.8)
145+
ax[plot_counter].plot(steps, 1/(2*coal_rate), c="black")
146+
ax[plot_counter].set_title(f"smcsmc, ({sample_size} samples)")
147+
plot_counter+=1
148+
plt.suptitle(f"{species}, population id {pop_id}", fontsize = 16)
149+
for i in range(2+len(num_msmc)+len(num_smcsmc)):
150+
ax[i].set(xscale="log", yscale="log")
151+
ax[i].set_xlabel("time (years ago)")
152+
153+
154+
red_patch = mpatches.Patch(color='black', label='Coalescence rate derived Ne')
155+
ax[0].legend(frameon=False, fontsize=10, handles=[red_patch])
156+
ax[0].set_ylabel("population size")
157+
f.savefig(outfile, bbox_inches='tight', alpha=0.8)
110158

111159
def plot_stairwayplot_coalrate(sp_infiles, outfile,
112160
model, n_samp, generation_time, species,

n_t/readme.md

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,16 @@ moment.
1818

1919
## Workflow
2020

21-
The analysis includes three programs for predicting effective population
21+
The analysis includes four programs for predicting effective population
2222
size through time(`n_t`):
2323
[msmc](https://github.com/stschiff/msmc/issues/23),
24-
[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), and
24+
[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), [smcsmc](https://github.com/luntergroup/smcsmc), and
2525
[smc++](https://github.com/popgenmethods/smcpp).
26-
There are four target rules that can be executed with the given parameters:
26+
There are five target rules that can be executed with the given parameters:
2727
`compound_msmc`,
2828
`compound_smcpp`,
2929
`compound_stairwayplot`,
30+
`compound_smcsmc`,
3031
or you can run all three on the same simulated data with rule `all`.
3132

3233
To run an analysis, create a directory (wherever you want)
@@ -37,18 +38,21 @@ might look like this:
3738

3839
```json
3940
{
40-
"seed" : 12345,
41-
"population_id" : 0,
42-
"num_sampled_genomes_per_replicate" : 20,
43-
"num_sampled_genomes_msmc" : "2,8",
44-
"num_msmc_iterations" : 20,
45-
"replicates" : 10,
46-
"species" : "homo_sapiens",
47-
"model" : "GutenkunstThreePopOutOfAfrica",
48-
"genetic_map" : "HapmapII_GRCh37",
49-
"chrm_list" : "chr22,chrX",
50-
"mask_file" : "masks/HapmapII_GRCh37.mask.bed",
41+
"seed" : 12345,
42+
"population_id" : 0,
43+
"num_sampled_genomes_per_replicate" : 20,
44+
"num_sampled_genomes_msmc" : "2 8",
45+
"num_sampled_genomes_smcsmc" : "4",
46+
"num_smcsmc_particles": 10000,
47+
"num_msmc_iterations" : 20,
48+
"num_smcsmc_iterations": 15,
49+
"replicates" : 1,
50+
"species" : "homo_sapiens",
51+
"model" : "GutenkunstThreePopOutOfAfrica",
52+
"genetic_map" : "HapmapII_GRCh37",
53+
"chrm_list" : "chr22,chrX"
5154
}
55+
5256
```
5357

5458
Once you have creates a directory which contains the config file

0 commit comments

Comments
 (0)