Skip to content

Commit e9a11c5

Browse files
committed
Added smcsmc to the analysis pipeline.
1 parent b61ce4c commit e9a11c5

File tree

5 files changed

+174
-20
lines changed

5 files changed

+174
-20
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,10 @@ cat msmc_makefile_stdpopsim_patch > msmc/Makefile && cd msmc && make
3535
cd ../../
3636
```
3737
38+
`smcsmc` can be [installed manually](https://github.com/luntergroup/smcsmc) or through `conda` on linux.
39+
40+
```sh
41+
conda install -c luntergroup smcsmc
42+
```
43+
3844
Further instructions can be currently found in each task directory

n_t/Snakefile

Lines changed: 88 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,14 @@ import smc
2323
import msmc
2424
import simulations
2525
import plots
26+
import smcsmc
27+
import smcsmc.popsim
2628

2729

2830
# ###############################################################################
2931
# KNOBS -
3032
# ###############################################################################
3133

32-
3334
# A seed to replicate results
3435
# TODO mutation rates
3536

@@ -48,11 +49,26 @@ num_sampled_genomes_per_replicate = config["num_sampled_genomes_per_replicate"]
4849
# Here is a list of sample sizes to run msmc on.
4950
# Each element counts as its own analysis
5051
# so there will be "replicates" runs for each size
52+
#
53+
# and a similar setup for SMCSMC
5154
num_sampled_genomes_msmc = [int(x) for x in config["num_sampled_genomes_msmc"].split()]
55+
num_sampled_genomes_smcsmc = [int(x) for x in config["num_sampled_genomes_smcsmc"].split()]
5256

5357
# The number of msmc Baumwelch(?) iterations to run,
5458
# typically 20
59+
#
60+
# And again, similar for SMCSMC. Number of stochastic EM iterations.
61+
# 15 is typical, but more is good too. Assess for convergence based on
62+
# rainbow plots.
5563
num_msmc_iterations = config["num_msmc_iterations"]
64+
num_smcsmc_iterations = config["num_smcsmc_iterations"]
65+
66+
# Number of particles to use for SMCSMC
67+
#
68+
# A good starting point is 50k, and see if reducing
69+
# significantly impacts the estimates that you are
70+
# recieveing.
71+
num_smcsmc_particles = config["num_smcsmc_particles"]
5672

5773
# The number of replicates of each analysis you would like to run
5874
replicates = config["replicates"]
@@ -92,6 +108,7 @@ mutation_rate = species.genome.mean_mutation_rate
92108

93109

94110
seed_array = np.random.random_integers(1,2**31,replicates)
111+
#seed_array=np.array([1675701734])
95112
genetic_map_downloaded_flag= ".genetic_map_downloaded"
96113
msmc_exec = "../extern/msmc/build/msmc"
97114
stairwayplot_code = "stairwayplot/swarmops.jar"
@@ -273,6 +290,73 @@ rule compound_msmc:
273290
run: plots.plot_compound_msmc(model, input, output[0])
274291

275292

293+
# ###############################################################################
294+
# SMCSMC
295+
# ###############################################################################
296+
297+
rule ts_to_seg:
298+
input: rules.simulation.output
299+
output: output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg"
300+
run: smcsmc.utils.ts_to_seg(input[0], num_sampled_genomes_smcsmc)
301+
302+
rule run_smcsmc:
303+
input:
304+
expand(output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg",
305+
chrms=chrm_list, seeds=seed_array, samps=num_sampled_genomes_smcsmc)
306+
output:
307+
output_dir + "/Intermediate/{seeds}/{samps}.run/result.out"
308+
run:
309+
inputs = expand(output_dir+"/Intermediate/{seeds}/{samps}.{chrms}.trees.seg",
310+
seeds=wildcards.seeds, samps=wildcards.samps, chrms=chrm_list)
311+
312+
input_file_string = " ".join(inputs)
313+
args = {
314+
'EM': str(num_smcsmc_iterations),
315+
'Np': str(num_smcsmc_particles),
316+
# Submission Parameters
317+
'chunks': '100',
318+
'c': '',
319+
'no_infer_recomb': '',
320+
# Other inference parameters
321+
'mu': str(species.genome.mean_mutation_rate),
322+
'N0': '14312',
323+
'rho': '3e-9',
324+
'calibrate_lag': '1.0',
325+
'tmax': '3.5',
326+
'alpha': '0',
327+
'apf': '2',
328+
'P': '133 133016 31*1',
329+
'VB': '',
330+
'nsam': str(wildcards.samps),
331+
# This should be in the conda bin
332+
'smcsmcpath': os.path.expandvars('${CONDA_PREFIX}/bin/smcsmc')
333+
}
334+
args['o'] = output_dir + f"/Intermediate/{wildcards.seeds}/{wildcards.samps}.run"
335+
args['segs'] = input_file_string
336+
337+
smcsmc.run_smcsmc(args)
338+
339+
rule convert_smcsmc:
340+
input: rules.run_smcsmc.output
341+
output: output_dir + "/Results/{seeds}/{samps}.run/results.out.csv"
342+
run: smcsmc.popsim.convert_smcsmc_output(input[0], output[0], generation_time, num_smcsmc_iterations)
343+
344+
345+
def ne_files_smcsmc(wildcards):
346+
return expand(output_dir + "/Results/{seeds}/{samps}.run/results.out.csv",
347+
seeds=seed_array)
348+
349+
rule plot_by_sample:
350+
input: expand(output_dir + "/Results/{seeds}/{{samps}}.run/results.out.csv", seeds=seed_array)
351+
output: output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png"
352+
run:
353+
plots.plot_compound_smcsmc_with_guide(input, output[0], 30, 1, nhaps ={wildcards.samps}, model = model)
354+
355+
rule compound_smcsmc:
356+
input: expand(output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png", samps = num_sampled_genomes_smcsmc)
357+
358+
359+
276360
# ###############################################################################
277361
#
278362
# ###############################################################################
@@ -283,10 +367,11 @@ rule all_plot:
283367
f1 = ne_files,
284368
f2 = ne_files_smcpp,
285369
f3 = ne_files_msmc,
370+
f4 = ne_files_smcsmc,
286371
output:
287372
output_dir + "/Results/all_estimated_Ne.pdf"
288-
run:
289-
plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, output[0],
373+
run:
374+
plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, input.f4, output[0],
290375
model=model, n_samp=num_sampled_genomes_per_replicate,
291376
generation_time=generation_time, species=config["species"],
292377
pop_id=population_id)

n_t/plots.py

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import os
99
import matplotlib.patches as mpatches
1010
from matplotlib import pyplot as plt
11+
import stdpopsim
1112
import numpy as np
1213
# Force matplotlib to not use any Xwindows backend.
1314
matplotlib.use('Agg')
@@ -59,8 +60,40 @@ def plot_compound_msmc(infiles, outfile):
5960
ax.plot(nt['x'], nt['y'], c="red")
6061
f.savefig(outfile, bbox_inches='tight')
6162

63+
def plot_compound_smcsmc_with_guide(infiles, outfile, generation_time, pop_id = 0, nhaps = 1, model = None, steps = None):
64+
f, ax = plt.subplots(figsize=(7, 7))
65+
ax.set(xscale="log", yscale="log")
66+
67+
if model == "ooa":
68+
model = getattr(stdpopsim.homo_sapiens,"GutenkunstThreePopOutOfAfrica")()
69+
70+
if model is not None:
71+
ddb = msprime.DemographyDebugger(**model.asdict())
72+
if steps is None:
73+
end_time = ddb.epochs[-2].end_time + 10000
74+
steps = np.exp(np.linspace(1,np.log(end_time),31))
75+
num_samples = [0 for _ in range(ddb.num_populations)]
76+
num_samples[pop_id] = 20
77+
coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps,
78+
num_samples=num_samples, double_step_validation=False)
79+
steps = steps * generation_time
80+
ax.plot(steps, 1/(2*coal_rate), c="black", drawstyle = 'steps-pre')
81+
82+
83+
for infile in infiles:
84+
nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
85+
ax.step(nt['x'], nt['y'], c="red")
86+
87+
ax.set_ylim([1e3,1e6])
88+
ax.set_xlabel('Years before present')
89+
ax.set_ylabel('Effective population size')
90+
h_string = "".join(nhaps)
91+
ax.set_title(f"SMCSMC Estimated Ne ({h_string} samples)")
92+
93+
f.savefig(outfile, bbox_inches='tight')
6294

63-
def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
95+
96+
def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, smcsmc_infiles, outfile,
6497
model, n_samp, generation_time, species,
6598
pop_id = 0, steps=None):
6699

@@ -73,9 +106,14 @@ def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
73106
coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps,
74107
num_samples=num_samples, double_step_validation=False)
75108
steps = steps * generation_time
109+
76110
num_msmc = set([os.path.basename(infile).split(".")[0] for infile in msmc_infiles])
111+
num_smcsmc = set([os.path.basename(infile).split(".")[0] for infile in smcsmc_infiles])
112+
77113
num_msmc = sorted([int(x) for x in num_msmc])
78-
f, ax = plt.subplots(1,2+len(num_msmc),sharex=True,sharey=True,figsize=(14, 7))
114+
num_smcsmc = sorted([int(x) for x in num_msmc])
115+
116+
f, ax = plt.subplots(1,2+len(num_msmc) + len(num_smcsmc), sharex=True,sharey=True,figsize=(14, 7))
79117
for infile in smcpp_infiles:
80118
nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
81119
line1, = ax[0].plot(nt['x'], nt['y'], alpha=0.8)
@@ -86,6 +124,7 @@ def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
86124
line2, = ax[1].plot(nt['year'], nt['Ne_median'],alpha=0.8)
87125
ax[1].plot(steps, 1/(2*coal_rate), c="black")
88126
ax[1].set_title("stairwayplot")
127+
89128
for i,sample_size in enumerate(num_msmc):
90129
for infile in msmc_infiles:
91130
fn = os.path.basename(infile)
@@ -99,6 +138,22 @@ def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
99138
for i in range(2+len(num_msmc)):
100139
ax[i].set(xscale="log", yscale="log")
101140
ax[i].set_xlabel("time (years ago)")
141+
142+
for i,sample_size in enumerate(num_smcsmc):
143+
for infile in smcsmc_infiles:
144+
fn = os.path.basename(infile)
145+
samp = fn.split(".")[0]
146+
if(int(samp) == sample_size):
147+
nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
148+
line3, = ax[2+len(num_msmc) + i].plot(nt['x'], nt['y'],alpha=0.8)
149+
ax[2+i].plot(steps, 1/(2*coal_rate), c="black")
150+
ax[2+i].set_title(f"smcsmc, ({sample_size} samples)")
151+
plt.suptitle(f"{species}, population id {pop_id}", fontsize = 16)
152+
for i in range(2+len(num_msmc)):
153+
ax[i].set(xscale="log", yscale="log")
154+
ax[i].set_xlabel("time (years ago)")
155+
156+
102157
red_patch = mpatches.Patch(color='black', label='Coalescence rate derived Ne')
103158
ax[0].legend(frameon=False, fontsize=10, handles=[red_patch])
104159
ax[0].set_ylabel("population size")

n_t/readme.md

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,16 @@ moment.
1818

1919
## Workflow
2020

21-
The analysis includes three programs for predicting effective population
21+
The analysis includes four programs for predicting effective population
2222
size through time(`n_t`):
2323
[msmc](https://github.com/stschiff/msmc/issues/23),
24-
[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), and
24+
[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), [smcsmc](https://github.com/luntergroup/smcsmc), and
2525
[smc++](https://github.com/popgenmethods/smcpp).
26-
There are four target rules that can be executed with the given parameters:
26+
There are five target rules that can be executed with the given parameters:
2727
`compound_msmc`,
2828
`compound_smcpp`,
2929
`compound_stairwayplot`,
30+
`compound_smcsmc`,
3031
or you can run all three on the same simulated data with rule `all`.
3132

3233
To run an analysis, create a directory (wherever you want)
@@ -37,18 +38,23 @@ might look like this:
3738

3839
```json
3940
{
40-
"seed" : 12345,
41-
"population_id" : 0,
42-
"num_sampled_genomes_per_replicate" : 20,
43-
"num_sampled_genomes_msmc" : "2 8",
44-
"num_msmc_iterations" : 20,
45-
"replicates" : 10,
46-
"species" : "homo_sapiens",
47-
"model" : "GutenkunstThreePopOutOfAfrica",
48-
"genetic_map" : "HapmapII_GRCh37",
49-
"chrm_list" : "chr22,chrX",
50-
"generation_time" : 25,
41+
"seed" : 12345,
42+
"population_id" : 0,
43+
"num_sampled_genomes_per_replicate" : 20,
44+
"num_sampled_genomes_msmc" : "2 8",
45+
"num_sampled_genomes_smcsmc" : "4",
46+
"num_smcsmc_particles": 10000,
47+
"num_msmc_iterations" : 20,
48+
"num_smcsmc_iterations": 15,
49+
"replicates" : 1,
50+
"species" : "homo_sapiens",
51+
"model" : "GutenkunstThreePopOutOfAfrica",
52+
"genetic_map" : "HapmapII_GRCh37",
53+
"chrm_list" : "all",
54+
"generation_time" : 30,
55+
"output_dir": "output"
5156
}
57+
5258
```
5359

5460
Once you have creates a directory which contains the config file

n_t/simulations.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import msprime
6+
import os
67
from stdpopsim import homo_sapiens
78

89

@@ -17,4 +18,5 @@ def simulate(out_path, species, model, genetic_map, seed, chrmStr,
1718
mutation_rate=chrom.default_mutation_rate,
1819
random_seed=seed,
1920
**model.asdict())
20-
ts.dump(out_path)
21+
ts.dump(out_path)
22+

0 commit comments

Comments
 (0)