-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
executable file
·119 lines (91 loc) · 4.97 KB
/
main.py
File metadata and controls
executable file
·119 lines (91 loc) · 4.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/python3
"""
Provides analytical and empirical calculations of stochastic allele drift and summaries and plots of the quantities
"""
__author__ = "Caetano Souto-Maior"
__copyright__ = ""
__credits__ = ["Caetano Souto-Maior"]
__license__ = "GNU Affero General Public License (GNU AGPLv3)"
__version__ = "1.0"
__maintainer__ = "Caetano Souto-Maior"
__email__ = "caetanosoutomaior@protonmail.com"
__status__ = "Development"
import os, sys, platform
import numpy
import pandas
import warnings
import timeit
# loads custom functions for drift and likelihood computations
from okazaki_drift.okazaki_drift import recursive_binomial_array
from okazaki_drift.data_computations import likelihood_of_data
from okazaki_drift.iox import read_json_file, write_json_file
# loads and prints arguments provided to the script in the right order:
if ( len(sys.argv) == 8 ):
Ne = int(sys.argv[1])
generations = int(sys.argv[2])
maf0 = str(sys.argv[3])
mafN = str(sys.argv[4])
variantsShiftData = sys.argv[5]
subsetsShiftData = sys.argv[6]
project_dir = sys.argv[7]
data_dir = sys.argv[7]
results_dir = sys.argv[7]
elif ( len(sys.argv) == 10 ):
Ne = int(sys.argv[1])
generations = int(sys.argv[2])
maf0 = str(sys.argv[3])
mafN = str(sys.argv[4])
variantsShiftData = sys.argv[5]
subsetsShiftData = sys.argv[6]
project_dir = sys.argv[7]
data_dir = sys.argv[8]
results_dir = sys.argv[9]
else:
Ne = int(sys.argv[1])
generations = int(sys.argv[2])
maf0 = str(sys.argv[3])
mafN = str(sys.argv[4])
variantsShiftData = sys.argv[5]
subsetsShiftData = sys.argv[6]
project_dir = os.getcwd()
data_dir = os.path.join( project_dir, "data")
results_dir = os.path.join( project_dir, "results")
print("file:", sys.argv[0])
print("Variants frequency change data:", variantsShiftData )
print("Subset frequency change data:", subsetsShiftData)
print("Project dir:", project_dir)
print("Data dir:", data_dir)
print("Results dir:", results_dir)
print("Ne:", Ne)
print("# of generations:", generations)
# Defines frequencies from 0 to 1 in 0.01 increments
freqrange100 = numpy.arange(0, 1.01, 0.01)
# tries to load previously computed recursive binomial distributions from data folder, if they are not found, computes them from scratch using custom function "recursive_binomial_array"
try:
dictG = read_json_file( os.path.join( results_dir, "binomial_array" + str(generations) + ".json"), printFlag=False )
dictX = read_json_file( os.path.join( results_dir, "binomial_arrayX" + str(generations) + ".json"), printFlag=False)
darray = numpy.array( dictG['recursive_binomial_array'] )
xarray = numpy.array( dictX['recursive_binomial_array'] )
except:
tic = timeit.default_timer()
darray = recursive_binomial_array(p0list=freqrange100, Ne=Ne, generations=generations)
tac = timeit.default_timer()
print(">>> elapsed time:", tac-tic)
dump = write_json_file( os.path.join( results_dir, "binomial_array" + str(generations) + ".json"), {"recursive_binomial_array": darray.tolist(), "2N": 2*Ne}, idnt=None)
tic = timeit.default_timer()
xarray = recursive_binomial_array(p0list=freqrange100, Ne=Ne, generations=generations, X=True)
tac = timeit.default_timer()
print(">>> elapsed time (X):", tac-tic)
dump = write_json_file( os.path.join( results_dir, "binomial_arrayX" + str(generations) + ".json"), {"recursive_binomial_array": xarray.tolist(), "1.5N": 1.5*Ne}, idnt=None)
# from array with all distributions for every generation, a data frame with all distributions for the last generations only (starting with every frequency in the 0.01-increment list above)
recursiveBinomialG = pandas.DataFrame( darray[:,-1,:], index=numpy.round(numpy.arange(0, 1.01, 0.01),2), columns=range(darray.shape[-1]) )
recursiveBinomialX = pandas.DataFrame( xarray[:,-1,:], index=numpy.round(numpy.arange(0, 1.01, 0.01),2), columns=range(xarray.shape[-1]) )
# loads variant data and subset data (the latter must be split and have the difference in frequency betewen initial and final generations)
variantdata = pandas.read_csv( os.path.join( data_dir, variantsShiftData ), sep=',', header=0)
subdata = pandas.read_csv( os.path.join( results_dir, subsetsShiftData), sep=',', header=0)
# computes the likelihood of observing a frequency given the initial frequency using custom function "likelihood_of_data"
variantdata['likelihood'] = likelihood_of_data( variantdata, recursiveBinomialG, recursiveBinomialX, (maf0, mafN) )
subdata['likelihood'] = likelihood_of_data( subdata, recursiveBinomialG, recursiveBinomialX, (maf0, mafN) )
# saves new file to <results_dir> folder
variantdata.to_csv( os.path.join( project_dir, "results", variantsShiftData[:-4] + "_likelihoods.csv" ), sep=',', header=True, index=0 )
subdata.to_csv( os.path.join( project_dir, "results", subsetsShiftData[:-4] + "_likelihoods.csv" ), sep=',', header=True, index=0 )