Skip to content

Commit 2298136

Browse files
author
chester
committed
ADD MULTIPROCESSING
1 parent dfdb8e9 commit 2298136

8 files changed

+114
-48
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
All notable changes to this project will be documented in this file.
44

5+
## [2.0.2] - 2019-09-18
6+
### Added
7+
- Add multiprocessing for gene batch running
8+
59
## [2.0.1] - 2019-07-12
610
### Added
711
- Support self-defined genome regions (for any species)

genepi/GenEpi.py

+13-8
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import time
1313
import os
1414
import sys
15+
import multiprocessing as mp
1516
import genepi
1617

1718
""""""""""""""""""""""""""""""
@@ -36,7 +37,7 @@ def ArgumentsParser():
3637
### define arguments for modeling
3738
parser.add_argument("-m", required=False, default="c", choices=["c", "r"], help="choose model type: c for classification; r for regression")
3839
parser.add_argument("-k", required=False, default=2, help="k of k-fold cross validation")
39-
parser.add_argument("-t", required=False, default=1, help="number of threads")
40+
parser.add_argument("-t", required=False, default=mp.cpu_count(), help="number of threads")
4041

4142
### define arguments for step1_downloadUCSCDB
4243
parser_group_1 = parser.add_argument_group("update UCSC database")
@@ -91,6 +92,10 @@ def main(args=None):
9192
str_outputFilePath = args.o
9293
else:
9394
str_outputFilePath = os.path.dirname(str_inputFileName_genotype)
95+
int_thread = mp.cpu_count()
96+
if int(args.t) is not None:
97+
if int(args.t) < mp.cpu_count():
98+
int_thread = int(args.t)
9499

95100
if str_inputFileName_genotype == "example" and str_inputFileName_phenotype == "example":
96101
str_command = "cp " + os.path.join(os.path.dirname(genepi.__file__), "example", "sample.csv") + " " + str_outputFilePath
@@ -117,7 +122,7 @@ def main(args=None):
117122

118123
file_outputFile.writelines("\t" + "-m (model type): " + "Classification" if args.m=="c" else "Regression" + "\n")
119124
file_outputFile.writelines("\t" + "-k (k-fold cross validation): " + str(args.k) + "\n")
120-
file_outputFile.writelines("\t" + "-t (number of threads): " + str(args.t) + "\n" + "\n")
125+
file_outputFile.writelines("\t" + "-t (number of threads): " + str(int_thread) + "\n" + "\n")
121126

122127
file_outputFile.writelines("\t" + "--updatedb (enable function of update UCSC database): " + str(args.updatedb) + "\n")
123128
file_outputFile.writelines("\t" + "-b (human genome build): " + args.b + "\n" + "\n")
@@ -150,27 +155,27 @@ def main(args=None):
150155

151156
if args.m=="c":
152157
### step4_singleGeneEpistasis_Logistic (for case/control trial)
153-
genepi.BatchSingleGeneEpistasisLogistic(os.path.join(str_outputFilePath, "snpSubsets"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(args.t))
158+
genepi.BatchSingleGeneEpistasisLogistic(os.path.join(str_outputFilePath, "snpSubsets"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
154159
### step5_crossGeneEpistasis_Logistic (for case/control trial)
155-
float_score_train, float_score_test = genepi.CrossGeneEpistasisLogistic(os.path.join(str_outputFilePath, "singleGeneResult"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(args.t))
160+
float_score_train, float_score_test = genepi.CrossGeneEpistasisLogistic(os.path.join(str_outputFilePath, "singleGeneResult"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
156161
file_outputFile.writelines("Overall genetic feature performance (F1 score)" + "\n")
157162
file_outputFile.writelines("Training: " + str(float_score_train) + "\n")
158163
file_outputFile.writelines("Testing (" + str(args.k) + "-fold CV): " + str(float_score_test) + "\n" + "\n")
159164
### step6_ensembleWithCovariates (for case/control trial)
160-
float_score_train, float_score_test = genepi.EnsembleWithCovariatesClassifier(os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(args.t))
165+
float_score_train, float_score_test = genepi.EnsembleWithCovariatesClassifier(os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
161166
file_outputFile.writelines("Ensemble with co-variate performance (F1 score)" + "\n")
162167
file_outputFile.writelines("Training: " + str(float_score_train) + "\n")
163168
file_outputFile.writelines("Testing (" + str(args.k) + "-fold CV): " + str(float_score_test) + "\n" + "\n")
164169
else:
165170
### step4_singleGeneEpistasis_Lasso (for quantitative trial)
166-
genepi.BatchSingleGeneEpistasisLasso(os.path.join(str_outputFilePath, "snpSubsets"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(args.t))
171+
genepi.BatchSingleGeneEpistasisLasso(os.path.join(str_outputFilePath, "snpSubsets"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
167172
### step5_crossGeneEpistasis_Lasso (for quantitative trial)
168-
float_score_train, float_score_test = genepi.CrossGeneEpistasisLasso(os.path.join(str_outputFilePath, "singleGeneResult"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(args.t))
173+
float_score_train, float_score_test = genepi.CrossGeneEpistasisLasso(os.path.join(str_outputFilePath, "singleGeneResult"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
169174
file_outputFile.writelines("Overall genetic feature performance (Average of the Pearson and Spearman correlation)" + "\n")
170175
file_outputFile.writelines("Training: " + str(float_score_train) + "\n")
171176
file_outputFile.writelines("Testing (" + str(args.k) + "-fold CV): " + str(float_score_test) + "\n" + "\n")
172177
### step6_ensembleWithCovariates (for quantitative trial)
173-
float_score_train, float_score_test = genepi.EnsembleWithCovariatesRegressor(os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(args.t))
178+
float_score_train, float_score_test = genepi.EnsembleWithCovariatesRegressor(os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
174179
file_outputFile.writelines("Ensemble with co-variate performance (Average of the Pearson and Spearman correlation)" + "\n")
175180
file_outputFile.writelines("Training: " + str(float_score_train) + "\n")
176181
file_outputFile.writelines("Testing (" + str(args.k) + "-fold CV): " + str(float_score_test) + "\n" + "\n")

genepi/step4_singleGeneEpistasis_Lasso.py

+37-12
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@
99
# import libraries
1010
""""""""""""""""""""""""""""""
1111
import os
12+
import warnings
13+
warnings.filterwarnings('ignore')
14+
# ignore all warnings
15+
warnings.simplefilter("ignore")
16+
os.environ["PYTHONWARNINGS"] = "ignore"
17+
1218
import sys
1319
import itertools
1420
import numpy as np
@@ -21,14 +27,10 @@
2127
from sklearn.model_selection import KFold
2228
from sklearn.model_selection import GridSearchCV
2329
import scipy.stats as stats
30+
import multiprocessing as mp
2431

2532
from genepi.tools import randomized_l1
2633

27-
import warnings
28-
warnings.filterwarnings('ignore')
29-
# ignore all future warnings
30-
warnings.simplefilter(action='ignore', category=FutureWarning)
31-
3234
""""""""""""""""""""""""""""""
3335
# define functions
3436
""""""""""""""""""""""""""""""
@@ -42,7 +44,7 @@ def RandomizedLassoRegression(np_X, np_y):
4244

4345
return estimator.scores_
4446

45-
def LassoRegressionCV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 4):
47+
def LassoRegressionCV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 1):
4648
X = np_X
4749
y = np_y
4850
X_sparse = coo_matrix(X)
@@ -56,8 +58,8 @@ def LassoRegressionCV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 4):
5658
alpha = np.logspace(-10, 10, 200)
5759
parameters = [{'alpha':alpha}]
5860
kf_estimator = KFold(n_splits=2)
59-
estimator_lasso = linear_model.Lasso()
60-
estimator_grid = GridSearchCV(estimator_lasso, parameters, scoring='neg_mean_squared_error', n_jobs=int_nJobs, cv=kf_estimator)
61+
estimator_lasso = linear_model.Lasso(max_iter=1000)
62+
estimator_grid = GridSearchCV(estimator_lasso, parameters, scoring='neg_mean_squared_error', n_jobs=1, cv=kf_estimator)
6163
estimator_grid.fit(X[idxTr], y[idxTr])
6264
list_label = estimator_grid.best_estimator_.predict(X[idxTe])
6365
list_weight.append([float(item) for item in estimator_grid.best_estimator_.coef_])
@@ -136,7 +138,7 @@ def FilterInLoading(np_genotype, np_phenotype):
136138
""""""""""""""""""""""""""""""
137139
# main function
138140
""""""""""""""""""""""""""""""
139-
def SingleGeneEpistasisLasso(str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 4):
141+
def SingleGeneEpistasisLasso(str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 1):
140142
### set path of output file
141143
if str_outputFilePath == "":
142144
str_outputFilePath = os.path.dirname(str_inputFileName_genotype)
@@ -240,7 +242,7 @@ def SingleGeneEpistasisLasso(str_inputFileName_genotype, str_inputFileName_pheno
240242

241243
return float_AVG_S_P
242244

243-
def BatchSingleGeneEpistasisLasso(str_inputFilePath_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 4):
245+
def BatchSingleGeneEpistasisLasso(str_inputFilePath_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = mp.cpu_count()):
244246
### set default output path
245247
if str_outputFilePath == "":
246248
str_outputFilePath = os.path.abspath(os.path.join(str_inputFilePath_genotype, os.pardir)) + "/singleGeneResult/"
@@ -254,7 +256,29 @@ def BatchSingleGeneEpistasisLasso(str_inputFilePath_genotype, str_inputFileName_
254256
if ".gen" in str_fileName:
255257
list_genotypeFileName.append(str_fileName)
256258

257-
### batch PolyLogisticRegression
259+
### batch PolyLassoRegression
260+
### initialize multiprocessing pool
261+
mp_pool = mp.Pool(int_nJobs)
262+
263+
### apply the pool to the function that needs to be parallelized
264+
dict_result = {}
265+
for int_count_gene, float_AVG_S_P in enumerate(mp_pool.starmap(SingleGeneEpistasisLasso, [(os.path.join(str_inputFilePath_genotype, gene), str_inputFileName_phenotype, str_outputFilePath, int_kOfKFold, int_nJobs) for gene in list_genotypeFileName]), 0):
266+
if list_genotypeFileName[int_count_gene] not in dict_result:
267+
dict_result[list_genotypeFileName[int_count_gene]] = float_AVG_S_P
268+
str_print = "step4: Processing: " + "{0:.2f}".format(float(int_count_gene) / len(list_genotypeFileName) * 100) + "% - " + list_genotypeFileName[int_count_gene] + ": " + "\t\t"
269+
sys.stdout.write('%s\r' % str_print)
270+
sys.stdout.flush()
271+
272+
mp_pool.close()
273+
274+
### output result
275+
with open(str_outputFilePath + "All_Lasso_k" + str(int_kOfKFold) + ".csv", "w") as file_outputFile:
276+
file_outputFile.writelines("GeneSymbol,AVG_S_P" + "\n")
277+
for key, value in dict_result.items():
278+
file_outputFile.writelines(key.split("_")[0] + "," + str(value) + "\n")
279+
280+
'''
281+
### batch PolyLassoRegression
258282
int_count_gene = 0
259283
with open(str_outputFilePath + "All_Lasso_k" + str(int_kOfKFold) + ".csv", "w") as file_outputFile:
260284
file_outputFile.writelines("GeneSymbol,AVG_S_P" + "\n")
@@ -266,5 +290,6 @@ def BatchSingleGeneEpistasisLasso(str_inputFilePath_genotype, str_inputFileName_
266290
str_print = "step4: Processing: " + "{0:.2f}".format(float(int_count_gene) / len(list_genotypeFileName) * 100) + "% - " + item + ": " + str(float_AVG_S_P) + "\t\t"
267291
sys.stdout.write('%s\r' % str_print)
268292
sys.stdout.flush()
269-
293+
'''
294+
270295
print("step4: Detect single gene epistasis. DONE! \t\t\t\t")

genepi/step4_singleGeneEpistasis_Logistic.py

+36-10
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@
88
""""""""""""""""""""""""""""""
99
# import libraries
1010
""""""""""""""""""""""""""""""
11+
import os
12+
import warnings
13+
warnings.filterwarnings('ignore')
14+
# ignore all warnings
15+
warnings.simplefilter("ignore")
16+
os.environ["PYTHONWARNINGS"] = "ignore"
17+
1118
import os
1219
import sys
1320
import itertools
@@ -22,14 +29,10 @@
2229
from sklearn.model_selection import GridSearchCV
2330
import sklearn.metrics as skMetric
2431
import scipy.stats as stats
32+
import multiprocessing as mp
2533

2634
from genepi.tools import randomized_l1
2735

28-
import warnings
29-
warnings.filterwarnings('ignore')
30-
# ignore all future warnings
31-
warnings.simplefilter(action='ignore', category=FutureWarning)
32-
3336
""""""""""""""""""""""""""""""
3437
# define functions
3538
""""""""""""""""""""""""""""""
@@ -43,7 +46,7 @@ def RandomizedLogisticRegression(np_X, np_y):
4346

4447
return estimator.scores_
4548

46-
def LogisticRegressionL1CV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 4):
49+
def LogisticRegressionL1CV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 1):
4750
X = np_X
4851
y = np_y
4952
X_sparse = coo_matrix(X)
@@ -57,8 +60,8 @@ def LogisticRegressionL1CV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 4):
5760
cost = [2**x for x in range(-8, 8)]
5861
parameters = [{'C':cost, 'penalty':['l1'], 'dual':[False], 'class_weight':['balanced']}]
5962
kf_estimator = KFold(n_splits=2)
60-
estimator_logistic = linear_model.LogisticRegression()
61-
estimator_grid = GridSearchCV(estimator_logistic, parameters, scoring='f1', n_jobs=int_nJobs, cv=kf_estimator)
63+
estimator_logistic = linear_model.LogisticRegression(max_iter=100, solver='liblinear')
64+
estimator_grid = GridSearchCV(estimator_logistic, parameters, scoring='f1', n_jobs=1, cv=kf_estimator)
6265
estimator_grid.fit(X[idxTr], y[idxTr])
6366
list_label = estimator_grid.best_estimator_.predict(X[idxTe])
6467
list_weight.append([float(item) for item in estimator_grid.best_estimator_.coef_[0]])
@@ -144,7 +147,7 @@ def FilterInLoading(np_genotype, np_phenotype):
144147
""""""""""""""""""""""""""""""
145148
# main function
146149
""""""""""""""""""""""""""""""
147-
def SingleGeneEpistasisLogistic(str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 4):
150+
def SingleGeneEpistasisLogistic(str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 1):
148151
### set path of output file
149152
if str_outputFilePath == "":
150153
str_outputFilePath = os.path.dirname(str_inputFileName_genotype)
@@ -253,7 +256,7 @@ def SingleGeneEpistasisLogistic(str_inputFileName_genotype, str_inputFileName_ph
253256

254257
return float_f1Score
255258

256-
def BatchSingleGeneEpistasisLogistic(str_inputFilePath_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 4):
259+
def BatchSingleGeneEpistasisLogistic(str_inputFilePath_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = mp.cpu_count()):
257260
### set default output path
258261
if str_outputFilePath == "":
259262
str_outputFilePath = os.path.abspath(os.path.join(str_inputFilePath_genotype, os.pardir)) + "/singleGeneResult/"
@@ -266,7 +269,29 @@ def BatchSingleGeneEpistasisLogistic(str_inputFilePath_genotype, str_inputFileNa
266269
for str_fileName in os.listdir(str_inputFilePath_genotype):
267270
if ".gen" in str_fileName:
268271
list_genotypeFileName.append(str_fileName)
272+
273+
### batch PolyLogisticRegression
274+
### initialize multiprocessing pool
275+
mp_pool = mp.Pool(int_nJobs)
269276

277+
### apply the pool to the function that needs to be parallelized
278+
dict_result = {}
279+
for int_count_gene, float_f1Score in enumerate(mp_pool.starmap(SingleGeneEpistasisLogistic, [(os.path.join(str_inputFilePath_genotype, gene), str_inputFileName_phenotype, str_outputFilePath, int_kOfKFold, int_nJobs) for gene in list_genotypeFileName]), 0):
280+
if list_genotypeFileName[int_count_gene] not in dict_result:
281+
dict_result[list_genotypeFileName[int_count_gene]] = float_f1Score
282+
str_print = "step4: Processing: " + "{0:.2f}".format(float(int_count_gene) / len(list_genotypeFileName) * 100) + "% - " + list_genotypeFileName[int_count_gene] + ": " + "\t\t"
283+
sys.stdout.write('%s\r' % str_print)
284+
sys.stdout.flush()
285+
286+
mp_pool.close()
287+
288+
### output result
289+
with open(str_outputFilePath + "All_Logistic_k" + str(int_kOfKFold) + ".csv", "w") as file_outputFile:
290+
file_outputFile.writelines("GeneSymbol,F1Score" + "\n")
291+
for key, value in dict_result.items():
292+
file_outputFile.writelines(key.split("_")[0] + "," + str(value) + "\n")
293+
294+
'''
270295
### batch PolyLogisticRegression
271296
int_count_gene = 0
272297
with open(str_outputFilePath + "All_Logistic_k" + str(int_kOfKFold) + ".csv", "w") as file_outputFile:
@@ -279,5 +304,6 @@ def BatchSingleGeneEpistasisLogistic(str_inputFilePath_genotype, str_inputFileNa
279304
str_print = "step4: Processing: " + "{0:.2f}".format(float(int_count_gene) / len(list_genotypeFileName) * 100) + "% - " + item + ": " + str(float_f1Score) + "\t\t"
280305
sys.stdout.write('%s\r' % str_print)
281306
sys.stdout.flush()
307+
'''
282308

283309
print("step4: Detect single gene epistasis. DONE! \t\t\t\t")

genepi/step5_crossGeneEpistasis_Lasso.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@
88
""""""""""""""""""""""""""""""
99
# import libraries
1010
""""""""""""""""""""""""""""""
11+
import os
12+
import warnings
13+
warnings.filterwarnings('ignore')
14+
# ignore all warnings
15+
warnings.simplefilter("ignore")
16+
os.environ["PYTHONWARNINGS"] = "ignore"
17+
1118
import os
1219
import numpy as np
1320
np.seterr(divide='ignore', invalid='ignore')
@@ -21,11 +28,6 @@
2128
from genepi.step4_singleGeneEpistasis_Lasso import LassoRegressionCV
2229
from genepi.step4_singleGeneEpistasis_Lasso import FeatureEncoderLasso
2330

24-
import warnings
25-
warnings.filterwarnings('ignore')
26-
# ignore all future warnings
27-
warnings.simplefilter(action='ignore', category=FutureWarning)
28-
2931
""""""""""""""""""""""""""""""
3032
# define functions
3133
""""""""""""""""""""""""""""""
@@ -39,7 +41,7 @@ def LassoRegression(np_X, np_y, int_nJobs = 4):
3941
alpha = np.logspace(-10, 10, 200)
4042
parameters = [{'alpha':alpha}]
4143
kf_estimator = KFold(n_splits=2)
42-
estimator_lasso = linear_model.Lasso()
44+
estimator_lasso = linear_model.Lasso(max_iter=1000)
4345
estimator_grid = GridSearchCV(estimator_lasso, parameters, scoring='neg_mean_squared_error', n_jobs=int_nJobs, cv=kf_estimator)
4446
estimator_grid.fit(X, y)
4547
list_label = estimator_grid.best_estimator_.predict(X)

genepi/step5_crossGeneEpistasis_Logistic.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@
88
""""""""""""""""""""""""""""""
99
# import libraries
1010
""""""""""""""""""""""""""""""
11+
import os
12+
import warnings
13+
warnings.filterwarnings('ignore')
14+
# ignore all warnings
15+
warnings.simplefilter("ignore")
16+
os.environ["PYTHONWARNINGS"] = "ignore"
17+
1118
import os
1219
import numpy as np
1320
np.seterr(divide='ignore', invalid='ignore')
@@ -22,11 +29,6 @@
2229
from genepi.step4_singleGeneEpistasis_Logistic import LogisticRegressionL1CV
2330
from genepi.step4_singleGeneEpistasis_Logistic import FeatureEncoderLogistic
2431

25-
import warnings
26-
warnings.filterwarnings('ignore')
27-
# ignore all future warnings
28-
warnings.simplefilter(action='ignore', category=FutureWarning)
29-
3032
""""""""""""""""""""""""""""""
3133
# define functions
3234
""""""""""""""""""""""""""""""
@@ -40,7 +42,7 @@ def LogisticRegressionL1(np_X, np_y, int_nJobs = 4):
4042
cost = [2**x for x in range(-8, 8)]
4143
parameters = [{'C':cost, 'penalty':['l1'], 'dual':[False], 'class_weight':['balanced']}]
4244
kf_estimator = KFold(n_splits=2)
43-
estimator_logistic = linear_model.LogisticRegression()
45+
estimator_logistic = linear_model.LogisticRegression(max_iter=100, solver='liblinear')
4446
estimator_grid = GridSearchCV(estimator_logistic, parameters, scoring='f1', n_jobs=int_nJobs, cv=kf_estimator)
4547
estimator_grid.fit(X, y)
4648
list_label = estimator_grid.best_estimator_.predict(X)

0 commit comments

Comments
 (0)