ADD MULTIPROCESSING

chester · chester · commit 2298136aa1fb · 2019-09-18T12:01:55.000+08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 All notable changes to this project will be documented in this file.
 
+## [2.0.2] - 2019-09-18
+### Added
+- Add multiprocessing for batch processing of genes
+
 ## [2.0.1] - 2019-07-12
 ### Added
 - Support self-defiend genome regions (for any species)
diff --git a/genepi/GenEpi.py b/genepi/GenEpi.py
@@ -12,6 +12,7 @@
 import time
 import os
 import sys
+import multiprocessing as mp
 import genepi
 
 """"""""""""""""""""""""""""""
@@ -36,7 +37,7 @@ def ArgumentsParser():
     ### define arguments for modeling
     parser.add_argument("-m", required=False, default="c", choices=["c", "r"], help="choose model type: c for classification; r for regression")
     parser.add_argument("-k", required=False, default=2, help="k of k-fold cross validation")
-    parser.add_argument("-t", required=False, default=1, help="number of threads")
+    parser.add_argument("-t", required=False, default=mp.cpu_count(), help="number of threads")
     
     ### define arguments for step1_downloadUCSCDB
     parser_group_1 = parser.add_argument_group("update UCSC database")
@@ -91,6 +92,10 @@ def main(args=None):
         str_outputFilePath = args.o
     else:
         str_outputFilePath = os.path.dirname(str_inputFileName_genotype)
+    int_thread = mp.cpu_count()
+    if int(args.t) is not None:
+        if int(args.t) < mp.cpu_count():
+            int_thread = int(args.t)
         
     if str_inputFileName_genotype == "example" and str_inputFileName_phenotype == "example":
         str_command = "cp " + os.path.join(os.path.dirname(genepi.__file__), "example", "sample.csv") + " " + str_outputFilePath
@@ -117,7 +122,7 @@ def main(args=None):
         
         file_outputFile.writelines("\t" + "-m (model type): " + "Classification" if args.m=="c" else "Regression"  + "\n")
         file_outputFile.writelines("\t" + "-k (k-fold cross validation): " + str(args.k) + "\n")
-        file_outputFile.writelines("\t" + "-t (number of threads): " + str(args.t) + "\n" + "\n")
+        file_outputFile.writelines("\t" + "-t (number of threads): " + str(int_thread) + "\n" + "\n")
         
         file_outputFile.writelines("\t" + "--updatedb (enable function of update UCSC database): " + str(args.updatedb) + "\n")
         file_outputFile.writelines("\t" + "-b (human genome build): " + args.b + "\n" + "\n")
@@ -150,27 +155,27 @@ def main(args=None):
         
         if args.m=="c":
             ### step4_singleGeneEpistasis_Logistic (for case/control trial)
-            genepi.BatchSingleGeneEpistasisLogistic(os.path.join(str_outputFilePath, "snpSubsets"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(args.t))
+            genepi.BatchSingleGeneEpistasisLogistic(os.path.join(str_outputFilePath, "snpSubsets"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
             ### step5_crossGeneEpistasis_Logistic (for case/control trial)
-            float_score_train, float_score_test = genepi.CrossGeneEpistasisLogistic(os.path.join(str_outputFilePath, "singleGeneResult"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(args.t))
+            float_score_train, float_score_test = genepi.CrossGeneEpistasisLogistic(os.path.join(str_outputFilePath, "singleGeneResult"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
             file_outputFile.writelines("Overall genetic feature performance (F1 score)" + "\n")
             file_outputFile.writelines("Training: " + str(float_score_train) + "\n")
             file_outputFile.writelines("Testing (" + str(args.k) + "-fold CV): " + str(float_score_test) + "\n" + "\n")
             ### step6_ensembleWithCovariates (for case/control trial)
-            float_score_train, float_score_test = genepi.EnsembleWithCovariatesClassifier(os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(args.t))
+            float_score_train, float_score_test = genepi.EnsembleWithCovariatesClassifier(os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
             file_outputFile.writelines("Ensemble with co-variate performance (F1 score)" + "\n")
             file_outputFile.writelines("Training: " + str(float_score_train) + "\n")
             file_outputFile.writelines("Testing (" + str(args.k) + "-fold CV): " + str(float_score_test) + "\n" + "\n")
         else:
             ### step4_singleGeneEpistasis_Lasso (for quantitative trial)
-            genepi.BatchSingleGeneEpistasisLasso(os.path.join(str_outputFilePath, "snpSubsets"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(args.t))
+            genepi.BatchSingleGeneEpistasisLasso(os.path.join(str_outputFilePath, "snpSubsets"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
             ### step5_crossGeneEpistasis_Lasso (for quantitative trial)
-            float_score_train, float_score_test = genepi.CrossGeneEpistasisLasso(os.path.join(str_outputFilePath, "singleGeneResult"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(args.t))
+            float_score_train, float_score_test = genepi.CrossGeneEpistasisLasso(os.path.join(str_outputFilePath, "singleGeneResult"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
             file_outputFile.writelines("Overall genetic feature performance (Average of the Pearson and Spearman correlation)" + "\n")
             file_outputFile.writelines("Training: " + str(float_score_train) + "\n")
             file_outputFile.writelines("Testing (" + str(args.k) + "-fold CV): " + str(float_score_test) + "\n" + "\n")
             ### step6_ensembleWithCovariates (for quantitative trial)
-            float_score_train, float_score_test = genepi.EnsembleWithCovariatesRegressor(os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(args.t))
+            float_score_train, float_score_test = genepi.EnsembleWithCovariatesRegressor(os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
             file_outputFile.writelines("Ensemble with co-variate performance (Average of the Pearson and Spearman correlation)" + "\n")
             file_outputFile.writelines("Training: " + str(float_score_train) + "\n")
             file_outputFile.writelines("Testing (" + str(args.k) + "-fold CV): " + str(float_score_test) + "\n" + "\n")
diff --git a/genepi/step4_singleGeneEpistasis_Lasso.py b/genepi/step4_singleGeneEpistasis_Lasso.py
@@ -9,6 +9,12 @@
 # import libraries
 """"""""""""""""""""""""""""""
 import os
+import warnings
+warnings.filterwarnings('ignore')
+# ignore all warnings
+warnings.simplefilter("ignore")
+os.environ["PYTHONWARNINGS"] = "ignore"
+
 import sys
 import itertools
 import numpy as np
@@ -21,14 +27,10 @@
 from sklearn.model_selection import KFold
 from sklearn.model_selection import GridSearchCV
 import scipy.stats as stats
+import multiprocessing as mp
 
 from genepi.tools import randomized_l1
 
-import warnings
-warnings.filterwarnings('ignore')
-# ignore all future warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
 """"""""""""""""""""""""""""""
 # define functions 
 """"""""""""""""""""""""""""""
@@ -42,7 +44,7 @@ def RandomizedLassoRegression(np_X, np_y):
     
     return estimator.scores_
 
-def LassoRegressionCV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 4):
+def LassoRegressionCV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 1):
     X = np_X
     y = np_y
     X_sparse = coo_matrix(X)
@@ -56,8 +58,8 @@ def LassoRegressionCV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 4):
         alpha = np.logspace(-10, 10, 200)
         parameters = [{'alpha':alpha}]
         kf_estimator = KFold(n_splits=2)
-        estimator_lasso = linear_model.Lasso()
-        estimator_grid = GridSearchCV(estimator_lasso, parameters, scoring='neg_mean_squared_error', n_jobs=int_nJobs, cv=kf_estimator)
+        estimator_lasso = linear_model.Lasso(max_iter=1000)
+        estimator_grid = GridSearchCV(estimator_lasso, parameters, scoring='neg_mean_squared_error', n_jobs=1, cv=kf_estimator)
         estimator_grid.fit(X[idxTr], y[idxTr])
         list_label = estimator_grid.best_estimator_.predict(X[idxTe])
         list_weight.append([float(item) for item in estimator_grid.best_estimator_.coef_])
@@ -136,7 +138,7 @@ def FilterInLoading(np_genotype, np_phenotype):
 """"""""""""""""""""""""""""""
 # main function
 """"""""""""""""""""""""""""""
-def SingleGeneEpistasisLasso(str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 4):    
+def SingleGeneEpistasisLasso(str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 1):    
     ### set path of output file
     if str_outputFilePath == "":
         str_outputFilePath = os.path.dirname(str_inputFileName_genotype)
@@ -240,7 +242,7 @@ def SingleGeneEpistasisLasso(str_inputFileName_genotype, str_inputFileName_pheno
     
     return float_AVG_S_P
 
-def BatchSingleGeneEpistasisLasso(str_inputFilePath_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 4):
+def BatchSingleGeneEpistasisLasso(str_inputFilePath_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = mp.cpu_count()):
     ### set default output path
     if str_outputFilePath == "":
         str_outputFilePath = os.path.abspath(os.path.join(str_inputFilePath_genotype, os.pardir)) + "/singleGeneResult/"
@@ -254,7 +256,29 @@ def BatchSingleGeneEpistasisLasso(str_inputFilePath_genotype, str_inputFileName_
         if ".gen" in str_fileName:
             list_genotypeFileName.append(str_fileName)
     
-    ### batch PolyLogisticRegression
+    ### batch PolyLassoRegression
+    ### initialize the multiprocessing pool
+    mp_pool = mp.Pool(int_nJobs)
+
+    ### apply the pool to the function that needs to be parallelized
+    dict_result = {}
+    for int_count_gene, float_AVG_S_P in enumerate(mp_pool.starmap(SingleGeneEpistasisLasso, [(os.path.join(str_inputFilePath_genotype, gene), str_inputFileName_phenotype, str_outputFilePath, int_kOfKFold, int_nJobs) for gene in list_genotypeFileName]), 0):
+        if list_genotypeFileName[int_count_gene] not in dict_result:
+            dict_result[list_genotypeFileName[int_count_gene]] = float_AVG_S_P
+        str_print = "step4: Processing: " + "{0:.2f}".format(float(int_count_gene) / len(list_genotypeFileName) * 100) + "% - " + list_genotypeFileName[int_count_gene] + ": " + "\t\t"
+        sys.stdout.write('%s\r' % str_print)
+        sys.stdout.flush()
+
+    mp_pool.close()
+
+    ### output result
+    with open(str_outputFilePath + "All_Lasso_k" + str(int_kOfKFold) + ".csv", "w") as file_outputFile:
+        file_outputFile.writelines("GeneSymbol,AVG_S_P" + "\n")
+        for key, value in dict_result.items():
+            file_outputFile.writelines(key.split("_")[0] + "," + str(value) + "\n")
+
+    '''
+    ### batch PolyLassoRegression
     int_count_gene = 0
     with open(str_outputFilePath + "All_Lasso_k" + str(int_kOfKFold) + ".csv", "w") as file_outputFile:
         file_outputFile.writelines("GeneSymbol,AVG_S_P" + "\n")
@@ -266,5 +290,6 @@ def BatchSingleGeneEpistasisLasso(str_inputFilePath_genotype, str_inputFileName_
             str_print = "step4: Processing: " + "{0:.2f}".format(float(int_count_gene) / len(list_genotypeFileName) * 100) + "% - " + item + ": " + str(float_AVG_S_P) + "\t\t"
             sys.stdout.write('%s\r' % str_print)
             sys.stdout.flush()
-    
+    '''
+
     print("step4: Detect single gene epistasis. DONE! \t\t\t\t")
diff --git a/genepi/step4_singleGeneEpistasis_Logistic.py b/genepi/step4_singleGeneEpistasis_Logistic.py
@@ -8,6 +8,13 @@
 """"""""""""""""""""""""""""""
 # import libraries
 """"""""""""""""""""""""""""""
+import os
+import warnings
+warnings.filterwarnings('ignore')
+# ignore all warnings
+warnings.simplefilter("ignore")
+os.environ["PYTHONWARNINGS"] = "ignore"
+
 import os
 import sys
 import itertools
@@ -22,14 +29,10 @@
 from sklearn.model_selection import GridSearchCV
 import sklearn.metrics as skMetric
 import scipy.stats as stats
+import multiprocessing as mp
 
 from genepi.tools import randomized_l1
 
-import warnings
-warnings.filterwarnings('ignore')
-# ignore all future warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
 """"""""""""""""""""""""""""""
 # define functions 
 """"""""""""""""""""""""""""""
@@ -43,7 +46,7 @@ def RandomizedLogisticRegression(np_X, np_y):
     
     return estimator.scores_
 
-def LogisticRegressionL1CV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 4):
+def LogisticRegressionL1CV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 1):
     X = np_X
     y = np_y
     X_sparse = coo_matrix(X)
@@ -57,8 +60,8 @@ def LogisticRegressionL1CV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 4):
         cost = [2**x for x in range(-8, 8)]
         parameters = [{'C':cost, 'penalty':['l1'], 'dual':[False], 'class_weight':['balanced']}]
         kf_estimator = KFold(n_splits=2)
-        estimator_logistic = linear_model.LogisticRegression()
-        estimator_grid = GridSearchCV(estimator_logistic, parameters, scoring='f1', n_jobs=int_nJobs, cv=kf_estimator)
+        estimator_logistic = linear_model.LogisticRegression(max_iter=100, solver='liblinear')
+        estimator_grid = GridSearchCV(estimator_logistic, parameters, scoring='f1', n_jobs=1, cv=kf_estimator)
         estimator_grid.fit(X[idxTr], y[idxTr])
         list_label = estimator_grid.best_estimator_.predict(X[idxTe])
         list_weight.append([float(item) for item in estimator_grid.best_estimator_.coef_[0]])
@@ -144,7 +147,7 @@ def FilterInLoading(np_genotype, np_phenotype):
 """"""""""""""""""""""""""""""
 # main function
 """"""""""""""""""""""""""""""
-def SingleGeneEpistasisLogistic(str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 4):      
+def SingleGeneEpistasisLogistic(str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 1):      
     ### set path of output file
     if str_outputFilePath == "":
         str_outputFilePath = os.path.dirname(str_inputFileName_genotype)
@@ -253,7 +256,7 @@ def SingleGeneEpistasisLogistic(str_inputFileName_genotype, str_inputFileName_ph
     
     return float_f1Score
 
-def BatchSingleGeneEpistasisLogistic(str_inputFilePath_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 4):
+def BatchSingleGeneEpistasisLogistic(str_inputFilePath_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = mp.cpu_count()):
     ### set default output path
     if str_outputFilePath == "":
         str_outputFilePath = os.path.abspath(os.path.join(str_inputFilePath_genotype, os.pardir)) + "/singleGeneResult/"
@@ -266,7 +269,29 @@ def BatchSingleGeneEpistasisLogistic(str_inputFilePath_genotype, str_inputFileNa
     for str_fileName in os.listdir(str_inputFilePath_genotype):
         if ".gen" in str_fileName:
             list_genotypeFileName.append(str_fileName)
+
+    ### batch PolyLogisticRegression
+    ### initialize the multiprocessing pool
+    mp_pool = mp.Pool(int_nJobs)
     
+    ### apply the pool to the function that needs to be parallelized
+    dict_result = {}
+    for int_count_gene, float_f1Score in enumerate(mp_pool.starmap(SingleGeneEpistasisLogistic, [(os.path.join(str_inputFilePath_genotype, gene), str_inputFileName_phenotype, str_outputFilePath, int_kOfKFold, int_nJobs) for gene in list_genotypeFileName]), 0):
+        if list_genotypeFileName[int_count_gene] not in dict_result:
+            dict_result[list_genotypeFileName[int_count_gene]] = float_f1Score
+        str_print = "step4: Processing: " + "{0:.2f}".format(float(int_count_gene) / len(list_genotypeFileName) * 100) + "% - " + list_genotypeFileName[int_count_gene] + ": " + "\t\t"
+        sys.stdout.write('%s\r' % str_print)
+        sys.stdout.flush()
+
+    mp_pool.close()
+
+    ### output result
+    with open(str_outputFilePath + "All_Logistic_k" + str(int_kOfKFold) + ".csv", "w") as file_outputFile:
+        file_outputFile.writelines("GeneSymbol,F1Score" + "\n")
+        for key, value in dict_result.items():
+            file_outputFile.writelines(key.split("_")[0] + "," + str(value) + "\n")
+
+    '''
     ### batch PolyLogisticRegression
     int_count_gene = 0
     with open(str_outputFilePath + "All_Logistic_k" + str(int_kOfKFold) + ".csv", "w") as file_outputFile:
@@ -279,5 +304,6 @@ def BatchSingleGeneEpistasisLogistic(str_inputFilePath_genotype, str_inputFileNa
             str_print = "step4: Processing: " + "{0:.2f}".format(float(int_count_gene) / len(list_genotypeFileName) * 100) + "% - " + item + ": " + str(float_f1Score) + "\t\t"
             sys.stdout.write('%s\r' % str_print)
             sys.stdout.flush()
+    '''
     
     print("step4: Detect single gene epistasis. DONE! \t\t\t\t")
diff --git a/genepi/step5_crossGeneEpistasis_Lasso.py b/genepi/step5_crossGeneEpistasis_Lasso.py
@@ -8,6 +8,13 @@
 """"""""""""""""""""""""""""""
 # import libraries
 """"""""""""""""""""""""""""""
+import os
+import warnings
+warnings.filterwarnings('ignore')
+# ignore all warnings
+warnings.simplefilter("ignore")
+os.environ["PYTHONWARNINGS"] = "ignore"
+
 import os
 import numpy as np
 np.seterr(divide='ignore', invalid='ignore')
@@ -21,11 +28,6 @@
 from genepi.step4_singleGeneEpistasis_Lasso import LassoRegressionCV
 from genepi.step4_singleGeneEpistasis_Lasso import FeatureEncoderLasso
 
-import warnings
-warnings.filterwarnings('ignore')
-# ignore all future warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
 """"""""""""""""""""""""""""""
 # define functions 
 """"""""""""""""""""""""""""""
@@ -39,7 +41,7 @@ def LassoRegression(np_X, np_y, int_nJobs = 4):
     alpha = np.logspace(-10, 10, 200)
     parameters = [{'alpha':alpha}]
     kf_estimator = KFold(n_splits=2)
-    estimator_lasso = linear_model.Lasso()
+    estimator_lasso = linear_model.Lasso(max_iter=1000)
     estimator_grid = GridSearchCV(estimator_lasso, parameters, scoring='neg_mean_squared_error', n_jobs=int_nJobs, cv=kf_estimator)
     estimator_grid.fit(X, y)
     list_label = estimator_grid.best_estimator_.predict(X)
diff --git a/genepi/step5_crossGeneEpistasis_Logistic.py b/genepi/step5_crossGeneEpistasis_Logistic.py
@@ -8,6 +8,13 @@
 """"""""""""""""""""""""""""""
 # import libraries
 """"""""""""""""""""""""""""""
+import os
+import warnings
+warnings.filterwarnings('ignore')
+# ignore all warnings
+warnings.simplefilter("ignore")
+os.environ["PYTHONWARNINGS"] = "ignore"
+
 import os
 import numpy as np
 np.seterr(divide='ignore', invalid='ignore')
@@ -22,11 +29,6 @@
 from genepi.step4_singleGeneEpistasis_Logistic import LogisticRegressionL1CV
 from genepi.step4_singleGeneEpistasis_Logistic import FeatureEncoderLogistic
 
-import warnings
-warnings.filterwarnings('ignore')
-# ignore all future warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
 """"""""""""""""""""""""""""""
 # define functions 
 """"""""""""""""""""""""""""""
@@ -40,7 +42,7 @@ def LogisticRegressionL1(np_X, np_y, int_nJobs = 4):
     cost = [2**x for x in range(-8, 8)]
     parameters = [{'C':cost, 'penalty':['l1'], 'dual':[False], 'class_weight':['balanced']}]
     kf_estimator = KFold(n_splits=2)
-    estimator_logistic = linear_model.LogisticRegression()
+    estimator_logistic = linear_model.LogisticRegression(max_iter=100, solver='liblinear')
     estimator_grid = GridSearchCV(estimator_logistic, parameters, scoring='f1', n_jobs=int_nJobs, cv=kf_estimator)
     estimator_grid.fit(X, y)
     list_label = estimator_grid.best_estimator_.predict(X)
diff --git a/genepi/step6_ensembleWithCovariates.py b/genepi/step6_ensembleWithCovariates.py
diff --git a/setup.py b/setup.py