Merge pull request #23 from ArnovanHilten/dev

ArnovanHilten · web-flow · commit 94e134ecb4ef · 2020-09-28T15:51:23.000+02:00
Topology Update
diff --git a/GenNet.py b/GenNet.py
@@ -9,6 +9,7 @@
 from GenNet_utils.Create_plots import plot
 from GenNet_utils.Train_network import train_classification, train_regression
 from GenNet_utils.Convert import convert
+from GenNet_utils.Topology import topology
 
 
 def main(args):
@@ -23,6 +24,9 @@ def main(args):
         plot(args)
     if args.mode == 'convert':
         convert(args)
+    if args.mode == "topology":
+        topology(args)
+
 
 
 if __name__ == '__main__':
@@ -37,7 +41,7 @@ def main(args):
     parser_convert.add_argument('-variants', type=str,
                                 help="Path to file with row numbers of variants to include, if none is "
                                      "given all variants will be used", default=None)
-    parser_convert.add_argument("-o", "--out", type=str, required=True, help="path to save result folder")
+    parser_convert.add_argument("-o", "--out", type=str, default=os.getcwd() + '/processed_data/', help="path for saving the results, default ./processed_data")
     parser_convert.add_argument('-ID', action='store_true', default=False,
                                 help='Flag to convert minimac data to genotype per subject files first (default False)')
 
@@ -116,7 +120,29 @@ def main(args):
         metavar="Layer_number:",
         default=0
     )
-
+    parser_topology = subparsers.add_parser("topology", help="Create standard topology files")
+    parser_topology.add_argument(
+        "type",
+        default='create_annovar_input', type=str,
+        choices=['create_annovar_input', 'create_gene_network'],
+        help="Create annovar input, create gene network topology from annovar output"
+    )
+    parser_topology.add_argument(
+        "path",
+        type=str,
+        help="Path to the input data. For create_annovar_input this is the folder containing hase: genotype, "
+             "probes and individuals "
+    )
+    parser_topology.add_argument(
+        'study_name',
+        type=str,
+        help='Study name used in Convert. Name of the files in the genotype individuals and probe folders'
+    )
+    parser_topology.add_argument(
+        "-out",
+        type=str,
+        help="Path. Where to save the result, default ./processed_data",
+        default=os.getcwd() + '/processed_data/'
+    )
     args = parser.parse_args()
-
     main(args)
diff --git a/GenNet_utils/Convert.py b/GenNet_utils/Convert.py
@@ -201,24 +201,43 @@ def transpose_genotype(args, hdf_name):
     print("Completed", args.study_name)
 
 
+def exclude_variants_probes(args):
+    """Write the probes rows selected by ``args.variants`` to ``<study_name>_selected.h5``, replacing any file of an earlier run."""
+    used_indices = pd.read_csv(args.variants, header=None)
+    used_indices = used_indices.index.values[used_indices.values.flatten()]
+    probes = pd.read_hdf(args.outfolder + '/probes/' + args.study_name + '.h5', mode="r")
+    print("Probes shape", probes.shape)
+    print("Selecting variants..")
+    probes = probes.iloc[used_indices]
+    print("Probes shape", probes.shape)
+    probes.to_hdf(args.outfolder + '/probes/' + args.study_name + '_selected.h5', key='probes', mode='w',
+                  format='table', data_columns=True, append=True, complib='zlib', complevel=9, min_itemsize=45)
+
 def convert(args):
-    hase_convert(args)
+    # 1. hase
     if type(args.out) is list:
         args.outfolder = args.out[0]
     else:
         args.outfolder = args.out
 
+    if (os.path.exists(args.outfolder + '/probes/')) and (os.path.exists(args.outfolder + '/genotype/')) and (os.path.exists(args.outfolder + '/individuals/')):
+        print("The folders: probes, genotype and individuals already exist. Data seems already in HASE format. Delete "
+              "the folders if the files are not converted properly. Continuing with the current files:")
+    else:
+        hase_convert(args)
+
+    # 2. converting multiple lists into single string
     if type(args.study_name) is list:
         args.study_name = args.study_name[0]
     else:
         args.study_name = args.study_name
 
     merge_hdf5_hase(args)
     hdf5_name = impute_hase_hdf5(args)
-
     if args.variants is None:
         pass
+
     else:
         hdf5_name = exclude_variants(args)
-
+        exclude_variants_probes(args)
     transpose_genotype(args, hdf_name=hdf5_name)
diff --git a/GenNet_utils/Create_network.py b/GenNet_utils/Create_network.py
@@ -11,7 +11,7 @@
 import tensorflow as tf
 import tensorflow.keras as K
 import scipy
-
+import tables
 tf.keras.backend.set_epsilon(0.0000001)
 tf_version = tf.__version__  # ToDo use packaging.version
 if tf_version <= '1.13.1':
@@ -38,7 +38,9 @@ def layer_block(model, mask, i):
     columns = list(network_csv.columns.values)
     network_csv = network_csv.sort_values(by=columns, ascending=True)
 
-    inputsize = len(network_csv)
+    h5file = tables.open_file(datapath + "genotype.h5", "r")
+    inputsize = h5file.root.data.shape[1]
+    h5file.close()
 
     input_layer = K.Input((inputsize,), name='input_layer')
     model = K.layers.Reshape(input_shape=(inputsize,), target_shape=(inputsize, 1))(input_layer)
@@ -47,9 +49,11 @@ def layer_block(model, mask, i):
         network_csv2 = network_csv.drop_duplicates(columns[i])
         matrix_ones = np.ones(len(network_csv2[[columns[i], columns[i + 1]]]), np.bool)
         matrix_coord = (network_csv2[columns[i]].values, network_csv2[columns[i + 1]].values)
-        mask = scipy.sparse.coo_matrix(((matrix_ones), matrix_coord),
-                                       shape=(network_csv2[columns[i]].max() + 1,
-                                              network_csv2[columns[i + 1]].max() + 1))
+        if i == 0:
+            matrixshape = (inputsize, network_csv2[columns[i + 1]].max() + 1)
+        else:
+            matrixshape = (network_csv2[columns[i]].max() + 1, network_csv2[columns[i + 1]].max() + 1)
+        mask = scipy.sparse.coo_matrix(((matrix_ones), matrix_coord), shape = matrixshape)
         masks.append(mask)
         model = layer_block(model, mask, i)
 
diff --git a/GenNet_utils/Create_plots.py b/GenNet_utils/Create_plots.py
@@ -9,21 +9,6 @@
 from GenNet_utils.Utility_functions import query_yes_no, get_paths
 
 
-def plot(args):
-    folder, resultpath = get_paths(args.ID)
-    importance_csv = pd.read_csv(resultpath + "/connection_weights.csv", index_col=0)
-    layer = args.layer_n
-    if args.type == "layer_weight":
-        plot_layer_weight(resultpath, importance_csv, layer=layer, num_annotated=10)
-    elif args.type == "circos":
-        cicos_plot(resultpath=resultpath, importance_csv=importance_csv, plot_weights=False, plot_arrows=True)
-    elif args.type == "raw_importance":
-        manhattan_importance(resultpath=resultpath, importance_csv=importance_csv)
-    else:
-        print("invalid type:", args.type)
-        exit()
-
-
 def cicos_plot(resultpath, importance_csv, plot_weights=True, plot_arrows=False):
     print("in progress...")
     colormap = ['#7dcfe2', '#4b78b5', 'darkgrey', 'dimgray'] * 1000
@@ -81,7 +66,11 @@ def plot_layer_weight(resultpath, importance_csv, layer=0, num_annotated=10):
 
     plt.figure(figsize=(20, 10))
     colormap = ['#7dcfe2', '#4b78b5', 'darkgrey', 'dimgray'] * 1000
-    color_end = np.sort(csv_file.groupby("node_layer_" + str(layer + 1))["node_layer_" + str(layer)].max().values)
+
+    if "chr" in csv_file.columns:
+        color_end = np.sort(csv_file.groupby("chr")["node_layer_" + str(layer)].max().values)
+    else:
+        color_end = np.sort(csv_file.groupby("node_layer_" + str(layer + 1))["node_layer_" + str(layer)].max().values)
     color_end = np.insert(color_end, 0, 0)
 
     csv_file = csv_file[["node_layer_" + str(layer), "node_layer_" + str(layer + 1), "weights_" + str(layer),
@@ -142,21 +131,38 @@ def plot_layer_weight(resultpath, importance_csv, layer=0, num_annotated=10):
 def manhattan_importance(resultpath, importance_csv, num_annotated=10):
     csv_file = importance_csv.copy()
     plt.figure(figsize=(20, 10))
-    colormap = ['#7dcfe2', '#4b78b5', 'darkgrey', 'dimgray'] * 1000
-    color_end = np.sort(csv_file.groupby("node_layer_1")["node_layer_0"].max().values)
-    color_end = np.insert(color_end, 0, 0)
+
+    gene_middle = []
+
+    if "chr" in csv_file.columns:
+        color_end = np.sort(csv_file.groupby("chr")["node_layer_0"].max().values)
+        print('coloring per chromosome')
+        color_end = np.insert(color_end, 0, 0)
+        for i in range(len(color_end) - 1):
+            gene_middle.append((color_end[i] + color_end[i + 1]) / 2)
+    else:
+        color_end = np.sort(csv_file.groupby("node_layer_1")["node_layer_0"].max().values)
+        color_end = np.insert(color_end, 0, 0)
+        print("no chr information continuing by coloring per group in node_layer_1")
 
     weights = abs(csv_file["raw_importance"])
     weights = weights / max(weights)
     x = np.arange(len(weights))
 
+    print(len(color_end), "color groups")
+    colormap = ['#7dcfe2', '#4b78b5', 'darkgrey', 'dimgray'] * len(color_end)
+
     for i in range(len(color_end) - 1):
         plt.scatter(x[color_end[i]:color_end[i + 1]], weights[color_end[i]:color_end[i + 1]], c=colormap[i])
 
     plt.ylim(bottom=0, top=1.2)
     plt.xlim(0, len(weights) + int(len(weights) / 100))
-    plt.title("Raw importance for each path", size=36)
-    plt.xlabel("Path", size=18)
+    plt.title("Raw Importance Manhattan", size=36)
+    if len(gene_middle) > 1:
+        plt.xticks(gene_middle, np.arange(len(gene_middle)) + 1, size=16)
+        plt.xlabel("Chromosome", size=18)
+    else:
+        plt.xlabel("Chromosome position", size=18)
     plt.ylabel("Weights", size=18)
 
     csv_file["pos"] = x
@@ -179,3 +185,19 @@ def manhattan_importance(resultpath, importance_csv, num_annotated=10):
 
     plt.savefig(resultpath + "Path_importance.png", bbox_inches='tight', pad_inches=0)
     plt.show()
+
+
+def plot(args):
+    """Plot connection_weights.csv from the result path of ``get_paths(args.ID)`` in the style ``args.type``."""
+    _, resultpath = get_paths(args.ID)
+    importance_csv = pd.read_csv(resultpath + "/connection_weights.csv", index_col=0)
+    print(resultpath)
+    layer = args.layer_n
+    if args.type == "layer_weight":
+        plot_layer_weight(resultpath, importance_csv, layer=layer, num_annotated=10)
+    elif args.type == "circos":
+        cicos_plot(resultpath=resultpath, importance_csv=importance_csv, plot_weights=False, plot_arrows=True)
+    elif args.type == "raw_importance":
+        manhattan_importance(resultpath=resultpath, importance_csv=importance_csv)
+    else:
+        raise SystemExit("invalid type: " + str(args.type))  # reported on stderr, exit status 1
diff --git a/GenNet_utils/Topology.py b/GenNet_utils/Topology.py
@@ -0,0 +1,119 @@
+import os
+
+import numpy as np
+import pandas as pd
+
+
+def Create_Annovar_input(args):
+    """Write the tab-separated Annovar input file for a study stored in HASE format.
+
+    Probes are read from ``args.path``/probes (the ``<study_name>_selected.h5`` file is preferred when it
+    exists) and the hashed ``allele1``/``allele2`` values are replaced through the study's hash table.
+    The file ``annovar_input_<study_name>.csv`` is written to ``args.out`` (created when missing) with the
+    columns CHR, bp, bp2, allele1, allele2 and the row index, without a header.
+    Instructions for running Annovar on that file are printed afterwards.
+
+    Args:
+        args: parsed command line arguments; ``path``, ``study_name`` and ``out`` are used.
+    """
+    hasepath = args.path
+    studyname = args.study_name
+    savepath = args.out
+    probe_prefix = hasepath + '/probes/' + studyname
+
+    # Prefer the probes file with the "_selected" suffix when it exists.
+    if os.path.exists(probe_prefix + '_selected.h5'):
+        probes = pd.read_hdf(probe_prefix + '_selected.h5', mode="r")
+    else:
+        probes = pd.read_hdf(probe_prefix + '.h5', mode="r")
+        print(probes.shape)
+
+    if os.path.exists(probe_prefix + '_hash_table.csv.gz'):
+        hashtable = pd.read_csv(probe_prefix + '_hash_table.csv.gz', compression="gzip", sep='\t')
+    else:
+        hashtable = pd.read_csv(probe_prefix + '_hash_table.csv', sep='\t')
+
+    # The hash table is read once; every pass merges a fresh copy so the second allele sees a clean table.
+    unhashed_probes = probes
+    for allele_col in ('allele1', 'allele2'):
+        lookup = hashtable.assign(**{allele_col: hashtable['keys']})
+        unhashed_probes = unhashed_probes.merge(lookup, on=allele_col, how="left")
+        unhashed_probes = unhashed_probes.drop(columns=["keys", allele_col])
+        unhashed_probes = unhashed_probes.rename(columns={'allele': allele_col})
+
+    # clean
+    annovar_input = unhashed_probes.drop(columns=["ID", "distance"])
+    annovar_input["bp2"] = annovar_input["bp"]
+    # The index of the merged frame is stored as the last column of the Annovar input.
+    annovar_input["index_col"] = annovar_input.index
+    annovar_input = annovar_input[['CHR', 'bp', "bp2", "allele1", "allele2", "index_col"]]
+
+    # Variants are not filtered here; a selection made in Convert arrives through the "_selected" probes file.
+
+    print('Number of variants', annovar_input.shape)
+
+    # Create the output folder first, so a missing ./processed_data does not fail the write below.
+    os.makedirs(savepath, exist_ok=True)
+    annovar_input_path = savepath + '/annovar_input_' + studyname + '.csv'
+    annovar_input.to_csv(annovar_input_path, sep="\t", index=False, header=False)
+
+    print('\n')
+    print('Annovar input files ready \n')
+    print("Install annovar: https://doc-openbio.readthedocs.io/projects/annovar/en/latest/user-guide/download/")
+    print("Navigate to annovar, e.g cd /home/charlesdarwin/annovar/")
+    print("Update annovar:\n perl annotate_variation.pl -buildver hg19 -downdb -webfrom annovar refGene humandb/")
+    print("Run:\n perl annotate_variation.pl -geneanno -dbtype refGene -buildver hg19 " + str(
+        savepath) + "/annovar_input_" + str(studyname) + ".csv humandb --outfile " + str(savepath) + "/" + str(
+        studyname) + "_RefGene")
+    print('\n')
+    print(
+        'After obtaining the Annovar annotations, run topology create_gene_network to get the topology file for the SNPs-gene-output network:')
+
+
+def Create_gene_network_topology(args):
+    """Build topology.csv (variant -> gene) from Annovar's ``<study_name>_RefGene.variant_function`` file.
+
+    Reads the annotation from ``args.path`` and writes gene_network_description.csv and topology.csv to
+    ``args.out`` (created when missing); rows whose gene is "NONE" or that contain missing values are dropped.
+    """
+    datapath = args.path + '/'
+    studyname = args.study_name
+    savepath = args.out + '/'
+
+    print(args.study_name)
+    gene_annotation = pd.read_csv(datapath + str(studyname) + "_RefGene.variant_function", sep='\t', header=None)
+    gene_annotation.columns = ['into/exonic', 'gene', 'chr', 'bps', 'bpe', "mutation1", "mutation2", 'index_col']
+    # Strip list/distance suffixes from the gene name; regex=True is needed as pandas >= 2.0 defaults to literal.
+    for pattern in (r"\,.*", r"\(.*\)", r"\(.*", r"\;.*"):
+        gene_annotation['gene'] = gene_annotation['gene'].str.replace(pattern, "", regex=True)
+    gene_annotation = gene_annotation[(gene_annotation['gene'] != "NONE")]
+    gene_annotation = gene_annotation.dropna()
+
+    gene_list = gene_annotation.drop_duplicates("gene")
+    gene_list = gene_list.sort_values(by=["chr", "bps"], ascending=[True, True])
+    gene_list["gene_id"] = np.arange(len(gene_list))
+    gene_list = gene_list[["gene", "gene_id"]]
+
+    gene_annotation = gene_annotation.merge(gene_list, on="gene")
+    gene_annotation = gene_annotation.sort_values(by="index_col", ascending=True)
+
+    gene_annotation = gene_annotation.assign(
+        chrbp='chr' + gene_annotation.chr.astype(str) + ':' + gene_annotation.bps.astype(str))
+    os.makedirs(savepath, exist_ok=True)  # the output folder may not exist yet
+    gene_annotation.to_csv(savepath + "/gene_network_description.csv")
+
+    topology = gene_annotation[["chr", "index_col", "chrbp", "gene_id", "gene"]]
+    print(topology['index_col'].max())
+    topology.columns = ['chr', 'layer0_node', 'layer0_name', 'layer1_node', 'layer1_name']
+    topology.to_csv(savepath + "/topology.csv")
+    print('Topology file saved:', savepath + "/topology.csv")
+
+
+def topology(args):
+    """Run the topology step named by ``args.type``; an unknown type ends the program with exit status 1."""
+    if args.type == 'create_annovar_input':
+        Create_Annovar_input(args)
+    elif args.type == 'create_gene_network':
+        Create_gene_network_topology(args)
+    else:
+        raise SystemExit("invalid type: " + str(args.type))  # reported on stderr, non-zero status
diff --git a/GenNet_utils/Utility_functions.py b/GenNet_utils/Utility_functions.py
@@ -131,6 +131,10 @@ def create_importance_csv(datapath, model, masks):
     coordinate_list = []
     for i, mask in zip(np.arange(len(masks)), masks):
         coordinates = pd.DataFrame([])
+
+        if (i == 0):
+            if 'chr' in network_csv.columns:
+                coordinates["chr"] = network_csv["chr"]
         coordinates["node_layer_" + str(i)] = mask.row
         coordinates["node_layer_" + str(i + 1)] = mask.col
         coordinates = coordinates.sort_values("node_layer_" + str(i), ascending=True)