Skip to content

Commit 94e134e

Browse files
Merge pull request #23 from ArnovanHilten/dev
Topology Update
2 parents 33408bd + 44a63d1 commit 94e134e

File tree

6 files changed

+226
-32
lines changed

6 files changed

+226
-32
lines changed

GenNet.py

+29-3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from GenNet_utils.Create_plots import plot
1010
from GenNet_utils.Train_network import train_classification, train_regression
1111
from GenNet_utils.Convert import convert
12+
from GenNet_utils.Topology import topology
1213

1314

1415
def main(args):
@@ -23,6 +24,9 @@ def main(args):
2324
plot(args)
2425
if args.mode == 'convert':
2526
convert(args)
27+
if args.mode == "topology":
28+
topology(args)
29+
2630

2731

2832
if __name__ == '__main__':
@@ -37,7 +41,7 @@ def main(args):
3741
parser_convert.add_argument('-variants', type=str,
3842
help="Path to file with row numbers of variants to include, if none is "
3943
"given all variants will be used", default=None)
40-
parser_convert.add_argument("-o", "--out", type=str, required=True, help="path to save result folder")
44+
parser_convert.add_argument("-o", "--out", type=str, default=os.getcwd() + '/processed_data/', help="path for saving the results, default ./processed_data")
4145
parser_convert.add_argument('-ID', action='store_true', default=False,
4246
help='Flag to convert minimac data to genotype per subject files first (default False)')
4347

@@ -116,7 +120,29 @@ def main(args):
116120
metavar="Layer_number:",
117121
default=0
118122
)
119-
123+
parser_topology = subparsers.add_parser("topology", help="Create standard topology files")
124+
parser_topology.add_argument(
125+
"type",
126+
default='create_annovar_input', type=str,
127+
choices=['create_annovar_input', 'create_gene_network'],
128+
help="Create annovar input, create gene network topology from annovar output"
129+
)
130+
parser_topology.add_argument(
131+
"path",
132+
type=str,
133+
help="Path to the input data. For create_annovar_input this is the folder containing hase: genotype, "
134+
"probes and individuals "
135+
)
136+
parser_topology.add_argument(
137+
'study_name',
138+
type=str,
139+
help='Study name used in Convert. Name of the files in the genotype individuals and probe folders'
140+
)
141+
parser_topology.add_argument(
142+
"-out",
143+
type=str,
144+
help="Path. Where to save the result, default ./processed_data",
145+
default=os.getcwd() + '/processed_data/'
146+
)
120147
args = parser.parse_args()
121-
122148
main(args)

GenNet_utils/Convert.py

+22-3
Original file line numberDiff line numberDiff line change
@@ -201,24 +201,43 @@ def transpose_genotype(args, hdf_name):
201201
print("Completed", args.study_name)
202202

203203

204+
def exclude_variants_probes(args):
205+
used_indices = pd.read_csv(args.variants, header=None)
206+
used_indices = used_indices.index.values[used_indices.values.flatten()]
207+
probes = pd.read_hdf(args.outfolder + '/probes/' + args.study_name + '.h5', mode="r")
208+
print("Probes shape", probes.shape)
209+
print("Selecting variants..")
210+
probes = probes.iloc[used_indices]
211+
print("Probes shape", probes.shape)
212+
probes.to_hdf(args.outfolder + '/probes/' + args.study_name + '_selected.h5', key='probes', format='table',
213+
data_columns=True, append=True,
214+
complib='zlib', complevel=9, min_itemsize=45)
215+
204216
def convert(args):
205-
hase_convert(args)
217+
# 1. hase
206218
if type(args.out) is list:
207219
args.outfolder = args.out[0]
208220
else:
209221
args.outfolder = args.out
210222

223+
if (os.path.exists(args.outfolder + '/probes/')) and (os.path.exists(args.outfolder + '/genotype/')) and (os.path.exists(args.outfolder + '/individuals/')):
224+
print("The folders: probes, genotype and individuals already exist. Data seems already in HASE format. Delete "
225+
"the folders if the files are not converted properly. Continuing with the current files:")
226+
else:
227+
hase_convert(args)
228+
229+
# 2. converting multiple lists into single string
211230
if type(args.study_name) is list:
212231
args.study_name = args.study_name[0]
213232
else:
214233
args.study_name = args.study_name
215234

216235
merge_hdf5_hase(args)
217236
hdf5_name = impute_hase_hdf5(args)
218-
219237
if args.variants is None:
220238
pass
239+
221240
else:
222241
hdf5_name = exclude_variants(args)
223-
242+
exclude_variants_probes(args)
224243
transpose_genotype(args, hdf_name=hdf5_name)

GenNet_utils/Create_network.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import tensorflow as tf
1212
import tensorflow.keras as K
1313
import scipy
14-
14+
import tables
1515
tf.keras.backend.set_epsilon(0.0000001)
1616
tf_version = tf.__version__ # ToDo use packaging.version
1717
if tf_version <= '1.13.1':
@@ -38,7 +38,9 @@ def layer_block(model, mask, i):
3838
columns = list(network_csv.columns.values)
3939
network_csv = network_csv.sort_values(by=columns, ascending=True)
4040

41-
inputsize = len(network_csv)
41+
h5file = tables.open_file(datapath + "genotype.h5", "r")
42+
inputsize = h5file.root.data.shape[1]
43+
h5file.close()
4244

4345
input_layer = K.Input((inputsize,), name='input_layer')
4446
model = K.layers.Reshape(input_shape=(inputsize,), target_shape=(inputsize, 1))(input_layer)
@@ -47,9 +49,11 @@ def layer_block(model, mask, i):
4749
network_csv2 = network_csv.drop_duplicates(columns[i])
4850
matrix_ones = np.ones(len(network_csv2[[columns[i], columns[i + 1]]]), np.bool)
4951
matrix_coord = (network_csv2[columns[i]].values, network_csv2[columns[i + 1]].values)
50-
mask = scipy.sparse.coo_matrix(((matrix_ones), matrix_coord),
51-
shape=(network_csv2[columns[i]].max() + 1,
52-
network_csv2[columns[i + 1]].max() + 1))
52+
if i == 0:
53+
matrixshape = (inputsize, network_csv2[columns[i + 1]].max() + 1)
54+
else:
55+
matrixshape = (network_csv2[columns[i]].max() + 1, network_csv2[columns[i + 1]].max() + 1)
56+
mask = scipy.sparse.coo_matrix(((matrix_ones), matrix_coord), shape = matrixshape)
5357
masks.append(mask)
5458
model = layer_block(model, mask, i)
5559

GenNet_utils/Create_plots.py

+43-21
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,6 @@
99
from GenNet_utils.Utility_functions import query_yes_no, get_paths
1010

1111

12-
def plot(args):
13-
folder, resultpath = get_paths(args.ID)
14-
importance_csv = pd.read_csv(resultpath + "/connection_weights.csv", index_col=0)
15-
layer = args.layer_n
16-
if args.type == "layer_weight":
17-
plot_layer_weight(resultpath, importance_csv, layer=layer, num_annotated=10)
18-
elif args.type == "circos":
19-
cicos_plot(resultpath=resultpath, importance_csv=importance_csv, plot_weights=False, plot_arrows=True)
20-
elif args.type == "raw_importance":
21-
manhattan_importance(resultpath=resultpath, importance_csv=importance_csv)
22-
else:
23-
print("invalid type:", args.type)
24-
exit()
25-
26-
2712
def cicos_plot(resultpath, importance_csv, plot_weights=True, plot_arrows=False):
2813
print("in progress...")
2914
colormap = ['#7dcfe2', '#4b78b5', 'darkgrey', 'dimgray'] * 1000
@@ -81,7 +66,11 @@ def plot_layer_weight(resultpath, importance_csv, layer=0, num_annotated=10):
8166

8267
plt.figure(figsize=(20, 10))
8368
colormap = ['#7dcfe2', '#4b78b5', 'darkgrey', 'dimgray'] * 1000
84-
color_end = np.sort(csv_file.groupby("node_layer_" + str(layer + 1))["node_layer_" + str(layer)].max().values)
69+
70+
if "chr" in csv_file.columns:
71+
color_end = np.sort(csv_file.groupby("chr")["node_layer_" + str(layer)].max().values)
72+
else:
73+
color_end = np.sort(csv_file.groupby("node_layer_" + str(layer + 1))["node_layer_" + str(layer)].max().values)
8574
color_end = np.insert(color_end, 0, 0)
8675

8776
csv_file = csv_file[["node_layer_" + str(layer), "node_layer_" + str(layer + 1), "weights_" + str(layer),
@@ -142,21 +131,38 @@ def plot_layer_weight(resultpath, importance_csv, layer=0, num_annotated=10):
142131
def manhattan_importance(resultpath, importance_csv, num_annotated=10):
143132
csv_file = importance_csv.copy()
144133
plt.figure(figsize=(20, 10))
145-
colormap = ['#7dcfe2', '#4b78b5', 'darkgrey', 'dimgray'] * 1000
146-
color_end = np.sort(csv_file.groupby("node_layer_1")["node_layer_0"].max().values)
147-
color_end = np.insert(color_end, 0, 0)
134+
135+
gene_middle = []
136+
137+
if "chr" in csv_file.columns:
138+
color_end = np.sort(csv_file.groupby("chr")["node_layer_0"].max().values)
139+
print('coloring per chromosome')
140+
color_end = np.insert(color_end, 0, 0)
141+
for i in range(len(color_end) - 1):
142+
gene_middle.append((color_end[i] + color_end[i + 1]) / 2)
143+
else:
144+
color_end = np.sort(csv_file.groupby("node_layer_1")["node_layer_0"].max().values)
145+
color_end = np.insert(color_end, 0, 0)
146+
print("no chr information continuing by coloring per group in node_layer_1")
148147

149148
weights = abs(csv_file["raw_importance"])
150149
weights = weights / max(weights)
151150
x = np.arange(len(weights))
152151

152+
print(len(color_end), "color groups")
153+
colormap = ['#7dcfe2', '#4b78b5', 'darkgrey', 'dimgray'] * len(color_end)
154+
153155
for i in range(len(color_end) - 1):
154156
plt.scatter(x[color_end[i]:color_end[i + 1]], weights[color_end[i]:color_end[i + 1]], c=colormap[i])
155157

156158
plt.ylim(bottom=0, top=1.2)
157159
plt.xlim(0, len(weights) + int(len(weights) / 100))
158-
plt.title("Raw importance for each path", size=36)
159-
plt.xlabel("Path", size=18)
160+
plt.title("Raw Importance Manhattan", size=36)
161+
if len(gene_middle) > 1:
162+
plt.xticks(gene_middle, np.arange(len(gene_middle)) + 1, size=16)
163+
plt.xlabel("Chromosome", size=18)
164+
else:
165+
plt.xlabel("Chromosome position", size=18)
160166
plt.ylabel("Weights", size=18)
161167

162168
csv_file["pos"] = x
@@ -179,3 +185,19 @@ def manhattan_importance(resultpath, importance_csv, num_annotated=10):
179185

180186
plt.savefig(resultpath + "Path_importance.png", bbox_inches='tight', pad_inches=0)
181187
plt.show()
188+
189+
190+
def plot(args):
191+
folder, resultpath = get_paths(args.ID)
192+
importance_csv = pd.read_csv(resultpath + "/connection_weights.csv", index_col=0)
193+
print(resultpath)
194+
layer = args.layer_n
195+
if args.type == "layer_weight":
196+
plot_layer_weight(resultpath, importance_csv, layer=layer, num_annotated=10)
197+
elif args.type == "circos":
198+
cicos_plot(resultpath=resultpath, importance_csv=importance_csv, plot_weights=False, plot_arrows=True)
199+
elif args.type == "raw_importance":
200+
manhattan_importance(resultpath=resultpath, importance_csv=importance_csv)
201+
else:
202+
print("invalid type:", args.type)
203+
exit()

GenNet_utils/Topology.py

+119
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
import os
2+
3+
import numpy as np
4+
import pandas as pd
5+
6+
7+
def Create_Annovar_input(args):
8+
hasepath = args.path
9+
studyname = args.study_name
10+
savepath = args.out
11+
12+
if os.path.exists(hasepath + '/probes/' + studyname + '_selected.h5'):
13+
probes = pd.read_hdf(hasepath + '/probes/' + studyname + '_selected.h5', mode="r")
14+
else:
15+
probes = pd.read_hdf(hasepath + '/probes/' + studyname + '.h5', mode="r")
16+
print(probes.shape)
17+
18+
if os.path.exists(hasepath + '/probes/' + studyname + '_hash_table.csv.gz'):
19+
hashtable = pd.read_csv(hasepath + '/probes/' + studyname + '_hash_table.csv.gz', compression="gzip", sep='\t')
20+
else:
21+
hashtable = pd.read_csv(hasepath + '/probes/' + studyname + '_hash_table.csv', sep='\t')
22+
23+
hashtable['allele1'] = hashtable['keys']
24+
unhashed_probes = probes.merge(hashtable, on='allele1', how="left")
25+
unhashed_probes = unhashed_probes.drop(columns=["keys", "allele1"])
26+
unhashed_probes = unhashed_probes.rename(columns={'allele': 'allele1'})
27+
28+
# reload hashtable for other allele
29+
30+
if os.path.exists(hasepath + '/probes/' + studyname + '_hash_table.csv.gz'):
31+
hashtable = pd.read_csv(hasepath + '/probes/' + studyname + '_hash_table.csv.gz', compression="gzip", sep='\t')
32+
else:
33+
hashtable = pd.read_csv(hasepath + '/probes/' + studyname + '_hash_table.csv', sep='\t')
34+
35+
hashtable['allele2'] = hashtable['keys']
36+
unhashed_probes = unhashed_probes.merge(hashtable, on='allele2', how="left")
37+
unhashed_probes = unhashed_probes.drop(columns=["keys", "allele2"])
38+
unhashed_probes = unhashed_probes.rename(columns={'allele': 'allele2'})
39+
40+
# clean
41+
annovar_input = unhashed_probes.drop(columns=["ID", "distance"])
42+
annovar_input["bp2"] = annovar_input["bp"]
43+
annovar_input["index_col"] = annovar_input.index
44+
annovar_input = annovar_input[['CHR', 'bp', "bp2", "allele1", "allele2", "index_col"]]
45+
46+
# print('Shape', annovar_input.shape)
47+
# if args.variants is None:
48+
# pass
49+
# else:
50+
# used_indices = pd.read_csv(args.variants, header=None)
51+
# used_indices = used_indices.index.values[used_indices.values.flatten()]
52+
# annovar_input = annovar_input.loc[annovar_input['index_col'].isin(used_indices)]
53+
# annovar_input['index_col'] = np.arange(len(annovar_input)) # after splitting out the unused variants the numbering needs to be reset to match the genotype matrix
54+
55+
print('Number of variants', annovar_input.shape)
56+
57+
annovar_input_path = savepath + '/annovar_input_' + studyname + '.csv'
58+
annovar_input.to_csv(annovar_input_path, sep="\t", index=False, header=False)
59+
60+
print('\n')
61+
print('Annovar input files ready \n')
62+
print("Install annovar: https://doc-openbio.readthedocs.io/projects/annovar/en/latest/user-guide/download/")
63+
print("Navigate to annovar, e.g cd /home/charlesdarwin/annovar/")
64+
print("Update annovar:\n perl annotate_variation.pl -buildver hg19 -downdb -webfrom annovar refGene humandb/")
65+
print("Run:\n perl annotate_variation.pl -geneanno -dbtype refGene -buildver hg19 " + str(
66+
savepath) + "/annovar_input_" + str(studyname) + ".csv humandb --outfile " + str(savepath) + "/" + str(
67+
studyname) + "_RefGene")
68+
print('\n')
69+
print(
70+
'After obtaining the Annovar annotations, run topology create_gene_network to get the topology file for the SNPs-gene-output network:')
71+
72+
73+
def Create_gene_network_topology(args):
74+
datapath = args.path + '/'
75+
studyname = args.study_name
76+
savepath = args.out + '/'
77+
78+
print(args.study_name)
79+
80+
gene_annotation = pd.read_csv(datapath + str(studyname) + "_RefGene.variant_function", sep='\t', header=None)
81+
gene_annotation.columns = ['into/exonic', 'gene', 'chr', 'bps', 'bpe', "mutation1", "mutation2", 'index_col']
82+
gene_annotation['gene'] = gene_annotation['gene'].str.replace(r"\,.*", "")
83+
# gene_annotation['dist'] = gene_annotation['gene'].str.extract(r"(?<=dist\=)(.*)(?=\))")
84+
gene_annotation['gene'] = gene_annotation['gene'].str.replace(r"\(.*\)", "")
85+
gene_annotation['gene'] = gene_annotation['gene'].str.replace(r"\(.*", "")
86+
gene_annotation['gene'] = gene_annotation['gene'].str.replace(r"\;.*", "")
87+
gene_annotation = gene_annotation[(gene_annotation['gene'] != "NONE")]
88+
gene_annotation = gene_annotation.dropna()
89+
90+
gene_list = gene_annotation.drop_duplicates("gene")
91+
gene_list = gene_list.sort_values(by=["chr", "bps"], ascending=[True, True])
92+
gene_list["gene_id"] = np.arange(len(gene_list))
93+
gene_list = gene_list[["gene", "gene_id"]]
94+
95+
gene_annotation = gene_annotation.merge(gene_list, on="gene")
96+
gene_annotation = gene_annotation.sort_values(by="index_col", ascending=True)
97+
98+
gene_annotation = gene_annotation.assign(
99+
chrbp='chr' + gene_annotation.chr.astype(str) + ':' + gene_annotation.bps.astype(str))
100+
gene_annotation.to_csv(savepath + "/gene_network_description.csv")
101+
102+
topology = gene_annotation[["chr", "index_col", "chrbp", "gene_id", "gene"]]
103+
print(topology['index_col'].max())
104+
topology.columns = ['chr', 'layer0_node', 'layer0_name', 'layer1_node', 'layer1_name']
105+
106+
107+
topology.to_csv(savepath + "/topology.csv")
108+
109+
print('Topology file saved:', savepath + "/topology.csv")
110+
111+
112+
def topology(args):
113+
if args.type == 'create_annovar_input':
114+
Create_Annovar_input(args)
115+
elif args.type == 'create_gene_network':
116+
Create_gene_network_topology(args)
117+
else:
118+
print("invalid type:", args.type)
119+
exit()

GenNet_utils/Utility_functions.py

+4
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,10 @@ def create_importance_csv(datapath, model, masks):
131131
coordinate_list = []
132132
for i, mask in zip(np.arange(len(masks)), masks):
133133
coordinates = pd.DataFrame([])
134+
135+
if (i == 0):
136+
if 'chr' in network_csv.columns:
137+
coordinates["chr"] = network_csv["chr"]
134138
coordinates["node_layer_" + str(i)] = mask.row
135139
coordinates["node_layer_" + str(i + 1)] = mask.col
136140
coordinates = coordinates.sort_values("node_layer_" + str(i), ascending=True)

0 commit comments

Comments
 (0)