ArnovanHilten
diff --git a/‎.github/workflows/tests.yml
+1-1 b/‎.github/workflows/tests.yml
+1-1
diff --git a/‎GenNet.py
+63-7 b/‎GenNet.py
+63-7
diff --git a/‎GenNet_utils/Convert.py
+3-3 b/‎GenNet_utils/Convert.py
+3-3
diff --git a/‎GenNet_utils/Convert_topology_npz.py
+48 b/‎GenNet_utils/Convert_topology_npz.py
+48
diff --git a/‎GenNet_utils/Create_network.py
+73-5 b/‎GenNet_utils/Create_network.py
+73-5
diff --git a/‎GenNet_utils/Dataloader.py
+22-19 b/‎GenNet_utils/Dataloader.py
+22-19
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.7", "3.8"]
+        python-version: ["3.7", "3.8", "3.9"]
 
     steps:
       - uses: actions/checkout@v3
 
@@ -6,28 +6,33 @@
 import argparse
 
 sys.path.insert(1, os.path.dirname(os.getcwd()) + "/GenNet_utils/")
-from GenNet_utils.Create_plots import plot
-from GenNet_utils.Train_network import train_classification, train_regression
-from GenNet_utils.Convert import convert
-from GenNet_utils.Topology import topology
 
 
 def main():
     args = ArgumentParser().parse_cmd_args()
 
     if args.mode == 'train':
         if args.problem_type == "classification":
-            train_classification(args)
+            args.regression = False
         elif args.problem_type == "regression":
-            train_regression(args)
+            args.regression = True
         else:
             print('something went wrong invalid problem type', args.problem_type)
+        from GenNet_utils.Train_network import train_model
+        train_model(args)
+        
     elif args.mode == "plot":
+        from GenNet_utils.Create_plots import plot
         plot(args)
     if args.mode == 'convert':
+        from GenNet_utils.Convert import convert
         convert(args)
     if args.mode == "topology":
+        from GenNet_utils.Topology import topology
         topology(args)
+    if args.mode == "interpret":
+        from GenNet_utils.Interpret import interpret
+        interpret(args)
 
 
 class ArgumentParser():
@@ -51,6 +56,9 @@ def __init__(self):
         parser_topology = subparsers.add_parser("topology", help="Create standard topology files")
         self.make_parser_topology(parser_topology)
 
+        parser_interpret = subparsers.add_parser("interpret", help="Post-hoc interpretation analysis on the network")
+        self.make_parser_interpret(parser_interpret)
+
         self.parser = parser
 
     def parse_cmd_args(self):
@@ -239,7 +247,11 @@ def make_parser_train(self, parser_train):
             action='store_true',
             default=False,
             help='Flag for one hot encoding as a first layer in the network')        
-   
+        parser_train.add_argument(
+            "-init_linear",
+            action='store_true',
+            default=False,
+            help='initialize the one-hot encoding for the neural network with a linear assumption')
         return parser_train
 
     def make_parser_plot(self, parser_plot):
@@ -298,5 +310,49 @@ def make_parser_topology(self, parser_topology):
         return parser_topology
 
 
+
+    def make_parser_interpret(self, parser_topology):
+        """Register the command-line options of the "interpret" sub-command.
+
+        NOTE: despite its name, ``parser_topology`` is the sub-parser created for
+        "interpret" (it is what ``__init__`` passes in). The same parser object is
+        returned after the arguments below have been added to it.
+        """
+        # Interpretation method; falls back to 'get_weight_scores' when omitted.
+        parser_topology.add_argument(
+            "-type",
+            default='get_weight_scores', type=str,
+            choices=['get_weight_scores', 'NID', 'RLIPP', 'DFIM',"PathExplain","DeepExplain"],
+            help="choose interpretation method, choice")
+        # The only mandatory option: folder holding the trained network.
+        parser_topology.add_argument(
+            "-resultpath",
+            type=str,
+            required=True,
+            help="Path to the folder with the trained network (resultfolder) ")
+        # No default is given, so args.layer is None unless the flag is passed.
+        parser_topology.add_argument(
+            '-layer',
+            type=int,
+            required=False,
+            help='Select a layer for interpretation only necessary for NID')
+        parser_topology.add_argument(
+            '-num_eval',
+            type=int,
+            required=False,
+            default = 100,
+            help='Select the number of SNPs to eval in DFIM')
+        # start_rank / end_rank bound the ranked variants handled by one process.
+        parser_topology.add_argument(
+            '-start_rank',
+            type=int,
+            required=False,
+            default = 0,
+            help='Multiprocessing, start from Nth ranked important variant')
+        parser_topology.add_argument(
+            '-end_rank',
+            type=int,
+            required=False,
+            default = 0,
+            help='Multiprocessing, stop at Nth ranked important SNP')
+        parser_topology.add_argument(
+            '-num_sample_pat',
+            type=int,
+            required=False,
+            default = 1000,
+            help='Select a number of patients to sample for DFIM')
+        return parser_topology
+
+
 if __name__ == '__main__':
     main()
@@ -75,7 +75,7 @@ def merge_hdf5_hase(args):
     f = tables.open_file(args.outfolder + args.study_name + '_step2_merged_genotype.h5', mode='a')
     for i in tqdm.tqdm(range(number_of_files)):
         gen_tmp = h5py.File(filepath_hase.format(i), 'r')['genotype']
-        f.root.data.append(np.array(np.round(gen_tmp[:, :]), dtype=np.int))
+        f.root.data.append(np.array(np.round(gen_tmp[:, :]), dtype=int))
     f.close()
 
     args.outfolder = args.genotype
@@ -365,7 +365,7 @@ def merge_transpose(args):
         print("chunking is not necessary")
         for job_n in tqdm.tqdm(range(args.n_jobs)):
             gen_tmp = tables.open_file(args.genotype + args.study_name + '_step5_genotype_transposed_' + str(job_n) + '.h5', mode='r')
-            f.root.data.append(np.array(np.round(gen_tmp.root.data[:, :]), dtype=np.int))
+            f.root.data.append(np.array(np.round(gen_tmp.root.data[:, :]), dtype=int))
             gen_tmp.close()
         f.close()
     else:
@@ -375,7 +375,7 @@ def merge_transpose(args):
             for chunckblock in range(int(np.ceil(gen_tmp.root.data.shape[0] / chunk))):
                 begins = chunckblock * chunk
                 tills = min(((chunckblock + 1) * chunk), gen_tmp.root.data.shape[0])
-                f.root.data.append(np.array(np.round(gen_tmp.root.data[begins:tills, :]), dtype=np.int))
+                f.root.data.append(np.array(np.round(gen_tmp.root.data[begins:tills, :]), dtype=int))
             gen_tmp.close()
         f.close()
     print("completed")
 
@@ -0,0 +1,48 @@
+import argparse
+import numpy as np
+import pandas as pd
+import os
+import argparse
+import numpy as np
+import pandas as pd
+import os
+from scipy import sparse
+
+
+def main():
+    """Convert ``topology.csv`` into a sparse SNP-to-gene mask saved with ``save_npz``.
+
+    Command-line arguments:
+    --snp: name of the topology.csv column holding the integer SNP index (matrix row)
+    --gene: name of the topology.csv column holding the integer gene index (matrix column)
+    --direc: (optional) directory containing topology.csv; defaults to the current directory
+    --file_name: (optional) name of the file to save as, defaults to "SNP_gene_mask"
+
+    Exits with a usage error when --direc is given but does not exist.
+    """
+    parser = argparse.ArgumentParser(description="A simple script with command-line arguments")
+    parser.add_argument("--snp", help="Your snp", required=True)
+    parser.add_argument("--gene", help="Your gene", required=True)
+    parser.add_argument("--direc", help="Your Directory", required=False)
+    parser.add_argument("--file_name", help="Your file name", default="SNP_gene_mask", required=False)
+    args = parser.parse_args()
+
+    if args.direc:
+        try:
+            os.chdir(args.direc)
+        except FileNotFoundError:
+            # Abort instead of silently reading a topology.csv from the wrong directory.
+            parser.error(f"Directory '{args.direc}' not found.")
+        print(f"Navigated to directory: {os.getcwd()}")
+
+    topology = pd.read_csv("topology.csv")
+    # One True entry per (SNP, gene) row; builtin bool because np.bool was removed in NumPy 1.24.
+    data = np.ones(len(topology), dtype=bool)
+    coord = (topology[args.snp].values, topology[args.gene].values)
+    SNP_gene_matrix = sparse.coo_matrix((data, coord), shape=(topology[args.snp].max() + 1, topology[args.gene].max() + 1))
+    sparse.save_npz(args.file_name, SNP_gene_matrix)
+
+if __name__ == "__main__":
+    main()
+
+
@@ -90,8 +90,8 @@ def one_hot_input(input_layer):
 def add_covariates(model, input_cov, num_covariates, regression, negative_values_ytrain, mean_ytrain, l1_value, L1_act):
     if num_covariates > 0:
         model = activation_layer(model, regression, negative_values_ytrain)
-        model = K.layers.concatenate([model, input_cov], axis=1)
-        model = K.layers.BatchNormalization(center=False, scale=False)(model)
+        model = K.layers.concatenate([model, input_cov], axis=1, name="concatenate_cov")
+        model = K.layers.BatchNormalization(center=False, scale=False, name="batchnorm_cov")(model)
         model = K.layers.Dense(units=1, name="output_layer_cov",
                        kernel_regularizer=tf.keras.regularizers.l1(l=l1_value),
                        activity_regularizer=K.regularizers.l1(L1_act),
@@ -232,7 +232,7 @@ def create_network_from_csv(datapath,
         model = K.layers.Reshape(input_shape=(inputsize,), target_shape=(inputsize, 1))(input_layer)
 
     for i in range(len(columns) - 1):
-        matrix_ones = np.ones(len(network_csv[[columns[i], columns[i + 1]]]), np.bool)
+        matrix_ones = np.ones(len(network_csv[[columns[i], columns[i + 1]]]), bool)
         matrix_coord = (network_csv[columns[i]].values, network_csv[columns[i + 1]].values)
         if i == 0:
             matrixshape = (inputsize, network_csv[columns[i + 1]].max() + 1)
@@ -331,7 +331,7 @@ def gene_network_multiple_filters(datapath,
         mean_ytrain = 0
         negative_values_ytrain = False
 
-    print("height_multiple_filters with", filters, "filters")
+    print("gene_network_multiple_filters with", filters, "filters")
 
     masks = []
     for npz_path in glob.glob(datapath + '/*.npz'):
@@ -482,4 +482,72 @@ def regression_height(inputsize, num_covariates=2, l1_value=0.001):
     print(model.summary())
 
     return model, masks
-    
+    
+
+
+def remove_batchnorm_model(model, masks, keep_cov = False):
+    """Rebuild `model` without BatchNormalization layers and copy the weights over.
+
+    model: trained Keras model to copy from.
+    masks: one mask per LocallyDirected1D layer, consumed in the order of `model.layers`.
+    keep_cov: when False, every layer whose name contains "_cov" is dropped as well.
+    Returns the newly built model.
+
+    NOTE(review): the rebuild is a single-input chain, so keep_cov=True presumably
+    also needs the covariate input wired in -- confirm before relying on it.
+    """
+    original_model = model
+    # NOTE(review): assumes input_shape is a list whose first entry is the genotype input.
+    inputs = tf.keras.Input(shape=original_model.input_shape[0][1:])
+    x = inputs
+
+    mask_num = 0
+    # layers[0] is skipped; it is replaced by the fresh Input created above.
+    for layer in original_model.layers[1:]:
+        if isinstance(layer, tf.keras.layers.BatchNormalization):
+            continue  # batch normalization is what this function strips out
+        if isinstance(layer, LocallyDirected1D):
+            # LocallyDirected1D has to be handed its connectivity mask again.
+            config = layer.get_config()
+            x = LocallyDirected1D(filters=config['filters'],
+                                  mask=masks[mask_num],
+                                  name=config['name'])(x)
+            mask_num += 1
+        elif "_cov" in layer.name and not keep_cov:
+            continue  # covariate branch is left out
+        else:
+            # Add other layers as they are
+            x = layer.__class__.from_config(layer.get_config())(x)
+
+    # Create the new model
+    new_model = tf.keras.Model(inputs=inputs, outputs=x)
+
+    # Copy weights by layer name. The rebuilt layers keep their original names, whereas
+    # pairing the two layer lists by position misaligns once a "_cov" layer is dropped.
+    for new_layer in new_model.layers:
+        if new_layer.weights:  # the fresh Input and activation layers carry no weights
+            source_layer = original_model.get_layer(new_layer.name)
+            new_layer.set_weights(source_layer.get_weights())
+
+    print(new_model.summary())
+
+    return new_model
+
+
+def remove_cov(model, masks):
+    """Rebuild `model` without its covariate layers and copy the weights over.
+
+    Every layer whose name contains "_cov" is dropped; unlike
+    remove_batchnorm_model, other BatchNormalization layers are kept.
+
+    model: trained Keras model to copy from.
+    masks: one mask per LocallyDirected1D layer, consumed in the order of `model.layers`.
+    Returns the newly built model.
+    """
+    original_model = model
+    # NOTE(review): assumes input_shape is a list whose first entry is the genotype input.
+    inputs = tf.keras.Input(shape=original_model.input_shape[0][1:])
+    x = inputs
+
+    mask_num = 0
+    # layers[0] is skipped; it is replaced by the fresh Input created above.
+    for layer in original_model.layers[1:]:
+        if isinstance(layer, LocallyDirected1D):
+            # LocallyDirected1D has to be handed its connectivity mask again.
+            config = layer.get_config()
+            x = LocallyDirected1D(filters=config['filters'],
+                                  mask=masks[mask_num],
+                                  name=config['name'])(x)
+            mask_num += 1
+        elif "_cov" in layer.name:
+            continue  # covariate branch is left out
+        else:
+            # Add other layers as they are
+            x = layer.__class__.from_config(layer.get_config())(x)
+
+    # Create the new model
+    new_model = tf.keras.Model(inputs=inputs, outputs=x)
+
+    # Copy weights by layer name. The rebuilt layers keep their original names, whereas
+    # pairing the two layer lists by position misaligns because the original list still
+    # contains the dropped "_cov" layers.
+    for new_layer in new_model.layers:
+        if new_layer.weights:  # the fresh Input and activation layers carry no weights
+            source_layer = original_model.get_layer(new_layer.name)
+            new_layer.set_weights(source_layer.get_weights())
+
+    print(new_model.summary())
+
+    return new_model
@@ -92,19 +92,6 @@ def get_labels(datapath, set_number):
     return ybatch
 
 
-def get_data(datapath, genotype_path, set_number):
-    print("depreciated")
-    groundtruth = pd.read_csv(datapath + "/subjects.csv")
-    h5file = tables.open_file(genotype_path + "genotype.h5", "r")
-    groundtruth = groundtruth[groundtruth["set"] == set_number]
-    xbatchid = np.array(groundtruth["genotype_row"].values, dtype=np.int64)
-    xbatch = h5file.root.data[xbatchid, :]
-    ybatch = np.reshape(np.array(groundtruth["labels"].values), (-1, 1))
-    h5file.close()
-    return xbatch, ybatch
-
-
-
 
 
 class TrainDataGenerator(K.utils.Sequence):
@@ -160,7 +147,7 @@ def single_genotype_matrix(self, idx):
         ybatch = self.training_subjects["labels"].iloc[batchindexes]
         xcov = self.training_subjects.filter(like="cov_").iloc[batchindexes]
         xcov = xcov.values
-        xbatchid = np.array(self.training_subjects["genotype_row"].iloc[batchindexes], dtype=np.int64)
+        xbatchid = np.array(self.training_subjects["genotype_row"].iloc[batchindexes], dtype=int)
         xbatch = genotype_hdf.root.data[xbatchid, :] 
         xbatch = self.if_one_hot(xbatch)
         ybatch = np.reshape(np.array(ybatch), (-1, 1))
@@ -181,7 +168,7 @@ def multi_genotype_matrix(self, idx):
         for i in subjects_current_batch["chunk_id"].unique():
             genotype_hdf = tables.open_file(self.genotype_path + "/" + str(i) + self.h5filenames + ".h5", "r")
             subjects_current_chunk = subjects_current_batch[subjects_current_batch["chunk_id"] == i]
-            xbatchid = np.array(subjects_current_chunk["genotype_row"].values, dtype=np.int64)
+            xbatchid = np.array(subjects_current_chunk["genotype_row"].values, dtype=int)
             if len(xbatchid) > 1:
                 pass
             else:
@@ -252,21 +239,20 @@ def if_one_hot(self, xbatch):
             else:
                 print("unexpected shape!")   
         return xbatch
-    
+
     def single_genotype_matrix(self, idx):
         genotype_hdf = tables.open_file(self.genotype_path + "/genotype.h5", "r")
         ybatch = self.eval_subjects["labels"].iloc[idx * self.batch_size:((idx + 1) * self.batch_size)]
         xcov = self.eval_subjects.filter(like="cov_").iloc[idx * self.batch_size:((idx + 1) * self.batch_size)]
         xcov = xcov.values
         xbatchid = np.array(self.eval_subjects["genotype_row"].iloc[idx * self.batch_size:((idx + 1) * self.batch_size)],
-                            dtype=np.int64)
+                            dtype=int)
         xbatch = genotype_hdf.root.data[xbatchid, :]  
         xbatch = self.if_one_hot(xbatch)
         ybatch = np.reshape(np.array(ybatch), (-1, 1))
         genotype_hdf.close()
         return [xbatch, xcov], ybatch
 
-
     def multi_genotype_matrix(self, idx):      
         subjects_current_batch = self.eval_subjects.iloc[idx * self.batch_size:((idx + 1) * self.batch_size)]
         subjects_current_batch["batch_index"] = np.arange(subjects_current_batch.shape[0])
@@ -276,7 +262,7 @@ def multi_genotype_matrix(self, idx):
         for i in subjects_current_batch["chunk_id"].unique():
             genotype_hdf = tables.open_file(self.genotype_path + "/" + str(i) + self.h5filenames + ".h5", "r")
             subjects_current_chunk = subjects_current_batch[subjects_current_batch["chunk_id"] == i]
-            xbatchid = np.array(subjects_current_chunk["genotype_row"].values, dtype=np.int64)
+            xbatchid = np.array(subjects_current_chunk["genotype_row"].values, dtype=int)
             xbatch[subjects_current_chunk["batch_index"].values, :] = genotype_hdf.root.data[xbatchid, :]
             genotype_hdf.close()
 
@@ -286,5 +272,22 @@ def multi_genotype_matrix(self, idx):
         return [xbatch, xcov], ybatch
 
 
+    def get_data(self, sample_pat=0):
+        """Load the evaluation subjects in one go from ``genotype.h5``.
+
+        sample_pat: when > 0, ``self.eval_subjects`` is first replaced by a random
+            sample of that many subjects (fixed ``random_state=1``); the replacement
+            persists on the generator after the call.
+        Returns ``[xbatch, xcov], ybatch`` with one row per (sampled) subject.
+        """
+        if sample_pat > 0:
+            self.eval_subjects = self.eval_subjects.sample(n=sample_pat, random_state=1)
+        subjects = self.eval_subjects
+
+        # Labels are read after sampling so they stay aligned with the genotype rows.
+        ybatch = np.reshape(np.array(subjects["labels"]), (-1, 1))
+        xcov = subjects.filter(like="cov_").values
+        xbatchid = np.array(subjects["genotype_row"].values, dtype=int)
+
+        genotype_hdf = tables.open_file(self.genotype_path + "/genotype.h5", "r")
+        try:
+            xbatch = genotype_hdf.root.data[xbatchid, ...]
+        finally:
+            # Release the HDF5 handle even if the read fails.
+            genotype_hdf.close()
+        xbatch = self.if_one_hot(xbatch)
+        return [xbatch, xcov], ybatch