1+ import os
2+ import csv
3+ from sqlalchemy import create_engine
4+ from sqlalchemy import text
5+ from constants import Constants
6+ from utils import Utils
7+
# Header lines written at the top of the generated gene files. The GRN
# header is the protein (PPI) header plus one trailing "Regulator" column.
# Plain string literals: there is nothing to interpolate, so no f-prefix.
PROTEIN_GENE_HEADER = 'Gene ID\t Display Gene ID\t Species\t Taxon ID'
GRN_GENE_HEADER = 'Gene ID\t Display Gene ID\t Species\t Taxon ID\t Regulator'
10+
def _get_all_data_from_database_table(database_namespace, table_name):
    """Return every row of one database table.

    The connection URL is read from the ``DB_URL`` environment variable and
    the query is ``SELECT *`` against ``database_namespace.table_name``.

    Args:
        database_namespace: Qualifier placed before the table name in the
            query.
        table_name: Name of the table to read.

    Returns:
        The rows produced by ``fetchall()`` on the query result.

    Raises:
        KeyError: If ``DB_URL`` is not set in the environment.
    """
    engine = create_engine(os.environ['DB_URL'])
    try:
        with engine.connect() as connection:
            # Identifiers cannot be sent as bound parameters, so they are
            # interpolated into the statement; pass trusted names only.
            result_set = connection.execute(
                text(f"SELECT * FROM {database_namespace} .{table_name} ")
            )
            return result_set.fetchall()
    finally:
        # A fresh engine is built on every call; dispose of it so its
        # connection pool does not outlive the query.
        engine.dispose()
16+
def _get_all_db_genes(database_namespace):
    """Load the ``gene`` table of a namespace into a dictionary.

    Every row is keyed by ``(row[0], row[3])``. The stored value is
    ``(row[1], row[2])``, with ``row[4]`` appended as a third item when the
    row has more than four columns.

    Args:
        database_namespace: Namespace whose ``gene`` table is read.

    Returns:
        dict built from the table rows as described above.
    """
    def _as_value(record):
        # Rows wider than four columns contribute their fifth column too.
        if len(record) > 4:
            return (record[1], record[2], record[4])
        return (record[1], record[2])

    rows = _get_all_data_from_database_table(database_namespace, "gene")
    return {(record[0], record[3]): _as_value(record) for record in rows}
28+
def _get_all_db_grn_genes():
    """Return the gene dictionary read from the GRN database namespace."""
    grn_namespace = Constants.GRN_DATABASE_NAMESPACE
    return _get_all_db_genes(grn_namespace)
31+
def _get_all_db_ppi_genes():
    """Return the gene dictionary read from the PPI database namespace."""
    ppi_namespace = Constants.PPI_DATABASE_NAMESPACE
    return _get_all_db_genes(ppi_namespace)
34+
def _get_all_genes():
    """Merge the genes of both databases with the union gene file.

    Steps, in order:

    1. Start from the GRN database genes.
    2. Add every PPI database gene the GRN set lacks, with ``False`` as its
       regulator value.
    3. Build the union file from the PPI and GRN gene sources, then apply
       its rows (the first row is skipped as a header): unknown keys are
       added; an existing entry is replaced when the file's display gene ID
       differs from the stored one and is not the string ``"None"``.

    Returns:
        dict keyed by ``(gene_id, taxon_id)``. Entries added or replaced
        here hold ``display_gene_id, species, regulator``; untouched GRN
        entries keep whatever ``_get_all_db_grn_genes`` produced.
    """
    db_grn_genes = _get_all_db_grn_genes()
    db_ppi_genes = _get_all_db_ppi_genes()

    # exist_ok avoids the race between a separate exists() check and the
    # directory creation.
    os.makedirs('union-gene-data', exist_ok=True)
    # NOTE(review): the union file is rebuilt on every call, not only when
    # the directory had to be created - confirm this is the intent.
    Utils.create_union_file(
        [Constants.PPI_GENE_SOURCE, Constants.GRN_GENE_SOURCE],
        Constants.GENE_DATA_DIRECTORY,
    )

    genes = db_grn_genes  # same dict object, extended in place below
    for gene, ppi_value in db_ppi_genes.items():
        if gene not in genes:
            # Only the first two fields are used; slicing keeps this working
            # when the PPI record carries an extra trailing field.
            display_gene_id, species = ppi_value[:2]
            genes[gene] = [display_gene_id, species, False]

    # The file is only read here, so it is opened read-only.
    with open(Constants.GENE_DATA_DIRECTORY, 'r', encoding="UTF-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; indexing it would
                # raise IndexError.
                continue
            fields = row[0].split('\t ')
            gene_id = fields[0]
            display_gene_id = fields[1]
            species = fields[2]
            taxon_id = fields[3]
            regulator = fields[4].capitalize()
            key = (gene_id, taxon_id)
            value = (display_gene_id, species, regulator)
            if key not in genes:
                genes[key] = value
            elif genes[key][0] != display_gene_id and display_gene_id != "None":
                # The file wins over the database unless it has no real
                # display ID to offer.
                genes[key] = value
    return genes
69+
70+
def get_all_proteins():
    """Load the PPI ``protein`` table into a dictionary.

    Returns:
        dict mapping ``(row[0], row[5])`` to
        ``(row[1], row[2], row[3], row[4])`` for every table row.
    """
    rows = _get_all_data_from_database_table(
        Constants.PPI_DATABASE_NAMESPACE, "protein"
    )
    return {
        (record[0], record[5]): (record[1], record[2], record[3], record[4])
        for record in rows
    }
79+
def processing_grn_gene_file():
    """Return ``(missing_genes, genes_to_update)`` for the GRN database.

    Values in both dictionaries include the regulator field.
    """
    grn_db_genes = _get_all_db_grn_genes()
    return _processing_gene_file(grn_db_genes, is_protein=False)
82+
def processing_ppi_gene_file():
    """Return ``(missing_genes, genes_to_update)`` for the PPI database.

    Relies on the default ``is_protein=True`` of ``_processing_gene_file``.
    """
    ppi_db_genes = _get_all_db_ppi_genes()
    return _processing_gene_file(ppi_db_genes)
85+
def _processing_gene_file(db_genes, is_protein=True):
    """Compare the merged gene set with the genes of one database.

    Args:
        db_genes: dict of the genes currently stored in the database, keyed
            the same way as the result of ``_get_all_genes``.
        is_protein: When true, reported values are
            ``(display_gene_id, species)``; otherwise the regulator is
            appended as a third item.

    Returns:
        ``(missing_genes, genes_to_update)``. The first holds the merged
        genes absent from ``db_genes``. The second holds the genes whose
        stored display ID differs from the merged one, provided the stored
        ID is not the string ``"None"``.
    """
    print('Processing gene')
    missing_genes = {}
    genes_to_update = {}
    merged = _get_all_genes()
    for gene_key, (display_gene_id, species, regulator) in merged.items():
        if is_protein:
            reported = (display_gene_id, species)
        else:
            reported = (display_gene_id, species, regulator)

        if gene_key not in db_genes:
            missing_genes[gene_key] = reported
            continue

        stored_display_id = db_genes[gene_key][0]
        if stored_display_id != display_gene_id and stored_display_id != "None":
            genes_to_update[gene_key] = reported
    return missing_genes, genes_to_update
107+
def processing_protein_file(file_path, db_proteins):
    """Compare a protein data file with the proteins already stored.

    The first row of the file is skipped as a header and blank lines are
    ignored. Every other row supplies six fields: standard name, gene
    systematic name, length, molecular weight, pI and taxon ID. A numeric
    field spelled ``"None"`` is read as ``0``.

    Args:
        file_path: Path of the UTF-8 text file to read.
        db_proteins: dict mapping ``(standard_name, taxon_id)`` to
            ``(gene_systematic_name, length, molecular_weight, pi)``.

    Returns:
        ``(ppi_missing_proteins, ppi_proteins_to_update)``: rows whose key
        is absent from ``db_proteins``, and rows whose key is present but
        whose value differs. Both use the same key/value layout as
        ``db_proteins``.

    Raises:
        ValueError: If a numeric field is neither a number nor ``"None"``.
        IndexError: If a non-blank row has fewer than six fields.
    """
    def _to_number(raw):
        # A missing measurement is spelled as the string "None" in the file.
        return float(raw) if raw != "None" else 0

    print(f'Processing file {file_path} ')
    ppi_missing_proteins = {}
    ppi_proteins_to_update = {}
    # Nothing is written back, so the file is opened read-only.
    with open(file_path, 'r', encoding="UTF-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line (for instance a
                # trailing newline); indexing it would raise IndexError.
                continue
            fields = row[0].split('\t ')
            standard_name = fields[0]
            gene_systematic_name = fields[1]
            length = _to_number(fields[2])
            molecular_weight = _to_number(fields[3])
            pi = _to_number(fields[4])
            taxon_id = fields[5]
            key = (standard_name, taxon_id)
            value = (gene_systematic_name, length, molecular_weight, pi)
            if key not in db_proteins:
                ppi_missing_proteins[key] = value
            elif db_proteins[key] != value:
                ppi_proteins_to_update[key] = value
    return ppi_missing_proteins, ppi_proteins_to_update
132+
def create_grn_gene_file(file_path, data):
    """Write ``data`` to ``file_path`` as a GRN gene file.

    Uses ``GRN_GENE_HEADER`` as the header line and includes the regulator
    column in every record.
    """
    _create_gene_file(
        file_path,
        headers=GRN_GENE_HEADER,
        data=data,
        is_protein=False,
    )
135+
def create_ppi_gene_file(file_path, data):
    """Write ``data`` to ``file_path`` as a PPI gene file.

    Uses ``PROTEIN_GENE_HEADER`` as the header line and the default
    ``is_protein=True`` record layout of ``_create_gene_file``.
    """
    _create_gene_file(file_path, headers=PROTEIN_GENE_HEADER, data=data)
138+
def _create_gene_file(file_path, headers, data, is_protein=True):
    """Write gene records to ``file_path`` beneath a header line.

    Args:
        file_path: Destination path; an existing file is overwritten.
        headers: Header text written as the first line.
        data: dict mapping ``(gene_id, taxon_id)`` to a sequence that starts
            with ``display_gene_id, species`` and, for GRN output, has the
            regulator as its third item.
        is_protein: When true each record has four columns (gene ID,
            display gene ID, species, taxon ID); otherwise the regulator is
            written as a fifth column.
    """
    print(f'Creating {file_path} \n ')
    # The context manager closes the file even when a write raises, which
    # the previous open()/close() pair did not guarantee.
    with open(file_path, 'w') as gene_file:
        gene_file.write(f'{headers} \n ')
        for gene, fields in data.items():
            if is_protein:
                gene_file.write(
                    f'{gene[0]} \t {fields[0]} \t {fields[1]} \t {gene[1]} \n '
                )
            else:
                gene_file.write(
                    f'{gene[0]} \t {fields[0]} \t {fields[1]} \t {gene[1]} \t {fields[2]} \n '
                )
149+
def create_ppi_protein_file(file_path, data):
    """Write protein records to ``file_path`` beneath a header line.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: dict mapping ``(standard_name, taxon_id)`` to
            ``(gene_systematic_name, length, molecular_weight, pi)``.
            Columns are written in the header's order, with the taxon ID
            last.
    """
    print(f'Creating {file_path} \n ')
    headers = 'Standard Name\t Gene Systematic Name\t Length\t Molecular Weight\t PI\t Taxon ID'
    # The context manager closes the file even when a write raises, which
    # the previous open()/close() pair did not guarantee.
    with open(file_path, 'w') as protein_file:
        protein_file.write(f'{headers} \n ')
        for protein, fields in data.items():
            protein_file.write(
                f'{protein[0]} \t {fields[0]} \t {fields[1]} \t {fields[2]} \t {fields[3]} \t {protein[1]} \n '
            )
158+
# Compare the source data with both databases. Each call returns a pair of
# dictionaries: the records missing from the database and the records that
# need updating.
ppi_missing_genes, ppi_genes_to_update = processing_ppi_gene_file()
grn_missing_genes, grn_genes_to_update = processing_grn_gene_file()
ppi_missing_proteins, ppi_proteins_to_update = processing_protein_file(
    Constants.PPI_PROTEIN_TABLE_DATA_DIRECTORY,
    get_all_proteins(),
)

# Write one "missing" and one "update" file per record kind.
create_grn_gene_file(
    Constants.GRN_MISSING_GENE_DIRECTORY,
    grn_missing_genes,
)
create_grn_gene_file(
    Constants.GRN_UPDATE_GENE_DIRECTORY,
    grn_genes_to_update,
)
create_ppi_gene_file(
    Constants.PPI_MISSING_GENE_DIRECTORY,
    ppi_missing_genes,
)
create_ppi_gene_file(
    Constants.PPI_UPDATE_GENE_DIRECTORY,
    ppi_genes_to_update,
)
create_ppi_protein_file(
    Constants.PPI_MISSING_PROTEIN_DIRECTORY,
    ppi_missing_proteins,
)
create_ppi_protein_file(
    Constants.PPI_UPDATE_PROTEIN_DIRECTORY,
    ppi_proteins_to_update,
)
0 commit comments