-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilters.py
More file actions
138 lines (110 loc) · 5.2 KB
/
filters.py
File metadata and controls
138 lines (110 loc) · 5.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import utils
def apply_tissue_filter(config_file, valid_proteins, cutoff):
    """
    Restrict the proteins of each host to those expressed in relevant tissues

    For every host in the configuration that declares a 'tissues_url', the
    tissue expression file is downloaded into the 'data' directory and the
    host entry of valid_proteins is replaced (in place) by the proteins that
    passed the tissue filter. Hosts without a 'tissues_url' are left untouched.

    :param str config_file: path to the configuration file
    :param dict valid_proteins: dictionary with the valid proteins of each host, keyed by taxid
    :param float cutoff: minimum confidence score accepted
    :return tissues: dictionary with the tissue annotations of the retained
        proteins, merged over all filtered hosts (empty if no host declares
        a 'tissues_url')
    """
    hosts = utils.read_config(filepath=config_file, field='hosts')
    tissue_mapping = utils.read_config(filepath=config_file, field='tissues')
    # Start from an empty result: previously the name was only bound inside
    # the 'tissues_url' branch, so a configuration without any such host
    # raised UnboundLocalError on return.
    tissues = {}
    for taxid in hosts:
        host = hosts[taxid]
        proteins = valid_proteins[taxid]
        if 'tissues_url' in host:
            filename = utils.download_file(url=host['tissues_url'], data_dir='data')
            host_tissues, proteins = get_tissues(
                config_file, filename, proteins, cutoff, tissue_mapping
            )
            # Accumulate instead of overwriting so that earlier hosts are
            # not discarded when several hosts are filtered.
            tissues.update(host_tissues)
            valid_proteins[taxid] = proteins
    return tissues
def get_tissues(config_file, tissues_file, valid_proteins, cutoff, mapping):
    """
    Get protein tissue expression for relevant tissues in the lifecycle of the
    studied parasites

    The tissues file is read as tab-separated text with one header line. The
    protein identifier is taken from column 0, the tissue identifier from
    column 2 and the confidence score from column 6.
    NOTE: protein identifiers are prefixed with the hard-coded string '9606.'
    (presumably the human taxid - confirm before using with other hosts).

    :param str config_file: path to the configuration file
    :param str tissues_file: path to file with tissue expression (tissues.jensenlab.org)
    :param dict valid_proteins: dictionary with annotations in valid proteins, keyed by protein id
    :param float cutoff: minimum confidence score accepted (tissues.jensenlab.org)
    :param dict mapping: dictionary translating tissue identifiers into the values stored in the result
    :return (tissues, filters): tissues is a dictionary with the list of mapped
        tissues of each retained protein; filters is the subset of
        valid_proteins restricted to the retained proteins
    """
    parasites = utils.read_config(filepath=config_file, field='parasites')
    valid_tissues = set()
    for parasite in parasites:
        t = parasites[parasite]['tissues']
        if isinstance(t, str):
            # A single identifier must be added whole; set.update() would
            # split the string into individual characters.
            valid_tissues.add(t)
        else:
            valid_tissues.update(t)

    tissues = {}
    filters = {}
    with open(tissues_file, 'r') as f:
        next(f, None)  # skip the header line
        for line in f:
            if not line.strip():
                # Blank lines (e.g. at the end of the file) carry no record
                continue
            data = line.rstrip().split('\t')
            protein = "9606."+data[0]
            tissue = data[2]
            score = float(data[6])
            if protein in valid_proteins and score >= cutoff and tissue in valid_tissues:
                tissues.setdefault(protein, []).append(mapping[tissue])
                filters[protein] = valid_proteins[protein]
    return tissues, filters
def apply_compartment_filter(config_file, valid_proteins, cutoff):
    """
    Restrict the proteins of each host to those found in relevant compartments

    For every host in the configuration that declares a 'compartments_url',
    the compartments file is downloaded into the 'data' directory and the
    host entry of valid_proteins is replaced (in place) by the proteins that
    passed the compartment filter. Hosts without a 'compartments_url' are
    left untouched.

    :param str config_file: path to the configuration file
    :param dict valid_proteins: dictionary with the valid proteins of each host, keyed by taxid
    :param float cutoff: minimum confidence score accepted
    :return compartments: dictionary with the compartment annotations of the
        retained proteins, merged over all filtered hosts (empty if no host
        declares a 'compartments_url')
    """
    hosts = utils.read_config(filepath=config_file, field='hosts')
    # Start from an empty result: previously the name was only bound inside
    # the 'compartments_url' branch, so a configuration without any such host
    # raised UnboundLocalError on return.
    compartments = {}
    for taxid in hosts:
        host = hosts[taxid]
        proteins = valid_proteins[taxid]
        if 'compartments_url' in host:
            filename = utils.download_file(url=host['compartments_url'], data_dir='data')
            host_compartments, proteins = get_compartments(
                config_file, filename, proteins, cutoff
            )
            # Accumulate instead of overwriting so that earlier hosts are
            # not discarded when several hosts are filtered.
            compartments.update(host_compartments)
            valid_proteins[taxid] = proteins
    return compartments
def get_compartments(config_file, compartments_file, valid_proteins, cutoff):
    """
    Get protein cellular compartment expression relevant in the lifecycle of the
    studied parasites

    The compartments file is read as tab-separated text with one header line.
    The protein identifier is taken from column 0, the compartment identifier
    from column 2 and the confidence score from column 4. Parasites without a
    'compartments' entry in the configuration fall back to 'GO:0005886'.
    NOTE: protein identifiers are prefixed with the hard-coded string '9606.'
    (presumably the human taxid - confirm before using with other hosts).

    :param str config_file: path to the configuration file
    :param str compartments_file: path to file with cellular compartment expression (compartments.jensenlab.org)
    :param dict valid_proteins: dictionary with annotations in valid proteins, keyed by protein id
    :param float cutoff: minimum confidence score accepted (compartments.jensenlab.org)
    :return (compartments, filters): compartments is a dictionary with the list
        of relevant compartments of each retained protein; filters is the
        subset of valid_proteins restricted to the retained proteins
    """
    parasites = utils.read_config(filepath=config_file, field='parasites')
    valid_compartments = set()
    for parasite in parasites:
        t = parasites[parasite].get('compartments', 'GO:0005886')
        if isinstance(t, str):
            valid_compartments.add(t)
        else:
            # Accept a collection of identifiers as well: set.add() would
            # raise TypeError on a list and store a tuple as a single,
            # never-matching element.
            valid_compartments.update(t)

    compartments = {}
    filters = {}
    with open(compartments_file, 'r') as f:
        next(f, None)  # skip the header line
        for line in f:
            if not line.strip():
                # Blank lines (e.g. at the end of the file) carry no record
                continue
            data = line.rstrip().split('\t')
            protein = "9606."+data[0]
            compartment = data[2]
            score = float(data[4])
            if protein in valid_proteins and score >= cutoff and compartment in valid_compartments:
                compartments.setdefault(protein, []).append(compartment)
                filters[protein] = valid_proteins[protein]
    return compartments, filters
def get_secretome_predictions(config_file, secretome_dir, valid_proteins):
    """
    Drop from the parasite proteins those rejected by the secretome filter
    (intended to keep only secreted or membrane proteins)

    For each parasite in the configuration, the file '<parasite>.fasta' in
    secretome_dir is read and every identifier returned by
    utils.filter_sequences is removed from that parasite's entry in
    valid_proteins. The dictionary is modified in place.

    :param str config_file: path to the configuration file
    :param str secretome_dir: path to the directory where the prediction files are
    :param dict valid_proteins: dictionary with annotations in valid proteins
    :return valid_proteins: the same dictionary, after removing the rejected proteins
    """
    parasite_config = utils.read_config(filepath=config_file, field='parasites')
    for parasite_id in parasite_config:
        fasta_name = str(parasite_id) + '.fasta'
        fasta_path = os.path.join(secretome_dir, fasta_name)
        records = utils.read_fasta(fasta_path)
        annotations = valid_proteins[parasite_id]
        rejected_ids = utils.filter_sequences(records, annotations)
        for protein_id in rejected_ids:
            # pop with a default: identifiers that are not present are ignored
            annotations.pop(protein_id, None)
    return valid_proteins