Skip to content

Commit 1f2a2a6

Browse files
authored
Add files via upload
1 parent cd25e2f commit 1f2a2a6

1 file changed

Lines changed: 202 additions & 0 deletions

File tree

scripts/filter.py

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
"""
2+
Filtering GDB-13
3+
Authors: Robert Pollice, Akshat Nigam
4+
Date: Sep. 2020
5+
"""
6+
7+
# SETTING UP PYTON ENVIRONMENT
8+
# Load modules
9+
import numpy as np
10+
import pandas as pd
11+
import rdkit as rd
12+
import rdkit.Chem as rdc
13+
import rdkit.Chem.Descriptors as rdcd
14+
import rdkit.Chem.rdMolDescriptors as rdcmd
15+
import rdkit.Chem.Lipinski as rdcl
16+
import rdkit.Chem.rdmolops as rdcmo
17+
import argparse as ap
18+
import pathlib as pl
19+
cwd = pl.Path.cwd() # define current working directory
20+
21+
def smiles_to_mol(smiles):
22+
"""
23+
Convert SMILES to mol object using RDKit
24+
"""
25+
try:
26+
mol = rdc.MolFromSmiles(smiles)
27+
except:
28+
mol = None
29+
return mol
30+
31+
def maximum_ring_size(mol):
32+
"""
33+
Calculate maximum ring size of molecule
34+
"""
35+
cycles = mol.GetRingInfo().AtomRings()
36+
if len(cycles) == 0:
37+
maximum_ring_size = 0
38+
else:
39+
maximum_ring_size = max([len(ci) for ci in cycles])
40+
return maximum_ring_size
41+
42+
def minimum_ring_size(mol):
43+
"""
44+
Calculate minimum ring size of molecule
45+
"""
46+
cycles = mol.GetRingInfo().AtomRings()
47+
if len(cycles) == 0:
48+
minimum_ring_size = 0
49+
else:
50+
minimum_ring_size = min([len(ci) for ci in cycles])
51+
return minimum_ring_size
52+
53+
def substructure_violations(mol):
54+
"""
55+
Check for substructure violates
56+
Return True: contains a substructure violation
57+
Return False: No substructure violation
58+
"""
59+
violation = False
60+
# filter used for GDB13 filtering
61+
forbidden_fragments = ['[Cl,Br,I]', '*=*=*', '*#*', '[O,o,S,s]~[O,o,S,s]', '[N,n,O,o,S,s]~[N,n,O,o,S,s]~[N,n,O,o,S,s]', '[C,c]~N=,:[O,o,S,s;!R]', '[N,n,O,o,S,s]~[N,n,O,o,S,s]~[C,c]=,:[O,o,S,s,N,n;!R]', '*=[NH]', '*=N-[*;!R]', '*~[N,n,O,o,S,s]-[N,n,O,o,S,s;!R]', '*-[CH1]-*', '*-[CH2]-*', '*-[CH3]']
62+
63+
for ni in range(len(forbidden_fragments)):
64+
65+
if mol.HasSubstructMatch(rdc.MolFromSmarts(forbidden_fragments[ni])) == True:
66+
violation = True
67+
break
68+
else:
69+
continue
70+
71+
return violation
72+
73+
def aromaticity_degree(mol):
74+
"""
75+
Compute the percentage of non-hydrogen atoms in a molecule that are aromatic
76+
"""
77+
atoms = mol.GetAtoms()
78+
atom_number = rdcl.HeavyAtomCount(mol)
79+
aromaticity_count = 0.
80+
81+
for ai in atoms:
82+
if ai.GetAtomicNum() != 1:
83+
if ai.GetIsAromatic() == True:
84+
aromaticity_count += 1.
85+
86+
degree = aromaticity_count / atom_number
87+
88+
return degree
89+
90+
def conjugation_degree(mol):
91+
"""
92+
Compute the percentage of bonds between non-hydrogen atoms in a molecule that are conjugated
93+
"""
94+
bonds = mol.GetBonds()
95+
bond_number = 0.
96+
conjugation_count = 0.
97+
98+
for bi in bonds:
99+
a1 = bi.GetBeginAtom()
100+
a2 = bi.GetEndAtom()
101+
if (a1.GetAtomicNum() != 1) and (a2.GetAtomicNum() != 1):
102+
bond_number += 1.
103+
if bi.GetIsConjugated() == True:
104+
conjugation_count += 1.
105+
106+
degree = conjugation_count / bond_number
107+
108+
return degree
109+
110+
def main():
111+
# parse arguments
112+
parser = ap.ArgumentParser()
113+
parser.add_argument('data', help="CSV file of SMILES. Delimiter should be space and it should have no header.", type=str)
114+
parser.add_argument('--removed', help="Wheter or not to save list of removed SMILES. '0', do not save, or '1', save. Default is '0'.", type=str, choices=['0', '1'], default='0')
115+
arguments = parser.parse_args()
116+
117+
# Generate output name
118+
output = arguments.data.split('.')
119+
output[-2] += '_filtered'
120+
output = '.'.join(output)
121+
output_removed = arguments.data.split('.')
122+
output_removed[-2] += '_removed'
123+
output_removed = '.'.join(output_removed)
124+
125+
# Load data
126+
data = pd.read_csv(arguments.data, delimiter=' ', usecols=[0, 1], names=['SMILES','NUMBER'], skiprows=0)
127+
print('Original Data: ' + str(len(data.index)))
128+
129+
# Generate mol objects from smiles and remove compounds that returned erros
130+
data['MOL'] = data['SMILES'].apply(smiles_to_mol)
131+
new_data = data[data['MOL'] != None]
132+
data = new_data.copy()
133+
data['CSMILES'] = data['MOL'].apply(lambda x: rdc.MolToSmiles(x, isomericSmiles=False, canonical=True))
134+
print('RDKit Converted: ' + str(len(data.index)))
135+
136+
# Added after GDB-13 was filtered to get rid charged molecules
137+
data['CHARGE'] = data['MOL'].apply(lambda x: rdcmo.GetFormalCharge(x))
138+
new_data = data[(data['CHARGE'] == 0)]
139+
data = new_data.copy()
140+
print('Filtered by molecular charge: ' + str(len(data.index)))
141+
142+
# Added after GDB-13 was filtered to get rid radicals
143+
data['RADICALS'] = data['MOL'].apply(lambda x: rdcd.NumRadicalElectrons(x))
144+
new_data = data[(data['RADICALS'] == 0)]
145+
data = new_data.copy()
146+
print('Filtered by number of radicals: ' + str(len(data.index)))
147+
148+
# Filter by bridgehead atoms
149+
# Note: filters are ordered by increasing timing requirements
150+
data['BRIDGEHEAD'] = data['MOL'].apply(lambda x: rdcmd.CalcNumBridgeheadAtoms(x))
151+
new_data = data[data['BRIDGEHEAD'] == 0]
152+
data = new_data.copy()
153+
print('Filtered by bridgehead atoms: ' + str(len(data.index)))
154+
155+
# Filter by spiro atoms
156+
data['SPIROATOMS'] = data['MOL'].apply(lambda x: rdcmd.CalcNumSpiroAtoms(x))
157+
new_data = data[data['SPIROATOMS'] == 0]
158+
data = new_data.copy()
159+
print('Filtered by spiro atoms: ' + str(len(data.index)))
160+
161+
# Filter by aromaticity
162+
data['AROMATICITY'] = data['MOL'].apply(lambda x: aromaticity_degree(x))
163+
new_data = data[data['AROMATICITY'] >= 0.50]
164+
data = new_data.copy()
165+
print('Filtered by aromaticity: ' + str(len(data.index)))
166+
167+
# Filter by conjugation
168+
data['CONJUGATION'] = data['MOL'].apply(lambda x: conjugation_degree(x))
169+
new_data = data[data['CONJUGATION'] >= 0.70]
170+
data = new_data.copy()
171+
print('Filtered by conjugation: ' + str(len(data.index)))
172+
173+
# Filter by ring size
174+
data['MAXIMUM_RINGSIZE'] = data['MOL'].apply(lambda x: maximum_ring_size(x))
175+
new_data = data[(data['MAXIMUM_RINGSIZE'] >= 5) & (data['MAXIMUM_RINGSIZE'] <= 7)]
176+
data = new_data.copy()
177+
print('Filtered by maximum ring size: ' + str(len(data.index)))
178+
179+
# Added after GDB-13 was filtered to get rid of 3-membered rings
180+
data['MINIMUM_RINGSIZE'] = data['MOL'].apply(lambda x: minimum_ring_size(x))
181+
new_data = data[(data['MINIMUM_RINGSIZE'] >= 5) & (data['MINIMUM_RINGSIZE'] <= 7)]
182+
data = new_data.copy()
183+
print('Filtered by minimum ring size: ' + str(len(data.index)))
184+
185+
# Filter by functional groups
186+
data['VIOLATIONS'] = data['MOL'].apply(lambda x: substructure_violations(x))
187+
new_data = data[data['VIOLATIONS'] == False]
188+
removed_data = data[data['VIOLATIONS'] == True]
189+
data = new_data.copy()
190+
print('Filtered by functional groups: ' + str(len(data.index)))
191+
192+
# Save processed data
193+
data.to_csv(output, columns = ['SMILES','NUMBER'], sep = ' ', index = False, header = False)
194+
195+
if arguments.removed == '1':
196+
removed_data.to_csv(output_removed, columns = ['SMILES','NUMBER'], sep = ' ', index = False, header = False)
197+
198+
return
199+
200+
201+
if __name__ == "__main__":
202+
main()

0 commit comments

Comments
 (0)