-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocess.py
More file actions
87 lines (79 loc) · 3.45 KB
/
Copy pathpreprocess.py
File metadata and controls
87 lines (79 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import sys
import csv
import numpy as np
import pickle
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Draw
import scipy.sparse as sp
import ReactionGraph as rg
def import_patent_dataset(data_file="data/1976_Sep2016_USPTOgrants_smiles.rsmi", num_reactions=np.inf, max_num_atoms=128):
    """Convert reactions from a tab-separated reaction-SMILES file into pickled graphs.

    The first line of ``data_file`` is treated as a header and skipped. For
    every following row, the first whitespace-delimited token of the first
    column is read as a reaction SMILES (``reactants>reagents>products``); the
    reagent part is ignored. A row is skipped, with a message, when it is
    malformed, when either side cannot be parsed and sanitized by RDKit, or
    when either side has more than ``max_num_atoms`` atoms.

    Args:
        data_file: Path of the tab-separated input file.
        num_reactions: Stop once this many reactions have been imported
            successfully. The default (infinity) reads the whole file.
        max_num_atoms: Largest atom count allowed on either side of a
            reaction; also forwarded to ``ReactionSideGraph.from_rdMol``.

    Side effects:
        Prints one line per skipped reaction plus a final summary, and writes
        a pickle file holding two consecutive dumps: the list of reactant
        graphs, then the list of product graphs.
    """
    reactant = []
    product = []
    try_count = 0
    success_count = 0
    fail_count = 0

    with open(data_file, newline='') as f:
        csv_reader = csv.reader(f, delimiter='\t')
        # Skip the header row; the default keeps an empty file from raising StopIteration.
        next(csv_reader, None)
        for row in csv_reader:
            try_count += 1
            # Reported line numbers are offset by one to account for the header row.
            line_number = try_count + 1

            # Reaction SMILES format: reactant_1.reactant_2>reagent>product_1.product_2
            try:
                # Keep only the first whitespace-delimited token of the first column.
                reaction_smiles = row[0].split(None, 1)[0]
                reac_smiles, _, prod_smiles = reaction_smiles.split(">")  # Not interested in reagents/catalysts.
            except (IndexError, ValueError):
                # Blank row, empty first column, or not exactly two '>' separators.
                fail_count += 1
                print("\tSkipping reaction on line {}, malformed reaction SMILES.".format(line_number))
                continue

            reac = _sanitized_mol_or_none(reac_smiles)
            if reac is None:
                fail_count += 1
                print("\tSkipping reaction on line {}, error in reactant SMILES.".format(line_number))
                continue

            prod = _sanitized_mol_or_none(prod_smiles)
            if prod is None:
                fail_count += 1
                print("\tSkipping reaction on line {}, error in product SMILES".format(line_number))
                continue

            if reac.GetNumAtoms() > max_num_atoms or prod.GetNumAtoms() > max_num_atoms:
                fail_count += 1
                print("\tSkipping reaction on line {}, has greater than {} atoms.".format(line_number, max_num_atoms))
                continue

            reactant.append(rg.ReactionSideGraph.from_rdMol([reac], max_num_atoms))
            product.append(rg.ReactionSideGraph.from_rdMol([prod], max_num_atoms))
            success_count += 1
            if success_count >= num_reactions:
                break

    print("Attempted import of {} reactions with {} successes and {} skips.".format(try_count, success_count, fail_count))

    # NOTE(review): the output path is absolute ("/data/...") while the default
    # input path is relative ("data/...") -- confirm this is intentional.
    with open("/data/LoweUSPTOGrants_1976-2016_{}Atoms_{}Reactions.pickle".format(max_num_atoms, success_count), 'wb') as f:
        pickle.dump(reactant, f)
        pickle.dump(product, f)


def _sanitized_mol_or_none(smiles):
    """Return a sanitized RDKit molecule for ``smiles``, or None if it is invalid."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        return None
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # The concrete sanitization exception classes differ between RDKit
        # releases, so any ordinary exception is treated as "invalid molecule"
        # while KeyboardInterrupt/SystemExit still propagate.
        return None
    return mol
def import_logp_dataset(data_file="data/logP_dataset.csv", num_compounds=100000, max_num_atoms=32):
    """Convert a comma-separated ``SMILES,logP`` file into pickled graphs and values.

    Each row's first column is parsed as a SMILES string and its second column
    as a floating-point logP value. A row is skipped, with a message, when it
    has fewer than two columns, when the logP value is not numeric, or when
    RDKit cannot parse and sanitize the SMILES.

    Args:
        data_file: Path of the comma-separated input file.
        num_compounds: Maximum number of compounds to import; reading stops
            once this many have been imported successfully.
        max_num_atoms: Forwarded to ``ReactionSideGraph.from_rdMol``.
            NOTE(review): unlike ``import_patent_dataset`` no atom-count
            filter is applied here before conversion -- confirm that
            ``from_rdMol`` copes with larger molecules.

    Side effects:
        Prints one line per skipped compound plus a final summary, and writes
        ``KagglelogP.pickle`` in the current directory holding two consecutive
        dumps: the list of compound graphs, then the list of logP values.
    """
    compound = []
    log_p = []
    try_count = 0
    success_count = 0
    fail_count = 0

    # newline='' is what the csv module expects for files it reads.
    with open(data_file, newline='') as f:
        for row in csv.reader(f, delimiter=','):
            # Honour the requested cap before touching the next row.
            if success_count >= num_compounds:
                break
            try_count += 1

            # Validate both fields up front so the two output lists can never
            # get out of step with each other.
            try:
                smiles = row[0]
                value = float(row[1])
            except (IndexError, ValueError):
                fail_count += 1
                print("\tSkipping compound on line {}, malformed row".format(try_count))
                continue

            new_compound = Chem.MolFromSmiles(smiles)
            if new_compound is not None:
                try:
                    Chem.SanitizeMol(new_compound)
                except Exception:
                    # Sanitization exception classes vary between RDKit
                    # releases; treat any ordinary exception as invalid input.
                    new_compound = None
            if new_compound is None:
                fail_count += 1
                print("\tSkipping compound on line {}, error in SMILES".format(try_count))
                continue

            compound.append(rg.ReactionSideGraph.from_rdMol([new_compound], max_num_atoms))
            log_p.append(value)
            success_count += 1

    print("Attempted import of {} compounds with {} successes and {} skips.".format(try_count, success_count, fail_count))

    with open("KagglelogP.pickle", 'wb') as f:
        pickle.dump(compound, f)
        pickle.dump(log_p, f)
# Script entry point: run the logP import with its default arguments.
# NOTE(review): this call is unguarded, so it also runs whenever this module is
# imported -- consider wrapping it in `if __name__ == "__main__":`.
import_logp_dataset()