ensc813-project/preprocess.py at master · bretthannigan/ensc813-project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import sys
import csv

import numpy as np
import pickle
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Draw
import scipy.sparse as sp

import ReactionGraph as rg

def _sanitized_patent_mol(smiles):
    """Return the sanitized RDKit molecule for *smiles*, or None if RDKit rejects it."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        return None
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Deliberate best-effort: any sanitization failure means "skip this
        # reaction". Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt/SystemExit propagate.
        return None
    return mol


def import_patent_dataset(data_file="data/1976_Sep2016_USPTOgrants_smiles.rsmi", num_reactions=np.inf, max_num_atoms=128):
    """Import reactions from a reaction-SMILES file and pickle them as graphs.

    The file is read as tab-separated text and its first row is skipped as a
    header. For every following row, the first whitespace-delimited token of
    the first column is taken as the reaction SMILES
    (``reactants>reagents>products``); the reagent part is discarded.

    A reaction is skipped (and reported on stdout) when the reaction SMILES is
    malformed, when either side cannot be parsed or sanitized by RDKit, or
    when either side has more than ``max_num_atoms`` atoms.

    The list of reactant graphs and then the list of product graphs are
    pickled, as two consecutive objects, into a file whose name embeds
    ``max_num_atoms`` and the number of imported reactions.

    Args:
        data_file: Path of the tab-separated reaction file.
        num_reactions: Stop once this many reactions have been imported (the
            limit is checked after each successful import). Defaults to no
            limit.
        max_num_atoms: Largest atom count allowed on either side; also passed
            to ``rg.ReactionSideGraph.from_rdMol``.
    """
    reactant = []
    product = []
    try_count = 0
    success_count = 0
    fail_count = 0

    with open(data_file, newline='') as f:
        csv_reader = csv.reader(f, delimiter='\t')
        next(csv_reader, None)  # Skip the header row; tolerate an empty file.
        for row in csv_reader:
            try_count += 1
            line_number = try_count + 1  # +1 accounts for the skipped header row.

            # Reaction SMILES format: reactant_1.reactant_2>reagent>product_1.product_2
            tokens = row[0].split(None, 1) if row else []
            parts = tokens[0].split(">") if tokens else []
            if len(parts) != 3:
                fail_count += 1
                print("\tSkipping reaction on line {}, malformed reaction SMILES.".format(line_number))
                continue
            reac_smiles, _, prod_smiles = parts  # Not interested in reagents/catalysts.

            reac = _sanitized_patent_mol(reac_smiles)
            if reac is None:
                fail_count += 1
                print("\tSkipping reaction on line {}, error in reactant SMILES.".format(line_number))
                continue

            prod = _sanitized_patent_mol(prod_smiles)
            if prod is None:
                fail_count += 1
                print("\tSkipping reaction on line {}, error in product SMILES".format(line_number))
                continue

            if reac.GetNumAtoms() > max_num_atoms or prod.GetNumAtoms() > max_num_atoms:
                fail_count += 1
                print("\tSkipping reaction on line {}, has greater than {} atoms.".format(line_number, max_num_atoms))
                continue

            reactant.append(rg.ReactionSideGraph.from_rdMol([reac], max_num_atoms))
            product.append(rg.ReactionSideGraph.from_rdMol([prod], max_num_atoms))
            success_count += 1
            if success_count >= num_reactions:
                break

    print("Attempted import of {} reactions with {} successes and {} skips.".format(try_count, success_count, fail_count))

    # NOTE(review): the output path is absolute ("/data/...") while the default
    # input path is relative ("data/..."); confirm this is intended.
    with open("/data/LoweUSPTOGrants_1976-2016_{}Atoms_{}Reactions.pickle".format(max_num_atoms, success_count), 'wb') as f:
        pickle.dump(reactant, f)
        pickle.dump(product, f)

def import_logp_dataset(data_file="data/logP_dataset.csv", num_compounds=100000, max_num_atoms=32):
    """Import a SMILES/logP CSV file and pickle the compounds and their logP values.

    Each row is expected to hold a SMILES string in the first column and a
    numeric logP value in the second. Rows that are too short, whose SMILES
    cannot be parsed or sanitized by RDKit, or whose logP value is not a
    number are skipped and reported on stdout.

    The list of compound graphs and then the list of logP floats are pickled,
    as two consecutive objects, to ``KagglelogP.pickle`` in the current
    working directory.

    Args:
        data_file: Path of the comma-separated input file.
        num_compounds: Stop once this many compounds have been imported (the
            limit is checked after each successful import).
        max_num_atoms: Passed to ``rg.ReactionSideGraph.from_rdMol``. Unlike
            ``import_patent_dataset``, compounds are not filtered by atom
            count here.
    """
    compound = []
    log_p = []
    try_count = 0
    success_count = 0
    fail_count = 0

    with open(data_file, newline='') as f:
        csv_reader = csv.reader(f, delimiter=',')
        for row in csv_reader:
            try_count += 1

            if len(row) < 2:
                fail_count += 1
                print("\tSkipping compound on line {}, missing SMILES or logP column".format(try_count))
                continue

            # MolFromSmiles signals a parse failure by returning None.
            new_compound = Chem.MolFromSmiles(row[0])
            if new_compound is not None:
                try:
                    Chem.SanitizeMol(new_compound)
                except Exception:
                    # Deliberate best-effort skip; unlike a bare ``except:``
                    # this still lets KeyboardInterrupt/SystemExit propagate.
                    new_compound = None
            if new_compound is None:
                fail_count += 1
                print("\tSkipping compound on line {}, error in SMILES".format(try_count))
                continue

            # Parse the value before touching either list so that `compound`
            # and `log_p` can never get out of step.
            try:
                value = float(row[1])
            except ValueError:
                fail_count += 1
                print("\tSkipping compound on line {}, error in logP value".format(try_count))
                continue

            compound.append(rg.ReactionSideGraph.from_rdMol([new_compound], max_num_atoms))
            log_p.append(value)
            success_count += 1
            if success_count >= num_compounds:
                break

    print("Attempted import of {} compounds with {} successes and {} skips.".format(try_count, success_count, fail_count))

    with open("KagglelogP.pickle", 'wb') as f:
        pickle.dump(compound, f)
        pickle.dump(log_p, f)

# Script entry point: runs the logP import with its default arguments
# (reads data/logP_dataset.csv, writes KagglelogP.pickle).
# NOTE(review): this call is unguarded, so it also runs whenever this module
# is imported; consider wrapping it in `if __name__ == "__main__":`.
import_logp_dataset()