CoTox/CoTox_iupac_gpt_4o.py at main · dmis-lab/CoTox · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import json, gzip, os, pickle, time, math
from datetime import date
from tqdm import tqdm
from openai import OpenAI
from liquid import Template
import argparse
import pandas as pd
from metric import *

def _str_to_bool(value):
    """Convert a command-line token such as 'true'/'False'/'1'/'no' to a bool.

    argparse's ``type=bool`` simply calls ``bool()`` on the raw string, so
    every non-empty value (including 'False') becomes True. This converter
    interprets the text instead.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays False, matching what bool('') used to yield.
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_args():
    """Parse the script's command-line options.

    Options:
        --test_name  name of the test split (default 'test_1').
        --gpu        GPU identifier, kept as a string (default '0').
        --error      boolean switch (default False). Accepts an explicit
                     value (``--error True`` / ``--error False``) or the bare
                     flag ``--error``, which means True.

    Returns:
        argparse.Namespace with attributes ``test_name``, ``gpu`` and ``error``.
    """
    parser = argparse.ArgumentParser(description="Argument parser example")
    parser.add_argument('--test_name', type=str, default='test_1')
    parser.add_argument('--gpu', type=str, default='0')
    # nargs='?' + const=True keeps "--error True" working and also allows a
    # bare "--error" flag.
    parser.add_argument('--error', type=_str_to_bool, nargs='?', const=True, default=False)
    return parser.parse_args()

# Module-level configuration: the command line is parsed and the dataset is
# loaded as soon as this file is executed.
args = parse_args()
test_name = args.test_name
unitox_df = pd.read_csv(f'./CTD/Unitox_CTD_Drug_{test_name}.csv')

# Ground-truth label columns: every "*_binary_rating_0_1" column except the
# dermatological and ototoxicity ones.
toxicity_columns = [
    col for col in unitox_df.columns
    if col.endswith("_binary_rating_0_1")
    and not any(x in col for x in ['dermatological', 'ototoxicity'])
]

# One row of 0/1 labels per drug; missing ratings are treated as 0.
true_lst = unitox_df[toxicity_columns].fillna(0).astype(int).values.tolist()
# Keys looked up in the model's JSON answer; predictions are emitted in this
# order.
# NOTE(review): this order presumably matches the column order of
# toxicity_columns in the CSV -- confirm, since labels and predictions are
# compared position by position.
toxicity_types = [
        "Cardiotoxicity",
        "Hematological Toxicity",
        "Infertility",
        "Liver Toxicity",
        "Pulmonary Toxicity",
        "Renal Toxicity"
    ]
saved_name = f"cotox_iupac_gpt_4o_{test_name}"

# tox_summary() calls client.chat.completions.create(...), so `client` has to
# be an OpenAI client object; a bare key string has no `.chat` attribute.
# The key is taken from the OPENAI_API_KEY environment variable when set,
# otherwise the placeholder below must be replaced with a real key.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "Your_API_KEY"))

# System message sent with every chat-completion request in tox_summary().
# It sets the model's role and demands a JSON-only reply; the JSON schema
# itself is spelled out in the user prompt template.
system_prompt = '''You are an expert in cheminformatics and toxicology. Your task is to predict toxicity for small molecules using:

1. Pathway involvement in toxicity mechanisms.
2. GO terms' biological implications.
3. IUPAC name-based structural interpretation to support and explain toxicity mechanisms.

Your response must be strictly in JSON format. **Do not include any explanation, text, or information outside the JSON object.** The JSON format is as follows:'''

# Liquid template for the user message. tox_summary() renders it with three
# variables: iupac_name, pathway_lst and GO_lst.
# The example JSON below defines the reply shape that tox_summary() parses:
# it reads ["Toxicity Predictions"][<toxicity type>]["Prediction"] for each of
# the six toxicity types, so these keys must stay in sync with toxicity_types.
user_prompt = Template("""
Predict toxicity for each type ("Toxic" or "Non-Toxic") based on the provided **organ-specific** Pathways and GO Terms. Use IUPAC name analysis only to support the evidence from Pathway and GO Term analyses.
Finally, provide a step-by-step explanation of the overall mechanism combining evidence from Pathways, GO Terms, and the IUPAC name to describe how the compound causes toxicity in the body.

### IUPAC Name:
{{iupac_name}}

### List of Pathway:
{{pathway_lst}}

### List of GO Term:
{{GO_lst}}

### Required Response:
- Return the response strictly in the JSON format below.
- Do not include any additional text, explanation, or comments outside the JSON.

```json
{
    "Toxicity Predictions": {
        "Cardiotoxicity": {
            "Reasoning": [
                "Pathway: Explanation of pathway involvement in cardiotoxicity and the biological processes it triggers.",
                "GO Term: Explanation of biological outcomes linked to cardiotoxicity based on GO Term analysis.",
                "IUPAC Support: Explanation of how structural features inferred from the IUPAC name contribute to the biological processes and pathways leading to cardiotoxicity.",
                "Overall Mechanism: Combined explanation of how the compound causes cardiotoxicity in the body."
            ],
            "Prediction": "Toxic" or "Non-Toxic"
        },
        "Hematological Toxicity": {
            "Reasoning": [
                "Pathway: Explanation of pathway involvement in hematological toxicity and the biological processes it triggers.",
                "GO Term: Explanation of biological outcomes linked to hematological toxicity based on GO Term analysis.",
                "IUPAC Support: Explanation of how structural features inferred from the IUPAC name contribute to the biological processes and pathways leading to hematological toxicity.",
                "Overall Mechanism: Combined explanation of how the compound causes hematological toxicity in the body."
            ],
            "Prediction": "Toxic" or "Non-Toxic"
        },
        "Infertility": {
            "Reasoning": [
                "Pathway: Explanation of pathway involvement in infertility and the biological processes it triggers.",
                "GO Term: Explanation of biological outcomes linked to infertility based on GO Term analysis.",
                "IUPAC Support: Explanation of how structural features inferred from the IUPAC name contribute to the biological processes and pathways leading to infertility.",
                "Overall Mechanism: Combined explanation of how the compound causes infertility in the body."
            ],
            "Prediction": "Toxic" or "Non-Toxic"
        },
        "Liver Toxicity": {
            "Reasoning": [
                "Pathway: Explanation of pathway involvement in liver toxicity and the biological processes it triggers.",
                "GO Term: Explanation of biological outcomes linked to liver toxicity based on GO Term analysis.",
                "IUPAC Support: Explanation of how structural features inferred from the IUPAC name contribute to the biological processes and pathways leading to liver toxicity.",
                "Overall Mechanism: Combined explanation of how the compound causes liver toxicity in the body."
            ],
            "Prediction": "Toxic" or "Non-Toxic"
        },
        "Pulmonary Toxicity": {
            "Reasoning": [
                "Pathway: Explanation of pathway involvement in pulmonary toxicity and the biological processes it triggers.",
                "GO Term: Explanation of biological outcomes linked to pulmonary toxicity based on GO Term analysis.",
                "IUPAC Support: Explanation of how structural features inferred from the IUPAC name contribute to the biological processes and pathways leading to pulmonary toxicity.",
                "Overall Mechanism: Combined explanation of how the compound causes pulmonary toxicity in the body."
            ],
            "Prediction": "Toxic" or "Non-Toxic"
        },
        "Renal Toxicity": {
            "Reasoning": [
                "Pathway: Explanation of pathway involvement in renal toxicity and the biological processes it triggers.",
                "GO Term: Explanation of biological outcomes linked to renal toxicity based on GO Term analysis.",
                "IUPAC Support: Explanation of how structural features inferred from the IUPAC name contribute to the biological processes and pathways leading to renal toxicity.",
                "Overall Mechanism: Combined explanation of how the compound causes renal toxicity in the body."
            ],
            "Prediction": "Toxic" or "Non-Toxic"
        }
    }
}
```""")


def _extract_json_payload(content):
    """Return the JSON text of a model reply, unwrapping a ```json fence if present.

    A reply without a ```json marker is returned unchanged. If the opening
    fence has no closing ``` after it, everything after the opening marker is
    returned instead of an empty string.
    """
    fence_open = "```json"
    if fence_open not in content:
        return content
    start = content.index(fence_open) + len(fence_open)
    # Search only after the opening fence; otherwise an unterminated block
    # would match the opening fence itself and produce an empty slice.
    end = content.rfind("```", start)
    if end == -1:
        end = len(content)
    return content[start:end].strip()


def _safe_filename_part(name):
    """Replace path separators in *name* so it can be used inside a file name."""
    cleaned = name
    for sep in ("/", os.sep, os.altsep):
        if sep:
            cleaned = cleaned.replace(sep, "_")
    return cleaned


def tox_summary(idx, drug_name, iupac_name, pathway_lst, go_lst, max_retries=3):
    """Ask GPT-4o for organ-toxicity predictions for one drug and save the reply.

    The rendered user prompt and the system prompt are sent to the chat
    completions endpoint. The JSON reply is parsed, converted to a 0/1 list
    ordered like ``toxicity_types`` (1 only when the "Prediction" field is
    exactly "Toxic"), annotated with the predicted and true label lists, and
    written to ``./results/{saved_name}/{idx}_{drug_name}.json``.

    Args:
        idx: row index of the drug; used to look up ``true_lst[idx]`` and in
            the output file name.
        drug_name: drug name used in the output file name (path separators
            are replaced with '_').
        iupac_name: IUPAC name inserted into the prompt.
        pathway_lst: pathways inserted into the prompt.
        go_lst: GO terms inserted into the prompt.
        max_retries: maximum number of request/parse attempts.

    Returns:
        The list of 0/1 predictions, or None when every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": user_prompt.render(
                            iupac_name=iupac_name,
                            pathway_lst=pathway_lst,
                            GO_lst=go_lst
                        )
                    }
                ],
                model="gpt-4o",
                logprobs=True,
                temperature=0.0,
                seed=42
            )

            content = chat_completion.choices[0].message.content
            toxicity_data = json.loads(_extract_json_payload(content))

            predictions = toxicity_data["Toxicity Predictions"]
            final_answer_list = [
                1 if predictions[toxicity_type]["Prediction"] == "Toxic" else 0
                for toxicity_type in toxicity_types
            ]
            toxicity_data["Final_Answer_List"] = final_answer_list
            toxicity_data["True_Answer_List"] = true_lst[idx]

        except json.JSONDecodeError:
            print(f"Attempt {attempt + 1}: Unable to parse JSON content. Retrying...")
            continue

        except Exception as e:
            # Covers API/network errors and replies missing expected keys.
            print(f"Attempt {attempt + 1}: An unexpected error occurred: {e}")
            continue

        # Saving happens outside the retry-guarded section: a disk problem
        # must not trigger another (paid) API request for a reply that was
        # already parsed successfully.
        out_dir = f"./results/{saved_name}"
        out_path = f"{out_dir}/{idx}_{_safe_filename_part(drug_name)}.json"
        try:
            os.makedirs(out_dir, exist_ok=True)
            with open(out_path, "w") as json_file:
                json.dump(toxicity_data, json_file, indent=4)
        except OSError as e:
            print(f"Warning: could not save {out_path}: {e}")

        return final_answer_list

    print("Error: All attempts to process the response failed.")
    return None


# Per-drug annotations (IUPAC name, pathways, GO terms) keyed by
# "{row index}_{lower-cased generic name}".
# NOTE(review): this path is relative to the working directory, while the CSV
# above is read from ./CTD/ -- confirm the JSON is really expected here.
file_path = f"Unitox_CTD_Drug_{test_name}.json"
# JSON text is UTF-8; decode it explicitly rather than with the locale default.
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

if __name__ == "__main__":
    # Driver: query the model for every drug in the CSV, then score the
    # predictions against the ground-truth labels and save the metrics.
    print(f"Processing {saved_name}")
    # Materialise the name column once instead of rebuilding the full list on
    # every iteration.
    generic_names = unitox_df['generic_name'].tolist()
    preds_lst = []
    failed_keys = []
    for i, generic_name in enumerate(tqdm(generic_names)):
        Chemical_name = generic_name.lower()
        key_name = f"{i}_{Chemical_name}"
        iupac_name = data[key_name]['iupac_name']
        # De-duplicate, then sort: plain set iteration order for strings
        # varies between interpreter runs, which would change the prompt and
        # defeat the fixed seed. key=str keeps sorting safe for mixed values.
        path_lst = sorted(set(data[key_name]['pathways']), key=str)
        GO_lst = sorted(set(data[key_name]['go_terms']), key=str)

        tox_pred = tox_summary(i, Chemical_name, iupac_name, path_lst, GO_lst)
        if tox_pred is None:
            failed_keys.append(key_name)

        preds_lst.append(tox_pred)

    if failed_keys:
        # tox_summary returns None after exhausting its retries; those entries
        # are still passed on unchanged, so make the situation visible.
        print(f"Warning: {len(failed_keys)} drug(s) have no prediction: {failed_keys}")

    metric_results_df= evaluate_metrics(true_lst, preds_lst)

    # tox_summary normally creates this directory, but not if every call
    # failed (or the CSV was empty).
    os.makedirs(f"./results/{saved_name}", exist_ok=True)
    output_file = f"./results/{saved_name}/scores_output.csv"
    metric_results_df.to_csv(output_file)

    print(f"{saved_name} Scores Saved!!")