|
51 | 51 | # ── Tuberculosis early drug discovery prompts ────────────────────────── |
52 | 52 |
|
53 | 53 | TB_PROMPT = ( |
54 | | - "Extract drug discovery entities from this tuberculosis (TB) research text. " |
55 | | - "This is specialized for early-stage TB drug discovery: target identification, " |
56 | | - "hit finding, fragment screening, lead optimization, and in vitro/in vivo profiling.\n\n" |
57 | | - "COMPOUNDS:\n" |
58 | | - "- First-line drugs: Isoniazid (INH), Rifampicin (RIF), Ethambutol (EMB), Pyrazinamide (PZA).\n" |
59 | | - "- New-generation: Bedaquiline (TMC207), Delamanid (OPC-67683), Pretomanid (PA-824), " |
60 | | - "Linezolid, Clofazimine, Moxifloxacin.\n" |
61 | | - "- Pipeline: BTZ043, PBTZ169 (Macozinone), SQ109, Q203 (Telacebec), TBA-7371, " |
62 | | - "GSK656, OPC-167832, SPR720, BRD-8000, Sanfetrinem, DG167, NITD-304, NITD-349.\n" |
63 | | - "- Extract compound names (generic, code names, series IDs like 'Compound 14a'), " |
64 | | - "SMILES (only if explicitly present), CAS numbers, and molecular formulas.\n\n" |
65 | | - "BIOLOGICAL TARGETS:\n" |
66 | | - "- Mycobacterial proteins are biological targets, NOT compounds. Examples: " |
67 | | - "ClpC1, DprE1, InhA, MmpL3, AtpE, QcrB, Pks13, KasA, GyrA, GyrB, MbtA, " |
68 | | - "EthA, PanC, LdtMt2, RpoB, PncA, EmbB, Ag85.\n" |
69 | | - "- Use 'target' for proteins with drug-targeting context. " |
70 | | - "Use 'gene_name' for gene loci and gene symbols. " |
71 | | - "Use 'protein_name' for proteins without drug-targeting context.\n\n" |
72 | | - "ACCESSION NUMBERS:\n" |
73 | | - "- Rv locus tags (Rv3596c, Rv3790, Rv1484, Rv0206c, Rv1305, etc.), " |
74 | | - "UniProt accessions (P9WPS1, P9WGR1, etc.), PDB codes (6CQ4, 5V3Y, etc.).\n" |
75 | | - "- Extract each accession as a separate entity.\n\n" |
76 | | - "PRODUCTS:\n" |
77 | | - "- Gene product descriptions from databases: enzyme names, protein function descriptions " |
78 | | - "(e.g., 'enoyl-ACP reductase', 'ATP synthase subunit c', " |
79 | | - "'decaprenylphosphoryl-beta-D-ribose 2-epimerase').\n\n" |
80 | | - "FUNCTIONAL CATEGORIES:\n" |
81 | | - "- Mycobacterial protein functional categories: intermediary metabolism and respiration, " |
82 | | - "cell wall and cell processes, virulence/detoxification/adaptation, " |
83 | | - "lipid metabolism, information pathways, regulatory proteins, " |
84 | | - "PE/PPE family, conserved hypotheticals.\n\n" |
85 | | - "SCREENING METHODS:\n" |
86 | | - "- Early drug discovery screening approaches: affinity-based screening, " |
87 | | - "biochemical assay, DNA encoded library (DEL), fragment screening, " |
88 | | - "hypomorph screening, whole-cell phenotypic screening, " |
89 | | - "target-based HTS, virtual screening, SPR-based screening.\n\n" |
90 | | - "DISEASES:\n" |
91 | | - "- Tuberculosis variants: TB, MDR-TB, XDR-TB, pre-XDR-TB, TDR-TB, " |
92 | | - "LTBI, active TB, pulmonary TB, extrapulmonary TB, TB meningitis, miliary TB.\n" |
93 | | - "- Capture both full names and abbreviations as separate entities.\n\n" |
94 | | - "BIOACTIVITY:\n" |
95 | | - "- MIC (against H37Rv, Erdman, CDC1551, clinical isolates, MDR/XDR strains), " |
96 | | - "MIC90, MBC, IC50, EC50, Ki. " |
97 | | - "Capture numeric value, unit (ug/mL, uM, nM), measurement type, and strain context.\n\n" |
98 | | - "ASSAYS:\n" |
99 | | - "- MABA, LORA, REMA, macrophage infection (THP-1, J774, RAW264.7), " |
100 | | - "time-kill kinetics, checkerboard synergy, mouse acute/chronic infection models, " |
101 | | - "guinea pig aerosol model.\n\n" |
102 | | - "MECHANISMS OF ACTION:\n" |
103 | | - "- Mycolic acid biosynthesis inhibition, ATP synthase inhibition, " |
104 | | - "cell wall arabinan biosynthesis disruption, menaquinone biosynthesis inhibition, " |
105 | | - "trehalose monomycolate transport inhibition, covalent modification, " |
106 | | - "DNA gyrase inhibition, decaprenylphosphoryl-beta-D-ribose oxidation.\n\n" |
| 54 | + "Extract drug discovery entities from this tuberculosis research text.\n\n" |
| 55 | + "DISAMBIGUATION RULES:\n" |
| 56 | + "- Mycobacterial proteins (e.g. ClpC1, DprE1, InhA, AtpE, MmpL3, QcrB) " |
| 57 | + "are biological targets, NOT compounds.\n" |
| 58 | + "- Rv locus tags (Rv3790, Rv1484), UniProt IDs (P9WPS1), and PDB codes " |
| 59 | + "are accession_number, not target or gene_name.\n" |
| 60 | + "- Enzyme descriptions like 'enoyl-ACP reductase' are product, not target.\n" |
| 61 | + "- 'cell wall', 'lipid metabolism' are functional_category, not mechanism_of_action.\n" |
| 62 | + "- 'fragment screening', 'biochemical assay' are screening_method, not assay.\n" |
| 63 | + "- Use target for proteins in a drug-targeting context, gene_name for loci, " |
| 64 | + "protein_name for non-drug-target proteins.\n\n" |
107 | 65 | "Extract only what is explicitly stated; do not infer or generate values." |
108 | 66 | ) |
109 | 67 |
|
110 | 68 | TB_CHEMISTRY_PROMPT = ( |
111 | 69 | "Extract chemical entities from this tuberculosis drug discovery text. " |
112 | | - "Include: compound names (generic names, IUPAC names, clinical codes like 'TMC207', " |
113 | | - "series identifiers like 'Compound 14a', brand names), " |
114 | | - "SMILES strings (only if explicitly written), CAS registry numbers, and molecular formulas. " |
115 | | - "TB compound naming conventions: first-line drugs (INH, RIF, EMB, PZA), " |
116 | | - "second-line drugs (Bedaquiline, Delamanid, Pretomanid), " |
117 | | - "pipeline compounds (BTZ043, PBTZ169, SQ109, Q203, TBA-7371, GSK656, DG167, NITD-304). " |
118 | | - "Capture all synonyms and code names as separate compound_name entities." |
| 70 | + "Include compound names, SMILES (only if explicitly written), CAS numbers, " |
| 71 | + "and molecular formulas. " |
| 72 | + "Mycobacterial proteins (ClpC1, DprE1, InhA, AtpE, etc.) are NOT compounds. " |
| 73 | + "Extract only what is explicitly stated; do not infer or generate values." |
119 | 74 | ) |
120 | 75 |
|
121 | 76 | TB_BIOLOGY_PROMPT = ( |
122 | | - "Extract biological target entities from this tuberculosis research text. " |
123 | | - "Focus on mycobacterial drug targets and their identifiers. " |
124 | | - "Key targets: DprE1, InhA, MmpL3, AtpE, ClpC1, ClpP1P2, QcrB, Pks13, KasA, " |
125 | | - "GyrA, GyrB, MbtA, EthA, PanC, LdtMt2, RpoB, PncA, EmbB, Ag85 complex. " |
126 | | - "These are biological targets, NOT compounds. " |
127 | | - "Use 'target' for proteins with drug-targeting context, 'gene_name' for gene loci " |
128 | | - "and gene symbols, 'protein_name' for other proteins. " |
129 | | - "Extract Rv locus tags and UniProt accessions as 'accession_number'. " |
130 | | - "Extract enzyme names and protein function descriptions as 'product'. " |
131 | | - "Extract functional categories (cell wall, lipid metabolism, virulence, etc.) " |
132 | | - "as 'functional_category'." |
| 77 | + "Extract biological entities from this tuberculosis research text. " |
| 78 | + "Use target for proteins in a drug-targeting context, gene_name for loci, " |
| 79 | + "protein_name for non-drug-target proteins. " |
| 80 | + "Rv locus tags and UniProt IDs are accession_number. " |
| 81 | + "Enzyme descriptions (e.g. 'enoyl-ACP reductase') are product. " |
| 82 | + "Protein functional categories (e.g. 'cell wall', 'lipid metabolism') " |
| 83 | + "are functional_category. " |
| 84 | + "Extract only what is explicitly stated; do not infer or generate values." |
133 | 85 | ) |
0 commit comments