Skip to content

Commit 333b2cc

Browse files
committed
prompt optimizations
1 parent c0a3e6b commit 333b2cc

5 files changed

Lines changed: 141 additions & 82 deletions

File tree

structflo/ner/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
EntityProfile,
6262
)
6363

64-
__version__ = "0.2.0"
64+
__version__ = "0.2.1"
6565

6666
__all__ = [
6767
# Main class

structflo/ner/_examples.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -606,16 +606,16 @@
606606
extraction_text="MIC of 3.1 uM",
607607
attributes={"value": "3.1", "unit": "uM", "assay_type": "MIC"},
608608
),
609-
lx.data.Extraction(
610-
extraction_class="bioactivity",
611-
extraction_text="CC50 >50 uM",
612-
attributes={"value": ">50", "unit": "uM", "assay_type": "CC50"},
613-
),
614609
lx.data.Extraction(
615610
extraction_class="assay",
616611
extraction_text="HepG2 cells",
617612
attributes={"cell_line": "HepG2", "assay_format": "cytotoxicity"},
618613
),
614+
lx.data.Extraction(
615+
extraction_class="bioactivity",
616+
extraction_text="CC50 >50 uM",
617+
attributes={"value": ">50", "unit": "uM", "assay_type": "CC50"},
618+
),
619619
],
620620
)
621621

structflo/ner/_prompts.py

Lines changed: 23 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -51,83 +51,35 @@
5151
# ── Tuberculosis early drug discovery prompts ──────────────────────────
5252

5353
TB_PROMPT = (
54-
"Extract drug discovery entities from this tuberculosis (TB) research text. "
55-
"This is specialized for early-stage TB drug discovery: target identification, "
56-
"hit finding, fragment screening, lead optimization, and in vitro/in vivo profiling.\n\n"
57-
"COMPOUNDS:\n"
58-
"- First-line drugs: Isoniazid (INH), Rifampicin (RIF), Ethambutol (EMB), Pyrazinamide (PZA).\n"
59-
"- New-generation: Bedaquiline (TMC207), Delamanid (OPC-67683), Pretomanid (PA-824), "
60-
"Linezolid, Clofazimine, Moxifloxacin.\n"
61-
"- Pipeline: BTZ043, PBTZ169 (Macozinone), SQ109, Q203 (Telacebec), TBA-7371, "
62-
"GSK656, OPC-167832, SPR720, BRD-8000, Sanfetrinem, DG167, NITD-304, NITD-349.\n"
63-
"- Extract compound names (generic, code names, series IDs like 'Compound 14a'), "
64-
"SMILES (only if explicitly present), CAS numbers, and molecular formulas.\n\n"
65-
"BIOLOGICAL TARGETS:\n"
66-
"- Mycobacterial proteins are biological targets, NOT compounds. Examples: "
67-
"ClpC1, DprE1, InhA, MmpL3, AtpE, QcrB, Pks13, KasA, GyrA, GyrB, MbtA, "
68-
"EthA, PanC, LdtMt2, RpoB, PncA, EmbB, Ag85.\n"
69-
"- Use 'target' for proteins with drug-targeting context. "
70-
"Use 'gene_name' for gene loci and gene symbols. "
71-
"Use 'protein_name' for proteins without drug-targeting context.\n\n"
72-
"ACCESSION NUMBERS:\n"
73-
"- Rv locus tags (Rv3596c, Rv3790, Rv1484, Rv0206c, Rv1305, etc.), "
74-
"UniProt accessions (P9WPS1, P9WGR1, etc.), PDB codes (6CQ4, 5V3Y, etc.).\n"
75-
"- Extract each accession as a separate entity.\n\n"
76-
"PRODUCTS:\n"
77-
"- Gene product descriptions from databases: enzyme names, protein function descriptions "
78-
"(e.g., 'enoyl-ACP reductase', 'ATP synthase subunit c', "
79-
"'decaprenylphosphoryl-beta-D-ribose 2-epimerase').\n\n"
80-
"FUNCTIONAL CATEGORIES:\n"
81-
"- Mycobacterial protein functional categories: intermediary metabolism and respiration, "
82-
"cell wall and cell processes, virulence/detoxification/adaptation, "
83-
"lipid metabolism, information pathways, regulatory proteins, "
84-
"PE/PPE family, conserved hypotheticals.\n\n"
85-
"SCREENING METHODS:\n"
86-
"- Early drug discovery screening approaches: affinity-based screening, "
87-
"biochemical assay, DNA encoded library (DEL), fragment screening, "
88-
"hypomorph screening, whole-cell phenotypic screening, "
89-
"target-based HTS, virtual screening, SPR-based screening.\n\n"
90-
"DISEASES:\n"
91-
"- Tuberculosis variants: TB, MDR-TB, XDR-TB, pre-XDR-TB, TDR-TB, "
92-
"LTBI, active TB, pulmonary TB, extrapulmonary TB, TB meningitis, miliary TB.\n"
93-
"- Capture both full names and abbreviations as separate entities.\n\n"
94-
"BIOACTIVITY:\n"
95-
"- MIC (against H37Rv, Erdman, CDC1551, clinical isolates, MDR/XDR strains), "
96-
"MIC90, MBC, IC50, EC50, Ki. "
97-
"Capture numeric value, unit (ug/mL, uM, nM), measurement type, and strain context.\n\n"
98-
"ASSAYS:\n"
99-
"- MABA, LORA, REMA, macrophage infection (THP-1, J774, RAW264.7), "
100-
"time-kill kinetics, checkerboard synergy, mouse acute/chronic infection models, "
101-
"guinea pig aerosol model.\n\n"
102-
"MECHANISMS OF ACTION:\n"
103-
"- Mycolic acid biosynthesis inhibition, ATP synthase inhibition, "
104-
"cell wall arabinan biosynthesis disruption, menaquinone biosynthesis inhibition, "
105-
"trehalose monomycolate transport inhibition, covalent modification, "
106-
"DNA gyrase inhibition, decaprenylphosphoryl-beta-D-ribose oxidation.\n\n"
54+
"Extract drug discovery entities from this tuberculosis research text.\n\n"
55+
"DISAMBIGUATION RULES:\n"
56+
"- Mycobacterial proteins (e.g. ClpC1, DprE1, InhA, AtpE, MmpL3, QcrB) "
57+
"are biological targets, NOT compounds.\n"
58+
"- Rv locus tags (Rv3790, Rv1484), UniProt IDs (P9WPS1), and PDB codes "
59+
"are accession_number, not target or gene_name.\n"
60+
"- Enzyme descriptions like 'enoyl-ACP reductase' are product, not target.\n"
61+
"- 'cell wall', 'lipid metabolism' are functional_category, not mechanism_of_action.\n"
62+
"- 'fragment screening', 'biochemical assay' are screening_method, not assay.\n"
63+
"- Use target for proteins in a drug-targeting context, gene_name for loci, "
64+
"protein_name for non-drug-target proteins.\n\n"
10765
"Extract only what is explicitly stated; do not infer or generate values."
10866
)
10967

11068
TB_CHEMISTRY_PROMPT = (
11169
"Extract chemical entities from this tuberculosis drug discovery text. "
112-
"Include: compound names (generic names, IUPAC names, clinical codes like 'TMC207', "
113-
"series identifiers like 'Compound 14a', brand names), "
114-
"SMILES strings (only if explicitly written), CAS registry numbers, and molecular formulas. "
115-
"TB compound naming conventions: first-line drugs (INH, RIF, EMB, PZA), "
116-
"second-line drugs (Bedaquiline, Delamanid, Pretomanid), "
117-
"pipeline compounds (BTZ043, PBTZ169, SQ109, Q203, TBA-7371, GSK656, DG167, NITD-304). "
118-
"Capture all synonyms and code names as separate compound_name entities."
70+
"Include compound names, SMILES (only if explicitly written), CAS numbers, "
71+
"and molecular formulas. "
72+
"Mycobacterial proteins (ClpC1, DprE1, InhA, AtpE, etc.) are NOT compounds. "
73+
"Extract only what is explicitly stated; do not infer or generate values."
11974
)
12075

12176
TB_BIOLOGY_PROMPT = (
122-
"Extract biological target entities from this tuberculosis research text. "
123-
"Focus on mycobacterial drug targets and their identifiers. "
124-
"Key targets: DprE1, InhA, MmpL3, AtpE, ClpC1, ClpP1P2, QcrB, Pks13, KasA, "
125-
"GyrA, GyrB, MbtA, EthA, PanC, LdtMt2, RpoB, PncA, EmbB, Ag85 complex. "
126-
"These are biological targets, NOT compounds. "
127-
"Use 'target' for proteins with drug-targeting context, 'gene_name' for gene loci "
128-
"and gene symbols, 'protein_name' for other proteins. "
129-
"Extract Rv locus tags and UniProt accessions as 'accession_number'. "
130-
"Extract enzyme names and protein function descriptions as 'product'. "
131-
"Extract functional categories (cell wall, lipid metabolism, virulence, etc.) "
132-
"as 'functional_category'."
77+
"Extract biological entities from this tuberculosis research text. "
78+
"Use target for proteins in a drug-targeting context, gene_name for loci, "
79+
"protein_name for non-drug-target proteins. "
80+
"Rv locus tags and UniProt IDs are accession_number. "
81+
"Enzyme descriptions (e.g. 'enoyl-ACP reductase') are product. "
82+
"Protein functional categories (e.g. 'cell wall', 'lipid metabolism') "
83+
"are functional_category. "
84+
"Extract only what is explicitly stated; do not infer or generate values."
13385
)

structflo/ner/extractor.py

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,16 @@
22

33
from __future__ import annotations
44

5+
import logging
6+
57
import langextract as lx
68

79
from structflo.ner._entities import NERResult
810
from structflo.ner._mapping import annotated_doc_to_result
911
from structflo.ner.profiles import FULL, EntityProfile
1012

13+
logger = logging.getLogger(__name__)
14+
1115

1216
class NERExtractor:
1317
"""Extract drug discovery entities from text with zero configuration.
@@ -109,8 +113,54 @@ def _build_examples(self, profile: EntityProfile) -> list[lx.data.ExampleData]:
109113
return profile.examples + self._extra_examples
110114

111115
def _build_prompt(self, profile: EntityProfile) -> str:
112-
"""Return the prompt string for the given profile."""
113-
return profile.prompt
116+
"""Return the prompt string for the given profile.
117+
118+
Appends an explicit schema constraint listing the allowed
119+
entity classes. This is critical for models that don't support
120+
structured-output schemas (e.g. Ollama) where
121+
``use_schema_constraints`` has no effect.
122+
"""
123+
classes = ", ".join(profile.entity_classes)
124+
constraint = (
125+
f"\n\nIMPORTANT — You MUST classify every extraction using "
126+
f"ONLY these entity classes: [{classes}]. "
127+
f"Do NOT invent new class names. Any extraction_class not in "
128+
f"this list is an error."
129+
)
130+
return profile.prompt + constraint
131+
132+
@staticmethod
def _filter_extractions(
    doc: lx.data.AnnotatedDocument,
    allowed_classes: set[str],
) -> lx.data.AnnotatedDocument:
    """Return a new document keeping only extractions of an allowed class.

    Models without schema-constraint support (e.g. Ollama) may invent
    arbitrary entity classes. Every extraction whose ``extraction_class``
    is not in *allowed_classes* is dropped and reported with a warning.

    Args:
        doc: Annotated document to filter. ``doc.extractions`` may be
            ``None`` (nothing extracted); that is treated as empty.
        allowed_classes: Entity class names permitted by the profile.

    Returns:
        A new ``AnnotatedDocument`` with the same ``document_id`` and
        ``text`` as *doc* and only the allowed extractions, in their
        original order.
    """
    kept: list[lx.data.Extraction] = []
    # Rendered once, on the first rejected extraction, instead of being
    # re-sorted for every warning.
    allowed_display = None
    # ``extractions`` is optional on AnnotatedDocument; guard against None
    # so an empty model response does not raise TypeError here.
    for ext in doc.extractions or ():
        if ext.extraction_class in allowed_classes:
            kept.append(ext)
            continue
        if allowed_display is None:
            allowed_display = ", ".join(sorted(allowed_classes))
        logger.warning(
            "Dropping extraction with unknown class %r (text=%r). "
            "Allowed classes: %s",
            ext.extraction_class,
            ext.extraction_text,
            allowed_display,
        )
    return lx.data.AnnotatedDocument(
        # Carry the id over so the filtered copy still identifies the
        # same source document.
        document_id=doc.document_id,
        text=doc.text,
        extractions=kept,
    )
159+
160+
@property
161+
def _is_ollama(self) -> bool:
162+
"""Return True when routing to an Ollama endpoint."""
163+
return self._model_url is not None
114164

115165
def _run_extraction(
116166
self,
@@ -125,6 +175,13 @@ def _run_extraction(
125175
kwargs.setdefault("use_schema_constraints", True)
126176
kwargs.setdefault("show_progress", False)
127177

178+
# Ollama defaults to num_ctx=2048 which is far too small for
179+
# few-shot NER prompts. Set a sane default so users don't hit
180+
# silent truncation.
181+
if self._is_ollama:
182+
lm_params = kwargs.setdefault("language_model_params", {})
183+
lm_params.setdefault("num_ctx", 8192)
184+
128185
result = lx.extract(
129186
text_or_documents=text,
130187
prompt_description=prompt,
@@ -136,6 +193,8 @@ def _run_extraction(
136193
)
137194

138195
# lx.extract returns a list when given a list; we always pass a single string
139-
if isinstance(result, list):
140-
return result[0]
141-
return result
196+
doc = result[0] if isinstance(result, list) else result
197+
198+
# Post-process: drop extractions with hallucinated entity classes
199+
allowed = set(profile.entity_classes)
200+
return self._filter_extractions(doc, allowed)

tests/test_extractor.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,54 @@ def test_source_text_preserved_in_result(self):
122122
assert result.source_text == "My source text"
123123

124124

125+
class TestBuildPrompt:
    """Checks on the entity-class constraint appended by ``_build_prompt``."""

    @staticmethod
    def _constraint_suffix(profile):
        """Return only the text ``_build_prompt`` adds after ``profile.prompt``.

        Class names such as ``target`` can already occur in the base prompt,
        so searching the whole prompt would not prove the appended constraint
        lists them. Isolating the suffix makes the assertions meaningful.
        """
        prompt = NERExtractor()._build_prompt(profile)
        # The profile's own prompt must be kept verbatim as the prefix.
        assert prompt.startswith(profile.prompt)
        return prompt[len(profile.prompt):]

    def test_prompt_includes_entity_class_constraint(self):
        suffix = self._constraint_suffix(CHEMISTRY)
        assert "ONLY these entity classes" in suffix
        for cls in CHEMISTRY.entity_classes:
            assert cls in suffix

    def test_prompt_includes_all_tb_classes(self):
        suffix = self._constraint_suffix(TB)
        for cls in TB.entity_classes:
            assert cls in suffix
138+
139+
140+
class TestFilterExtractions:
    """Checks that ``_filter_extractions`` keeps only allowed entity classes."""

    def test_keeps_valid_extractions(self):
        doc = _make_annotated_doc(
            [
                lx.data.Extraction(extraction_class="compound_name", extraction_text="Aspirin"),
                lx.data.Extraction(extraction_class="target", extraction_text="COX-2"),
            ]
        )
        filtered = NERExtractor._filter_extractions(
            doc, {"compound_name", "target"}
        )
        # Pin identity and order of the survivors, not just how many there are.
        assert [e.extraction_class for e in filtered.extractions] == [
            "compound_name",
            "target",
        ]
        assert [e.extraction_text for e in filtered.extractions] == [
            "Aspirin",
            "COX-2",
        ]
        # The source text must be carried over to the filtered document.
        assert filtered.text == doc.text

    def test_drops_hallucinated_classes(self):
        doc = _make_annotated_doc(
            [
                lx.data.Extraction(extraction_class="compound_name", extraction_text="Aspirin"),
                lx.data.Extraction(extraction_class="enzyme", extraction_text="COX-2"),
                lx.data.Extraction(extraction_class="toxicity", extraction_text="hepatotoxic"),
            ]
        )
        filtered = NERExtractor._filter_extractions(
            doc, {"compound_name", "target"}
        )
        assert len(filtered.extractions) == 1
        assert filtered.extractions[0].extraction_class == "compound_name"
        assert filtered.extractions[0].extraction_text == "Aspirin"
        assert filtered.text == doc.text

    def test_empty_extractions(self):
        doc = _make_annotated_doc([])
        filtered = NERExtractor._filter_extractions(doc, {"compound_name"})
        assert len(filtered.extractions) == 0
171+
172+
125173
class TestBuildExamples:
126174
def test_extra_examples_appended(self):
127175
extra = lx.data.ExampleData(text="extra", extractions=[])

0 commit comments

Comments
 (0)