prompt optimizations

sidxz · sidxz · commit 333b2cc2aace · 2026-02-14T23:39:23.000-06:00
diff --git a/structflo/ner/__init__.py b/structflo/ner/__init__.py
@@ -61,7 +61,7 @@
     EntityProfile,
 )
 
-__version__ = "0.2.0"
+__version__ = "0.2.1"
 
 __all__ = [
     # Main class
diff --git a/structflo/ner/_examples.py b/structflo/ner/_examples.py
@@ -606,16 +606,16 @@
             extraction_text="MIC of 3.1 uM",
             attributes={"value": "3.1", "unit": "uM", "assay_type": "MIC"},
         ),
-        lx.data.Extraction(
-            extraction_class="bioactivity",
-            extraction_text="CC50 >50 uM",
-            attributes={"value": ">50", "unit": "uM", "assay_type": "CC50"},
-        ),
         lx.data.Extraction(
             extraction_class="assay",
             extraction_text="HepG2 cells",
             attributes={"cell_line": "HepG2", "assay_format": "cytotoxicity"},
         ),
+        lx.data.Extraction(
+            extraction_class="bioactivity",
+            extraction_text="CC50 >50 uM",
+            attributes={"value": ">50", "unit": "uM", "assay_type": "CC50"},
+        ),
     ],
 )
 
diff --git a/structflo/ner/_prompts.py b/structflo/ner/_prompts.py
@@ -51,83 +51,35 @@
 # ── Tuberculosis early drug discovery prompts ──────────────────────────
 
 TB_PROMPT = (
-    "Extract drug discovery entities from this tuberculosis (TB) research text. "
-    "This is specialized for early-stage TB drug discovery: target identification, "
-    "hit finding, fragment screening, lead optimization, and in vitro/in vivo profiling.\n\n"
-    "COMPOUNDS:\n"
-    "- First-line drugs: Isoniazid (INH), Rifampicin (RIF), Ethambutol (EMB), Pyrazinamide (PZA).\n"
-    "- New-generation: Bedaquiline (TMC207), Delamanid (OPC-67683), Pretomanid (PA-824), "
-    "Linezolid, Clofazimine, Moxifloxacin.\n"
-    "- Pipeline: BTZ043, PBTZ169 (Macozinone), SQ109, Q203 (Telacebec), TBA-7371, "
-    "GSK656, OPC-167832, SPR720, BRD-8000, Sanfetrinem, DG167, NITD-304, NITD-349.\n"
-    "- Extract compound names (generic, code names, series IDs like 'Compound 14a'), "
-    "SMILES (only if explicitly present), CAS numbers, and molecular formulas.\n\n"
-    "BIOLOGICAL TARGETS:\n"
-    "- Mycobacterial proteins are biological targets, NOT compounds. Examples: "
-    "ClpC1, DprE1, InhA, MmpL3, AtpE, QcrB, Pks13, KasA, GyrA, GyrB, MbtA, "
-    "EthA, PanC, LdtMt2, RpoB, PncA, EmbB, Ag85.\n"
-    "- Use 'target' for proteins with drug-targeting context. "
-    "Use 'gene_name' for gene loci and gene symbols. "
-    "Use 'protein_name' for proteins without drug-targeting context.\n\n"
-    "ACCESSION NUMBERS:\n"
-    "- Rv locus tags (Rv3596c, Rv3790, Rv1484, Rv0206c, Rv1305, etc.), "
-    "UniProt accessions (P9WPS1, P9WGR1, etc.), PDB codes (6CQ4, 5V3Y, etc.).\n"
-    "- Extract each accession as a separate entity.\n\n"
-    "PRODUCTS:\n"
-    "- Gene product descriptions from databases: enzyme names, protein function descriptions "
-    "(e.g., 'enoyl-ACP reductase', 'ATP synthase subunit c', "
-    "'decaprenylphosphoryl-beta-D-ribose 2-epimerase').\n\n"
-    "FUNCTIONAL CATEGORIES:\n"
-    "- Mycobacterial protein functional categories: intermediary metabolism and respiration, "
-    "cell wall and cell processes, virulence/detoxification/adaptation, "
-    "lipid metabolism, information pathways, regulatory proteins, "
-    "PE/PPE family, conserved hypotheticals.\n\n"
-    "SCREENING METHODS:\n"
-    "- Early drug discovery screening approaches: affinity-based screening, "
-    "biochemical assay, DNA encoded library (DEL), fragment screening, "
-    "hypomorph screening, whole-cell phenotypic screening, "
-    "target-based HTS, virtual screening, SPR-based screening.\n\n"
-    "DISEASES:\n"
-    "- Tuberculosis variants: TB, MDR-TB, XDR-TB, pre-XDR-TB, TDR-TB, "
-    "LTBI, active TB, pulmonary TB, extrapulmonary TB, TB meningitis, miliary TB.\n"
-    "- Capture both full names and abbreviations as separate entities.\n\n"
-    "BIOACTIVITY:\n"
-    "- MIC (against H37Rv, Erdman, CDC1551, clinical isolates, MDR/XDR strains), "
-    "MIC90, MBC, IC50, EC50, Ki. "
-    "Capture numeric value, unit (ug/mL, uM, nM), measurement type, and strain context.\n\n"
-    "ASSAYS:\n"
-    "- MABA, LORA, REMA, macrophage infection (THP-1, J774, RAW264.7), "
-    "time-kill kinetics, checkerboard synergy, mouse acute/chronic infection models, "
-    "guinea pig aerosol model.\n\n"
-    "MECHANISMS OF ACTION:\n"
-    "- Mycolic acid biosynthesis inhibition, ATP synthase inhibition, "
-    "cell wall arabinan biosynthesis disruption, menaquinone biosynthesis inhibition, "
-    "trehalose monomycolate transport inhibition, covalent modification, "
-    "DNA gyrase inhibition, decaprenylphosphoryl-beta-D-ribose oxidation.\n\n"
+    "Extract drug discovery entities from this tuberculosis research text.\n\n"
+    "DISAMBIGUATION RULES:\n"
+    "- Mycobacterial proteins (e.g. ClpC1, DprE1, InhA, AtpE, MmpL3, QcrB) "
+    "are biological targets, NOT compounds.\n"
+    "- Rv locus tags (Rv3790, Rv1484), UniProt IDs (P9WPS1), and PDB codes "
+    "are accession_number, not target or gene_name.\n"
+    "- Enzyme descriptions like 'enoyl-ACP reductase' are product, not target.\n"
+    "- 'cell wall', 'lipid metabolism' are functional_category, not mechanism_of_action.\n"
+    "- 'fragment screening', 'biochemical assay' are screening_method, not assay.\n"
+    "- Use target for proteins in a drug-targeting context, gene_name for loci, "
+    "protein_name for non-drug-target proteins.\n\n"
     "Extract only what is explicitly stated; do not infer or generate values."
 )
 
 TB_CHEMISTRY_PROMPT = (
     "Extract chemical entities from this tuberculosis drug discovery text. "
-    "Include: compound names (generic names, IUPAC names, clinical codes like 'TMC207', "
-    "series identifiers like 'Compound 14a', brand names), "
-    "SMILES strings (only if explicitly written), CAS registry numbers, and molecular formulas. "
-    "TB compound naming conventions: first-line drugs (INH, RIF, EMB, PZA), "
-    "second-line drugs (Bedaquiline, Delamanid, Pretomanid), "
-    "pipeline compounds (BTZ043, PBTZ169, SQ109, Q203, TBA-7371, GSK656, DG167, NITD-304). "
-    "Capture all synonyms and code names as separate compound_name entities."
+    "Include compound names, SMILES (only if explicitly written), CAS numbers, "
+    "and molecular formulas. "
+    "Mycobacterial proteins (ClpC1, DprE1, InhA, AtpE, etc.) are NOT compounds. "
+    "Extract only what is explicitly stated; do not infer or generate values."
 )
 
 TB_BIOLOGY_PROMPT = (
-    "Extract biological target entities from this tuberculosis research text. "
-    "Focus on mycobacterial drug targets and their identifiers. "
-    "Key targets: DprE1, InhA, MmpL3, AtpE, ClpC1, ClpP1P2, QcrB, Pks13, KasA, "
-    "GyrA, GyrB, MbtA, EthA, PanC, LdtMt2, RpoB, PncA, EmbB, Ag85 complex. "
-    "These are biological targets, NOT compounds. "
-    "Use 'target' for proteins with drug-targeting context, 'gene_name' for gene loci "
-    "and gene symbols, 'protein_name' for other proteins. "
-    "Extract Rv locus tags and UniProt accessions as 'accession_number'. "
-    "Extract enzyme names and protein function descriptions as 'product'. "
-    "Extract functional categories (cell wall, lipid metabolism, virulence, etc.) "
-    "as 'functional_category'."
+    "Extract biological entities from this tuberculosis research text. "
+    "Use target for proteins in a drug-targeting context, gene_name for loci, "
+    "protein_name for non-drug-target proteins. "
+    "Rv locus tags and UniProt IDs are accession_number. "
+    "Enzyme descriptions (e.g. 'enoyl-ACP reductase') are product. "
+    "Protein functional categories (e.g. 'cell wall', 'lipid metabolism') "
+    "are functional_category. "
+    "Extract only what is explicitly stated; do not infer or generate values."
 )
diff --git a/structflo/ner/extractor.py b/structflo/ner/extractor.py
@@ -2,12 +2,16 @@
 
 from __future__ import annotations
 
+import logging
+
 import langextract as lx
 
 from structflo.ner._entities import NERResult
 from structflo.ner._mapping import annotated_doc_to_result
 from structflo.ner.profiles import FULL, EntityProfile
 
+logger = logging.getLogger(__name__)
+
 
 class NERExtractor:
     """Extract drug discovery entities from text with zero configuration.
@@ -109,8 +113,54 @@ def _build_examples(self, profile: EntityProfile) -> list[lx.data.ExampleData]:
         return profile.examples + self._extra_examples
 
     def _build_prompt(self, profile: EntityProfile) -> str:
-        """Return the prompt string for the given profile."""
-        return profile.prompt
+        """Return the prompt string for the given profile.
+
+        Appends an explicit schema constraint listing the allowed
+        entity classes.  This is critical for models that don't support
+        structured-output schemas (e.g. Ollama) where
+        ``use_schema_constraints`` has no effect.
+        """
+        classes = ", ".join(profile.entity_classes)
+        constraint = (
+            f"\n\nIMPORTANT — You MUST classify every extraction using "
+            f"ONLY these entity classes: [{classes}]. "
+            f"Do NOT invent new class names. Any extraction_class not in "
+            f"this list is an error."
+        )
+        return profile.prompt + constraint
+
+    @staticmethod
+    def _filter_extractions(
+        doc: lx.data.AnnotatedDocument,
+        allowed_classes: set[str],
+    ) -> lx.data.AnnotatedDocument:
+        """Drop extractions whose class is not in the profile's allowed set.
+
+        Models without schema-constraint support (e.g. Ollama) may invent
+        arbitrary entity classes.  This post-processing step removes those
+        hallucinated classes so they don't pollute the result.
+        """
+        kept: list[lx.data.Extraction] = []
+        for ext in doc.extractions:
+            if ext.extraction_class in allowed_classes:
+                kept.append(ext)
+            else:
+                logger.warning(
+                    "Dropping extraction with unknown class %r (text=%r). "
+                    "Allowed classes: %s",
+                    ext.extraction_class,
+                    ext.extraction_text,
+                    ", ".join(sorted(allowed_classes)),
+                )
+        return lx.data.AnnotatedDocument(
+            text=doc.text,
+            extractions=kept,
+        )
+
+    @property
+    def _is_ollama(self) -> bool:
+        """Return True when routing to an Ollama endpoint."""
+        return self._model_url is not None
 
     def _run_extraction(
         self,
@@ -125,6 +175,13 @@ def _run_extraction(
         kwargs.setdefault("use_schema_constraints", True)
         kwargs.setdefault("show_progress", False)
 
+        # Ollama defaults to num_ctx=2048 which is far too small for
+        # few-shot NER prompts.  Set a sane default so users don't hit
+        # silent truncation.
+        if self._is_ollama:
+            lm_params = kwargs.setdefault("language_model_params", {})
+            lm_params.setdefault("num_ctx", 8192)
+
         result = lx.extract(
             text_or_documents=text,
             prompt_description=prompt,
@@ -136,6 +193,8 @@ def _run_extraction(
         )
 
         # lx.extract returns a list when given a list; we always pass a single string
-        if isinstance(result, list):
-            return result[0]
-        return result
+        doc = result[0] if isinstance(result, list) else result
+
+        # Post-process: drop extractions with hallucinated entity classes
+        allowed = set(profile.entity_classes)
+        return self._filter_extractions(doc, allowed)
diff --git a/tests/test_extractor.py b/tests/test_extractor.py
@@ -122,6 +122,54 @@ def test_source_text_preserved_in_result(self):
         assert result.source_text == "My source text"
 
 
+class TestBuildPrompt:
+    def test_prompt_includes_entity_class_constraint(self):
+        extractor = NERExtractor()
+        prompt = extractor._build_prompt(CHEMISTRY)
+        assert "ONLY these entity classes" in prompt
+        for cls in CHEMISTRY.entity_classes:
+            assert cls in prompt
+
+    def test_prompt_includes_all_tb_classes(self):
+        extractor = NERExtractor()
+        prompt = extractor._build_prompt(TB)
+        for cls in TB.entity_classes:
+            assert cls in prompt
+
+
+class TestFilterExtractions:
+    def test_keeps_valid_extractions(self):
+        doc = _make_annotated_doc(
+            [
+                lx.data.Extraction(extraction_class="compound_name", extraction_text="Aspirin"),
+                lx.data.Extraction(extraction_class="target", extraction_text="COX-2"),
+            ]
+        )
+        filtered = NERExtractor._filter_extractions(
+            doc, {"compound_name", "target"}
+        )
+        assert len(filtered.extractions) == 2
+
+    def test_drops_hallucinated_classes(self):
+        doc = _make_annotated_doc(
+            [
+                lx.data.Extraction(extraction_class="compound_name", extraction_text="Aspirin"),
+                lx.data.Extraction(extraction_class="enzyme", extraction_text="COX-2"),
+                lx.data.Extraction(extraction_class="toxicity", extraction_text="hepatotoxic"),
+            ]
+        )
+        filtered = NERExtractor._filter_extractions(
+            doc, {"compound_name", "target"}
+        )
+        assert len(filtered.extractions) == 1
+        assert filtered.extractions[0].extraction_text == "Aspirin"
+
+    def test_empty_extractions(self):
+        doc = _make_annotated_doc([])
+        filtered = NERExtractor._filter_extractions(doc, {"compound_name"})
+        assert len(filtered.extractions) == 0
+
+
 class TestBuildExamples:
     def test_extra_examples_appended(self):
         extra = lx.data.ExampleData(text="extra", extractions=[])

| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | | `@@ -61,7 +61,7 @@` |
| 61 | 61 | `EntityProfile,` |
| 62 | 62 | `)` |
| 63 | 63 | |
| 64 | | `-__version__ = "0.2.0"` |
| | 64 | `+__version__ = "0.2.1"` |
| 65 | 65 | |
| 66 | 66 | `__all__ = [` |
| 67 | 67 | `# Main class` |