From e9c0093dfb97568d215803b90e8cb0b74059419e Mon Sep 17 00:00:00 2001
From: Marius Lange <marius.lange@bsse.ethz.ch>
Date: Mon, 17 Mar 2025 16:16:33 +0100
Subject: [PATCH 1/6] Update prompts to be a dataclass

---
 src/cell_annotator/_prompts.py         | 286 ++++++++++++++-----------
 src/cell_annotator/base_annotator.py   |   3 +-
 src/cell_annotator/cell_annotator.py   |  32 +--
 src/cell_annotator/sample_annotator.py |   8 +-
 4 files changed, 178 insertions(+), 151 deletions(-)

diff --git a/src/cell_annotator/_prompts.py b/src/cell_annotator/_prompts.py
index 1f46770..e335148 100644
--- a/src/cell_annotator/_prompts.py
+++ b/src/cell_annotator/_prompts.py
@@ -1,125 +1,167 @@
-"""Prompts for scRNA-seq cell annotation tasks."""
+from dataclasses import dataclass
 
+from cell_annotator._constants import PromptExamples
 
-class Prompts:
-    """Prompts for scRNA-seq cell annotation tasks.
-
-    These were in parts heavily inspired by https://github.com/VPetukhov/GPTCellAnnotator.
-    """
-
-    CELL_TYPE_PROMPT = (
-        "Provide me a comprehensive list of cell types that are expected in {species} {tissue} at stage `{stage}`."
-    )
-    CELL_TYPE_MARKER_PROMPT = "Now, for each cell type specify a list of {n_markers} marker genes that are specific to it. Make sure that you provided markers for **each** cell type you mentioned above."
-
-    ANNOTATION_PROMPT = """
-    You need to annotate a {species} {tissue} dataset at stage `{stage}`. You found gene markers for each cluster and need to determine clusters' identities.
-    Below is a short list of markers for each cluster:
-
-    {actual_markers_all}
-
-    We expect the following cell types (with associated marker genes) to be present in the dataset, however, some might be absent, and additionall cell types may also be present:
-    {expected_markers}
-
-    Determine cell type and state for cluster {cluster_id} (markers {actual_markers_cluster})
-
-    ### Output format:
-    ```
-    - marker_gene_description: description of what the markers mean. Example: markers A,B,C are associated with X, while D is related to Y
-    - cell_type: name of this cell type. If unknown, use 'Unknown'.
-    - cell_state: cell state if there is anything specific, 'Normal' otherwise
-    - annotation_confidence: one of 'Low', 'Medium', 'High'. If unknown, use 'Unknown'.
-    - reason_for_confidence_estimate: reason for the confidence estimate
-    ```
-    """.strip()
-
-    ORDER_PROMPT = """
-    You are tasked with reordering cell type annotations for global consistency. The goal is to align the order of labels based on biological relationships and similarities.
-
-    Here are the rules you must follow:
-    1. You must include all original elements in the reordered list.
-    2. You cannot change the names of any labels. Use the labels exactly as they are provided. Keep all duplicates.
-    3. The order must be biologically meaningful.
-
-    Biologically meaningful order means grouping similar cell types together based on their lineage, function, or tissue origin. For example, immune cells should be grouped together, and different types of muscle cells should be grouped together.
-
-    Below are the current cell type annotations, in their original, random order:
-    {unique_cell_types}
-
-    Reorder these labels into a biologically meaningful order.
-
-    ### Example:
-    If you are provided with the following list of cell type labels:
-    {example_unordered}
-
-    A possible re-ordering could be:
-    {example_ordered}
-
-    ### Output format:
-    Provide the reordered cell type annotations as a list with no additional commnets.
-    ```
-    """.strip()
-
-    DUPLICATE_REMOVAL_PROMPT = """
-    You need to remove duplicates from a list of cell type annotations. The same cell type might be included multiple times in the list, for example, with different capitalization, abbreviations, or synonyms. Thus, by duplicates, we mean the same, or an extremely simlar cell type, being represented by two or more elements in the list. The goal is to ensure that each cell type is only represented once in the list. Below are the current cell type annotations, which may contain duplicates:
 
-    {list_with_duplicates}
-
-    Remove any such duplicates from the list.
-
-    ### Example:
-    If you are provided with the following list of cell type labels:
-    ["Natural killer cells", "natural killer cells", "NK cells", "Natural killer cells (NK cells) "T cells", "T-cells", "B cells", "B-cells"]
-
-    A possible de-duplicated list could be:
-    ["Natural killer cells (NK cells)", "T cells", "B cells"]
-
-    ### Output format:
-    Provide the updated cell type annotations as a list with no additional comments.
-    """.strip()
-
-    MAPPING_PROMPT = """
-    You're given two lists of cell type annotations. The first list, called 'cell_types_local', contains the cell type annotations from a local dataset. The second list, called 'cell_types_global', contains a unique set of cell type annotations that are used globally. Your task is to map the local cell type annotations to the global cell type annotations. Here are both lists:
-
-    Local cell type annotations:
-    {local_cell_type_list}
-
-    Global cell type annotations:
-    {global_cell_type_list}
-
-    Now, map the following item from the local list to the global list: {current_cell_type}.
-
-    Follow these rules:
-    1. Map the local cell type annotation to the global cell type annotation that best represents it.
-    2. Do not modify the corresponding entry from the global cell type annotation list in any way. Use the labels exactly as they are provided.
-    """.strip()
-
-    COLOR_PROMPT = """
-    You need to assign meaningful colors to the following cell type labels:
-
-    {cluster_names}
-
-    Follow these rules:
-    1. Use colors that are biologically meaningful: similar cell types should have similar colors (e.g., shades of the same color family, which are still easy to distinguish by eye), and unrelated cell types should have distinct colors.
-    3. Use hexadecimal color codes (e.g., "#1f77b4").
-    4. Do not use white, black, or grey colors.
-    5. Do not modify the order of the cell type labels.
-    6. Include all labels in the color assignmeent, and do not modify them in any way.
-
-    ### Example:
-    If the cell type annotations are:
-        {example_cell_types}
-    A possible color assignment could be:
-        {example_color_assignment}
-
-    ### Output format:
-    For each cell type, provide output in the following format:
-    ```
-    - original_cell_type_label: the original cell type label
-    - assigned_color: the color assigned to this cell type
-    ```
-    """.strip()
-
-    AGENT_DESCRIPTION = (
-        "You're an expert bioinformatician, proficient in scRNA-seq analysis with background in {species} cell biology."
-    )
+@dataclass
+class Prompts:
+    """Prompt templates for scRNA-seq cell annotation, bound to one species/tissue/stage context."""
+
+    species: str = ""
+    tissue: str = ""
+    stage: str = ""
+
+    def get_cell_type_prompt(self) -> str:
+        """Return the prompt asking for the cell types expected in this species, tissue and stage."""
+        return f"Provide me a comprehensive list of cell types that are expected in {self.species} {self.tissue} at stage `{self.stage}`."
+
+    def get_cell_type_marker_prompt(self, n_markers: int) -> str:
+        """Return the follow-up prompt asking for `n_markers` marker genes per previously listed cell type."""
+        return f"Now, for each cell type specify a list of {n_markers} marker genes that are specific to it. Make sure that you provided markers for **each** cell type you mentioned above."
+
+    def get_annotation_prompt(
+        self,
+        actual_markers_all: str,
+        expected_markers: str,
+        cluster_id: str,
+        actual_markers_cluster: str,
+        restrict_to_expected: bool = False,
+    ) -> str:
+        """Return the per-cluster annotation prompt; `restrict_to_expected` limits labels to the expected types."""
+        restriction_clause = (
+            "You must only use labels from the expected cell types listed above."
+            if restrict_to_expected
+            else "Some expected cell types might be absent, and additional cell types may also be present."
+        )
+
+        return f"""
+        You need to annotate a {self.species} {self.tissue} dataset at stage `{self.stage}`. You found gene markers for each cluster and need to determine clusters' identities.
+        Below is a short list of markers for each cluster:
+
+        {actual_markers_all}
+
+        We expect the following cell types (with associated marker genes) to be present in the dataset:
+        {expected_markers}
+
+        {restriction_clause}
+
+        Determine cell type and state for cluster {cluster_id} (markers {actual_markers_cluster})
+
+        ### Output format:
+        ```
+        - marker_gene_description: description of what the markers mean. Example: markers A,B,C are associated with X, while D is related to Y
+        - cell_type: name of this cell type. If unknown, use 'Unknown'.
+        - cell_state: cell state if there is anything specific, 'Normal' otherwise
+        - annotation_confidence: one of 'Low', 'Medium', 'High'. If unknown, use 'Unknown'.
+        - reason_for_confidence_estimate: reason for the confidence estimate
+        ```
+        """.strip()
+
+    def get_order_prompt(
+        self,
+        unique_cell_types: str,
+        example_unordered: str | None = None,
+        example_ordered: str | None = None,
+    ) -> str:
+        """Return the label re-ordering prompt; omitted examples fall back to the ones in `PromptExamples`."""
+        if example_unordered is None:
+            example_unordered = ", ".join(PromptExamples.unordered_cell_types)
+        if example_ordered is None:
+            example_ordered = ", ".join(PromptExamples.ordered_cell_types)
+
+        return f"""
+        You are tasked with reordering cell type annotations for global consistency. The goal is to align the order of labels based on biological relationships and similarities.
+
+        Here are the rules you must follow:
+        1. You must include all original elements in the reordered list.
+        2. You cannot change the names of any labels. Use the labels exactly as they are provided. Keep all duplicates.
+        3. The order must be biologically meaningful.
+
+        Biologically meaningful order means grouping similar cell types together based on their lineage, function, or tissue origin. For example, immune cells should be grouped together, and different types of muscle cells should be grouped together.
+
+        Below are the current cell type annotations, in their original, random order:
+        {unique_cell_types}
+
+        Reorder these labels into a biologically meaningful order.
+
+        ### Example:
+        If you are provided with the following list of cell type labels:
+        {example_unordered}
+
+        A possible re-ordering could be:
+        {example_ordered}
+
+        ### Output format:
+        Provide the reordered cell type annotations as a list with no additional comments.
+        """.strip()
+
+    def get_duplicate_removal_prompt(self, list_with_duplicates: str) -> str:
+        """Return the prompt asking to de-duplicate the given list of cell type labels."""
+        return f"""
+        You need to remove duplicates from a list of cell type annotations. The same cell type might be included multiple times in the list, for example, with different capitalization, abbreviations, or synonyms. Thus, by duplicates, we mean the same, or an extremely similar cell type, being represented by two or more elements in the list. The goal is to ensure that each cell type is only represented once in the list. Below are the current cell type annotations, which may contain duplicates:
+
+        {list_with_duplicates}
+
+        Remove any such duplicates from the list.
+
+        ### Output format:
+        Provide the updated cell type annotations as a list with no additional comments.
+        """.strip()
+
+    def get_mapping_prompt(self, local_cell_type_list: str, global_cell_type_list: str, current_cell_type: str) -> str:
+        """Return the prompt mapping `current_cell_type` from the local onto the global label list."""
+        return f"""
+        You're given two lists of cell type annotations. The first list, called 'cell_types_local', contains the cell type annotations from a local dataset. The second list, called 'cell_types_global', contains a unique set of cell type annotations that are used globally. Your task is to map the local cell type annotations to the global cell type annotations. Here are both lists:
+
+        Local cell type annotations:
+        {local_cell_type_list}
+
+        Global cell type annotations:
+        {global_cell_type_list}
+
+        Now, map the following item from the local list to the global list: {current_cell_type}.
+
+        Follow these rules:
+        1. Map the local cell type annotation to the global cell type annotation that best represents it.
+        2. Do not modify the corresponding entry from the global cell type annotation list in any way. Use the labels exactly as they are provided.
+        """.strip()
+
+    def get_color_prompt(
+        self, cluster_names: str, example_cell_types: str | None = None, example_color_assignment: str | None = None
+    ) -> str:
+        """Return the color assignment prompt; omitted examples fall back to `PromptExamples.color_mapping_example`."""
+        if example_cell_types is None:
+            example_cell_types = ", ".join(PromptExamples.color_mapping_example.keys())
+        if example_color_assignment is None:
+            example_color_assignment = "; ".join(
+                f"{key}: {value}" for key, value in PromptExamples.color_mapping_example.items()
+            )
+        return f"""
+        You need to assign meaningful colors to the following cell type labels:
+
+        {cluster_names}
+
+        Follow these rules:
+        1. Use colors that are biologically meaningful: similar cell types should have similar colors (e.g., shades of the same color family, which are still easy to distinguish by eye), and unrelated cell types should have distinct colors.
+        2. Use hexadecimal color codes (e.g., "#1f77b4").
+        3. Do not use white, black, or grey colors.
+        4. Do not modify the order of the cell type labels.
+        5. Include all labels in the color assignment, and do not modify them in any way.
+
+        ### Example:
+        If the cell type annotations are:
+            {example_cell_types}
+        A possible color assignment could be:
+            {example_color_assignment}
+
+        ### Output format:
+        For each cell type, provide output in the following format:
+        ```
+        - original_cell_type_label: the original cell type label
+        - assigned_color: the color assigned to this cell type
+        ```
+        """.strip()
+
+    def get_agent_description(self) -> str:
+        """Return the agent description, specialised to this instance's species."""
+        return f"You're an expert bioinformatician, proficient in scRNA-seq analysis with background in {self.species} cell biology."
diff --git a/src/cell_annotator/base_annotator.py b/src/cell_annotator/base_annotator.py
index 47bc256..af521cc 100644
--- a/src/cell_annotator/base_annotator.py
+++ b/src/cell_annotator/base_annotator.py
@@ -41,6 +41,7 @@ def __init__(
         self.cluster_key = cluster_key
         self.model = model
         self.max_completion_tokens = max_completion_tokens
+        self.prompts = Prompts(species=species, tissue=tissue, stage=stage)
 
     def query_openai(
         self,
@@ -64,7 +65,7 @@ def query_openai(
         -------
         Parsed response.
         """
-        agent_description = Prompts.AGENT_DESCRIPTION.format(species=self.species)
+        agent_description = self.prompts.get_agent_description()
 
         response = _query_openai(
             agent_description=agent_description,
diff --git a/src/cell_annotator/cell_annotator.py b/src/cell_annotator/cell_annotator.py
index aeda423..4baf491 100644
--- a/src/cell_annotator/cell_annotator.py
+++ b/src/cell_annotator/cell_annotator.py
@@ -7,9 +7,8 @@
 from scanpy.tools._rank_genes_groups import _Method
 from tqdm.auto import tqdm
 
-from cell_annotator._constants import PackageConstants, PromptExamples
+from cell_annotator._constants import PackageConstants
 from cell_annotator._logging import logger
-from cell_annotator._prompts import Prompts
 from cell_annotator._response_formats import CellTypeColorOutput, CellTypeListOutput, ExpectedMarkerGeneOutput
 from cell_annotator.base_annotator import BaseAnnotator
 from cell_annotator.sample_annotator import SampleAnnotator
@@ -133,11 +132,9 @@ def get_expected_cell_type_markers(self, n_markers: int = 5) -> None:
         - `self.expected_cell_types`
         - `self.expected_marker_genes`
         """
-        cell_type_prompt = Prompts.CELL_TYPE_PROMPT.format(species=self.species, tissue=self.tissue, stage=self.stage)
-
         logger.info("Querying cell types.")
         res_types = self.query_openai(
-            instruction=cell_type_prompt,
+            instruction=self.prompts.get_cell_type_prompt(),
             response_format=CellTypeListOutput,
         )
 
@@ -146,12 +143,12 @@ def get_expected_cell_type_markers(self, n_markers: int = 5) -> None:
 
         marker_gene_prompt = [
             {"role": "assistant", "content": "; ".join(self.expected_cell_types) if self.expected_cell_types else ""},
-            {"role": "user", "content": Prompts.CELL_TYPE_MARKER_PROMPT.format(n_markers=n_markers)},
+            {"role": "user", "content": self.prompts.get_cell_type_marker_prompt(n_markers)},
         ]
 
         logger.info("Querying cell type markers.")
         res_markers = self.query_openai(
-            instruction=cell_type_prompt,
+            instruction=self.prompts.get_cell_type_prompt(),
             other_messages=marker_gene_prompt,
             response_format=ExpectedMarkerGeneOutput,
         )
@@ -297,7 +294,7 @@ def _harmonize_annotations(self, unknown_key: str = PackageConstants.unknown_nam
             categories = annotator.annotation_df["cell_type"].unique()
             cell_types.update(cat for cat in categories if cat != unknown_key)
 
-        deduplication_prompt = Prompts.DUPLICATE_REMOVAL_PROMPT.format(list_with_duplicates=", ".join(cell_types))
+        deduplication_prompt = self.prompts.get_duplicate_removal_prompt(list_with_duplicates=", ".join(cell_types))
 
         # query openai
         logger.info("Querying cell-type label de-duplication.")
@@ -350,7 +347,7 @@ def reorder_and_color_clusters(
         if assign_colors:
             global_names_and_colors = self._get_cluster_colors(clusters=cell_type_list, unknown_key=unknown_key)
         else:
-            global_names_and_colors = {cell_type: "" for cell_type in cell_type_list}
+            global_names_and_colors = dict.fromkeys(cell_type_list, "")
 
         label_sets = _get_consistent_ordering(self.adata, global_names_and_colors, keys)
 
@@ -386,11 +383,7 @@ def _get_cluster_ordering(self, keys: list[str], unknown_key: str = PackageConst
         """
         # format the current annotations sets as a string and prepare the query prompt
         unique_cell_types = _get_unique_cell_types(self.adata, keys, unknown_key)
-        order_prompt = Prompts.ORDER_PROMPT.format(
-            unique_cell_types=", ".join(unique_cell_types),
-            example_unordered=PromptExamples.unordered_cell_types,
-            example_ordered=PromptExamples.ordered_cell_types,
-        )
+        order_prompt = self.prompts.get_order_prompt(unique_cell_types=", ".join(unique_cell_types))
 
         # query openai and format the response as a dict
         logger.info("Querying label ordering.")
@@ -428,16 +421,11 @@ def _get_cluster_colors(
             raise ValueError(f"Invalid type for 'clusters': {type(clusters)}")
 
         cluster_names = ", ".join(cl for cl in cluster_list if cl != unknown_key)
-        color_prompt = Prompts.COLOR_PROMPT.format(
-            cluster_names=cluster_names,
-            example_cell_types=", ".join(PromptExamples.color_mapping_example.keys()),
-            example_color_assignment="; ".join(
-                f"{key}: {value}" for key, value in PromptExamples.color_mapping_example.items()
-            ),
-        )
 
         logger.info("Querying cluster colors.")
-        response = self.query_openai(instruction=color_prompt, response_format=CellTypeColorOutput)
+        response = self.query_openai(
+            instruction=self.prompts.get_color_prompt(cluster_names), response_format=CellTypeColorOutput
+        )
         color_dict = {
             item.original_cell_type_label: item.assigned_color for item in response.cell_type_to_color_mapping
         }
diff --git a/src/cell_annotator/sample_annotator.py b/src/cell_annotator/sample_annotator.py
index d66ef06..2bf1970 100644
--- a/src/cell_annotator/sample_annotator.py
+++ b/src/cell_annotator/sample_annotator.py
@@ -8,7 +8,6 @@
 
 from cell_annotator._constants import PackageConstants
 from cell_annotator._logging import logger
-from cell_annotator._prompts import Prompts
 from cell_annotator._response_formats import BaseOutput, CellTypeMappingOutput, PredictedCellTypeOutput
 from cell_annotator.base_annotator import BaseAnnotator
 from cell_annotator.utils import _filter_by_category_size, _get_auc, _get_specificity, _try_sorting_dict_by_keys
@@ -219,10 +218,7 @@ def annotate_clusters(self, min_markers: int, expected_marker_genes: dict[str, l
                 actual_markers_cluster_string = ", ".join(actual_markers_cluster)
 
                 # fill in the annotation prompt
-                annotation_prompt = Prompts.ANNOTATION_PROMPT.format(
-                    species=self.species,
-                    tissue=self.tissue,
-                    stage=self.stage,
+                annotation_prompt = self.prompts.get_annotation_prompt(
                     actual_markers_all=actual_markers_all,
                     cluster_id=cluster,
                     actual_markers_cluster=actual_markers_cluster_string,
@@ -273,7 +269,7 @@ def harmonize_annotations(
 
         logger.debug("Iterating over clusters to map local annotations to global naming scheme.")
         for cat in local_cell_type_mapping.keys():
-            mapping_prompt = Prompts.MAPPING_PROMPT.format(
+            mapping_prompt = self.prompts.get_mapping_prompt(
                 global_cell_type_list=", ".join(global_cell_type_list),
                 local_cell_type_list=", ".join(local_cell_type_mapping.keys()),
                 current_cell_type=cat,

From f4ca9d2fc8c8cbf0d1c8897046666074ed6f3f3e Mon Sep 17 00:00:00 2001
From: Marius Lange <marius.lange@bsse.ethz.ch>
Date: Mon, 17 Mar 2025 16:25:56 +0100
Subject: [PATCH 2/6] Fix tests

---
 tests/test_base_annotator.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_base_annotator.py b/tests/test_base_annotator.py
index 77b6de0..6ef85c0 100644
--- a/tests/test_base_annotator.py
+++ b/tests/test_base_annotator.py
@@ -2,7 +2,6 @@
 
 import pytest
 
-from cell_annotator._prompts import Prompts
 from cell_annotator._response_formats import OutputForTesting
 from cell_annotator.base_annotator import BaseAnnotator
 
@@ -24,7 +23,7 @@ def test_query_openai(self, mock_query_openai, base_annotator):
         mock_response = OutputForTesting(parsed_response="parsed_response")
         mock_query_openai.return_value = mock_response
 
-        agent_description = Prompts.AGENT_DESCRIPTION.format(species="human")
+        agent_description = base_annotator.prompts.get_agent_description()
         response = base_annotator.query_openai(instruction="Test instruction", response_format=OutputForTesting)
 
         print("Agent Description:", agent_description)

From 310bc6f575e2f61f7a4419900ac8dcc51577e318 Mon Sep 17 00:00:00 2001
From: Marius Lange <marius.lange@bsse.ethz.ch>
Date: Mon, 17 Mar 2025 17:00:52 +0100
Subject: [PATCH 3/6] Introduce a restrict_to_expected parameter

---
 README.md                              | 12 ++++++------
 src/cell_annotator/cell_annotator.py   | 12 ++++++++++--
 src/cell_annotator/sample_annotator.py |  7 ++++++-
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index be3860a..4f457bd 100644
--- a/README.md
+++ b/README.md
@@ -12,15 +12,15 @@
 [badge-pre-commit]: https://results.pre-commit.ci/badge/github/quadbio/cell-annotator/main.svg
 [badge-pypi]: https://img.shields.io/pypi/v/cell-annotator.svg
 
-A tool to annotate cell types in scRNA-seq data based on marker genes using OpenAI models.
+CellAnnotator is an [scverse ecosystem package](https://scverse.org/packages/#ecosystem), designed to annotate cell types in scRNA-seq data based on marker genes using OpenAI models.
 
 ## Key features
 
-- Automatically annotate cells including type, state and confidence fields.
-- Generate consistent annotations across samples of your study.
-- Optionally infuse prior knowledge by providing information about your biological system.
-- Retrieve reliable results thanks to [OpenAI structured outputs](https://platform.openai.com/docs/guides/structured-outputs)
-- Use this tool to quickly generate pre-integration cell type labels to either score your integration quality (e.g. [scIB metrics](https://scib-metrics.readthedocs.io/en/stable/)) or to guide your integration effort (e.g. [scPoli](https://docs.scarches.org/en/latest/), [scANVI](https://docs.scvi-tools.org/en/stable/api/reference/scvi.model.SCANVI.html)).
+-   Automatically annotate cells including type, state and confidence fields.
+-   Generate consistent annotations across samples of your study.
+-   Optionally infuse prior knowledge by providing information about your biological system.
+-   Retrieve reliable results thanks to [OpenAI structured outputs](https://platform.openai.com/docs/guides/structured-outputs)
+-   Use this tool to quickly generate pre-integration cell type labels to either score your integration quality (e.g. [scIB metrics](https://scib-metrics.readthedocs.io/en/stable/)) or to guide your integration effort (e.g. [scPoli](https://docs.scarches.org/en/latest/), [scANVI](https://docs.scvi-tools.org/en/stable/api/reference/scvi.model.SCANVI.html)).
 
 Note that this package is based on output generated by large language models and might **sometimes make mistakes**. We use some safeguards, like anchoring the tool in a multi-step process, and using structured output predictions, but mistakes are still possible. We recommend using this tool as a first step in an annotation workflow to generate an initial, coarse set of annotations that must be further refined.
 
diff --git a/src/cell_annotator/cell_annotator.py b/src/cell_annotator/cell_annotator.py
index e16493d..c460645 100644
--- a/src/cell_annotator/cell_annotator.py
+++ b/src/cell_annotator/cell_annotator.py
@@ -208,7 +208,9 @@ def get_cluster_markers(
                 use_rapids=use_rapids,
             )
 
-    def annotate_clusters(self, min_markers: int = 2, key_added: str = "cell_type_predicted"):
+    def annotate_clusters(
+        self, min_markers: int = 2, key_added: str = "cell_type_predicted", restrict_to_expected: bool = False
+    ):
         """Annotate clusters based on marker genes.
 
         Parameters
@@ -217,6 +219,8 @@ def annotate_clusters(self, min_markers: int = 2, key_added: str = "cell_type_pr
             Minimal number of required marker genes per cluster.
         key_added
             Name of the key in .obs where updated annotations will be written
+        restrict_to_expected
+            If True, only use expected cell types for annotation.
 
         Returns
         -------
@@ -234,7 +238,11 @@ def annotate_clusters(self, min_markers: int = 2, key_added: str = "cell_type_pr
 
         logger.info("Iterating over samples to annotate clusters. ")
         for annotator in tqdm(self.sample_annotators.values()):
-            annotator.annotate_clusters(min_markers=min_markers, expected_marker_genes=self.expected_marker_genes)
+            annotator.annotate_clusters(
+                min_markers=min_markers,
+                restrict_to_expected=restrict_to_expected,
+                expected_marker_genes=self.expected_marker_genes,
+            )
 
         # set the annotated flag to True
         self.annotated = True
diff --git a/src/cell_annotator/sample_annotator.py b/src/cell_annotator/sample_annotator.py
index ee9df83..3406880 100644
--- a/src/cell_annotator/sample_annotator.py
+++ b/src/cell_annotator/sample_annotator.py
@@ -195,7 +195,9 @@ def _filter_cluster_markers(self, min_auc: float, max_markers: int) -> None:
 
         self.marker_genes = _try_sorting_dict_by_keys(marker_genes)
 
-    def annotate_clusters(self, min_markers: int, expected_marker_genes: dict[str, list[str]] | None) -> None:
+    def annotate_clusters(
+        self, min_markers: int, expected_marker_genes: dict[str, list[str]] | None, restrict_to_expected: bool = False
+    ) -> None:
         """Annotate clusters based on marker genes.
 
         Parameters
@@ -206,6 +208,8 @@ def annotate_clusters(self, min_markers: int, expected_marker_genes: dict[str, l
             Minimum number of requires marker genes per cluster.
         expected_marker_genes
             Expected marker genes per cell type.
+        restrict_to_expected
+            If True, only use expected cell types for annotation.
 
         Returns
         -------
@@ -248,6 +252,7 @@ def annotate_clusters(self, min_markers: int, expected_marker_genes: dict[str, l
                     cluster_id=cluster,
                     actual_markers_cluster=actual_markers_cluster_string,
                     expected_markers=expected_markers_string,
+                    restrict_to_expected=restrict_to_expected,
                 )
 
                 self.annotation_dict[cluster] = self.query_openai(

From 637166706691926e14b5d81e7db2150423771180 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 17 Mar 2025 16:01:23 +0000
Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 4f457bd..f4fe06e 100644
--- a/README.md
+++ b/README.md
@@ -16,11 +16,11 @@ CellAnnotator is an [scverse ecosystem package](https://scverse.org/packages/#ec
 
 ## Key features
 
--   Automatically annotate cells including type, state and confidence fields.
--   Generate consistent annotations across samples of your study.
--   Optionally infuse prior knowledge by providing information about your biological system.
--   Retrieve reliable results thanks to [OpenAI structured outputs](https://platform.openai.com/docs/guides/structured-outputs)
--   Use this tool to quickly generate pre-integration cell type labels to either score your integration quality (e.g. [scIB metrics](https://scib-metrics.readthedocs.io/en/stable/)) or to guide your integration effort (e.g. [scPoli](https://docs.scarches.org/en/latest/), [scANVI](https://docs.scvi-tools.org/en/stable/api/reference/scvi.model.SCANVI.html)).
+- Automatically annotate cells including type, state and confidence fields.
+- Generate consistent annotations across samples of your study.
+- Optionally infuse prior knowledge by providing information about your biological system.
+- Retrieve reliable results thanks to [OpenAI structured outputs](https://platform.openai.com/docs/guides/structured-outputs)
+- Use this tool to quickly generate pre-integration cell type labels to either score your integration quality (e.g. [scIB metrics](https://scib-metrics.readthedocs.io/en/stable/)) or to guide your integration effort (e.g. [scPoli](https://docs.scarches.org/en/latest/), [scANVI](https://docs.scvi-tools.org/en/stable/api/reference/scvi.model.SCANVI.html)).
 
 Note that this package is based on output generated by large language models and might **sometimes make mistakes**. We use some safeguards, like anchoring the tool in a multi-step process, and using structured output predictions, but mistakes are still possible. We recommend using this tool as a first step in an annotation workflow to generate an initial, coarse set of annotations that must be further refined.
 

From ba0f725dcd7e1b6cfe7cc38c782a31b052485c01 Mon Sep 17 00:00:00 2001
From: Marius Lange <marius.lange@bsse.ethz.ch>
Date: Mon, 17 Mar 2025 17:03:34 +0100
Subject: [PATCH 5/6] Update changelog

---
 CHANGELOG.md | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8a11175..19a7dc0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,43 +15,45 @@ and this project adheres to [Semantic Versioning][]. Full commit history is avai
 
 #### Added
 
-- Use `rapids_singlecell`, `cupy` and `cuml` to accelerate cluster marker computation on GPUs {pr}`37`.
+-   Use `rapids_singlecell`, `cupy` and `cuml` to accelerate cluster marker computation on GPUs {pr}`37`.
+-   Add the possibility to softly enforce adherence to expected cell types {pr}`42`.
 
 #### Changed
 
-- Run cluster label harmonization also for a single sample {pr}`37`.
+-   Run cluster label harmonization also for a single sample {pr}`37`.
+-   Re-format prompts into a dataclass {pr}`42`.
 
 #### Fixed
 
-- Fixed a bug with integer sample labels {pr}`37`.
+-   Fixed a bug with integer sample labels {pr}`37`.
 
 ### 0.1.3 (2025-02-07)
 
 #### Added
 
-- Added tests for the single-sample case {pr}`29`.
-- Refer to issues and PRs with sphinx {pr}`30`.
+-   Added tests for the single-sample case {pr}`29`.
+-   Refer to issues and PRs with sphinx {pr}`30`.
 
 #### Removed
 
-- Removed `tenacity` for query retries {pr}`28`.
+-   Removed `tenacity` for query retries {pr}`28`.
 
 #### Fixed
 
-- Fixed `_get_annotation_summary_string` for the single-sample case {pr}`29`.
-- Fixed the expected cell type marker test by adding additional marker genes {pr}`28`.
+-   Fixed `_get_annotation_summary_string` for the single-sample case {pr}`29`.
+-   Fixed the expected cell type marker test by adding additional marker genes {pr}`28`.
 
 ### 0.1.2 (2025-01-29)
 
 #### Added
 
-- Update the documentation, in particular the installation instructions.
+-   Update the documentation, in particular the installation instructions.
 
 ### 0.1.1 (2025-01-29)
 
 #### Added
 
-- Initial push to PyPI
+-   Initial push to PyPI
 
 ### 0.1.0 (2025-01-29)
 

From 3157016d1c14e89fe64d230aef71f124d50d9a20 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 17 Mar 2025 16:04:40 +0000
Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 CHANGELOG.md | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 19a7dc0..fbe0cbb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,45 +15,45 @@ and this project adheres to [Semantic Versioning][]. Full commit history is avai
 
 #### Added
 
--   Use `rapids_singlecell`, `cupy` and `cuml` to accelerate cluster marker computation on GPUs {pr}`37`.
--   Add the possibility to softly enforce adherence to expected cell types {pr}`42`.
+- Use `rapids_singlecell`, `cupy` and `cuml` to accelerate cluster marker computation on GPUs {pr}`37`.
+- Add the possibility to softly enforce adherence to expected cell types {pr}`42`.
 
 #### Changed
 
--   Run cluster label harmonization also for a single sample {pr}`37`.
--   Re-format prompts into a dataclass {pr}`42`.
+- Run cluster label harmonization also for a single sample {pr}`37`.
+- Re-format prompts into a dataclass {pr}`42`.
 
 #### Fixed
 
--   Fixed a bug with integer sample labels {pr}`37`.
+- Fixed a bug with integer sample labels {pr}`37`.
 
 ### 0.1.3 (2025-02-07)
 
 #### Added
 
--   Added tests for the single-sample case {pr}`29`.
--   Refer to issues and PRs with sphinx {pr}`30`.
+- Added tests for the single-sample case {pr}`29`.
+- Refer to issues and PRs with sphinx {pr}`30`.
 
 #### Removed
 
--   Removed `tenacity` for query retries {pr}`28`.
+- Removed `tenacity` for query retries {pr}`28`.
 
 #### Fixed
 
--   Fixed `_get_annotation_summary_string` for the single-sample case {pr}`29`.
--   Fixed the expected cell type marker test by adding additional marker genes {pr}`28`.
+- Fixed `_get_annotation_summary_string` for the single-sample case {pr}`29`.
+- Fixed the expected cell type marker test by adding additional marker genes {pr}`28`.
 
 ### 0.1.2 (2025-01-29)
 
 #### Added
 
--   Update the documentation, in particular the installation instructions.
+- Update the documentation, in particular the installation instructions.
 
 ### 0.1.1 (2025-01-29)
 
 #### Added
 
--   Initial push to PyPI
+- Initial push to PyPI.
 
 ### 0.1.0 (2025-01-29)