Add tests for the CellAnnotator class

Marius1311 · Marius1311 · commit 362817d0c124 · 2025-01-27T17:45:54.000+01:00
diff --git a/tests/test_base_annotator.py b/tests/test_base_annotator.py
@@ -10,9 +10,7 @@
 class TestBaseAnnotator:
     @pytest.fixture
     def base_annotator(self):
-        return BaseAnnotator(
-            species="human", tissue="brain", stage="adult", cluster_key="leiden", model="gpt-4o-mini", max_tokens=100
-        )
+        return BaseAnnotator(species="human", tissue="brain", stage="adult", cluster_key="leiden", model="gpt-4o-mini")
 
     @patch("cell_annotator.base_annotator.BaseAnnotator.query_openai")
     def test_query_openai(self, mock_query_openai, base_annotator):
diff --git a/tests/test_cell_annotator.py b/tests/test_cell_annotator.py
@@ -0,0 +1,80 @@
+import pytest
+
+from cell_annotator.cell_annotator import CellAnnotator
+
+from .utils import expected_marker_genes, fibroblast_cell_types, get_example_data, neuronal_cell_types
+
+
+class TestCellAnnotator:
+    @pytest.fixture
+    def cell_annotator(self):
+        adata = get_example_data(n_cells=200, n_samples=2)  # n_samples > 1 so that obs["sample"] exists
+
+        return CellAnnotator(
+            adata=adata,
+            species="human",
+            tissue="In vitro neurons and fibroblasts",
+            stage="adult",
+            cluster_key="leiden",
+            sample_key="sample",
+            model="gpt-4o-mini",
+        )
+
+    @pytest.mark.openai()  # NOTE(review): presumably selects tests that call the live OpenAI API - confirm
+    def test_get_expected_cell_type_markers(self, cell_annotator):
+        cell_annotator.get_expected_cell_type_markers()
+        expected_markers = cell_annotator.expected_marker_genes
+        print("Expected Markers:", expected_markers)
+
+        assert expected_markers is not None
+        assert isinstance(expected_markers, dict)
+
+        neuron_markers_found = any(  # some key contains a neuron synonym AND shares a gene with the reference list
+            any(neuron_synonym in key for neuron_synonym in neuronal_cell_types)
+            and set(expected_marker_genes["Neuron"]).intersection(expected_markers[key])
+            for key in expected_markers
+        )
+        fibroblast_markers_found = any(  # same check for the fibroblast synonyms and reference genes
+            any(fibroblast_synonym in key for fibroblast_synonym in fibroblast_cell_types)
+            and set(expected_marker_genes["Fibroblast"]).intersection(expected_markers[key])
+            for key in expected_markers
+        )
+
+        assert neuron_markers_found
+        assert fibroblast_markers_found
+
+    @pytest.mark.openai()
+    def test_annotate_clusters(self, cell_annotator):
+        # Step 1: compute cluster markers (min_auc=0.6) and check every sample has non-empty marker tables and lists
+        cell_annotator.get_cluster_markers(min_auc=0.6)
+
+        for sample in cell_annotator.sample_annotators.values():
+            assert sample.marker_gene_dfs is not None
+            assert sample.marker_genes is not None
+
+            for _cluster, df in sample.marker_gene_dfs.items():
+                assert not df.empty
+                assert "gene" in df.columns
+                assert "specificity" in df.columns
+                assert "auc" in df.columns
+
+            for _cluster, genes in sample.marker_genes.items():
+                assert len(genes) > 0
+
+        # Step 2: set the expected markers directly (get_expected_cell_type_markers is not called), annotate, check labels
+        cell_annotator.expected_marker_genes = expected_marker_genes
+        cell_annotator.annotate_clusters(min_markers=1)
+
+        for sample in cell_annotator.sample_annotators.values():
+            print("Sample Annotation:\n", sample.annotation_df[["n_cells", "cell_type"]])
+
+            neuron_annotation_found = any(  # cluster "0" is the neuron-enriched group of the example data
+                neuron_synonym in sample.annotation_dict["0"].cell_type for neuron_synonym in neuronal_cell_types
+            )
+            fibroblast_annotation_found = any(  # cluster "1" is the fibroblast-enriched group of the example data
+                fibroblast_synonym in sample.annotation_dict["1"].cell_type
+                for fibroblast_synonym in fibroblast_cell_types
+            )
+
+            assert neuron_annotation_found
+            assert fibroblast_annotation_found
diff --git a/tests/test_sample_annotator.py b/tests/test_sample_annotator.py
@@ -1,53 +1,12 @@
 from unittest.mock import patch
 
-import numpy as np
 import pandas as pd
 import pytest
-import scanpy as sc
-from anndata import AnnData
-from numpy.random import binomial, negative_binomial
 
 from cell_annotator._response_formats import CellTypeMappingOutput, PredictedCellTypeOutput
 from cell_annotator.sample_annotator import SampleAnnotator
 
-# Declare the dictionary of expected marker genes
-expected_marker_genes = {
-    "Neuron": ["MAP2", "NEFL", "RBFOX3", "SYN1", "GAP43", "DCX", "TUBB3", "NEUROD1", "STMN2", "ENO2"],
-    "Fibroblast": ["COL1A1", "COL3A1", "VIM", "ACTA2", "FAP", "PDGFRA", "THY1", "FN1", "SPARC", "S100A4"],
-}
-
-
-def get_example_data(n_cells: int = 100) -> AnnData:
-    """Create example data for testing. Adapted from scanpy.
-
-    The data consists of two clusters with different marker genes. The first cluster is enriched for neuronal markers and the second cluster is enriched for fibroblast markers."""
-    gene_names = expected_marker_genes["Neuron"] + expected_marker_genes["Fibroblast"]
-    n_genes = len(gene_names)
-    adata = AnnData(np.multiply(binomial(1, 0.15, (n_cells, n_genes)), negative_binomial(2, 0.25, (n_cells, n_genes))))
-    adata.var_names = gene_names
-
-    # Create marker genes for the two clusters
-    n_group_1 = np.floor(0.3 * n_cells).astype(int)
-    n_group_2 = n_cells - n_group_1
-    n_marker_genes = int(n_genes / 2)
-
-    adata.X[:n_group_1, :10] = np.multiply(
-        binomial(1, 0.9, (n_group_1, n_marker_genes)), negative_binomial(1, 0.5, (n_group_1, n_marker_genes))
-    )
-    adata.X[n_group_1:, 10:] = np.multiply(
-        binomial(1, 0.9, (n_group_2, n_marker_genes)), negative_binomial(1, 0.5, (n_group_2, n_marker_genes))
-    )
-
-    # Create cluster according to groups
-    adata.obs["leiden"] = pd.Categorical(np.concatenate((n_group_1 * ["0"], n_group_2 * ["1"])))
-
-    # filter, normalize and log transform the data
-    sc.pp.filter_cells(adata, min_counts=2)
-    adata.raw = adata.copy()
-    sc.pp.normalize_total(adata, target_sum=1e4)
-    sc.pp.log1p(adata)
-
-    return adata
+from .utils import expected_marker_genes, fibroblast_cell_types, get_example_data, neuronal_cell_types
 
 
 class TestSampleAnnotator:
@@ -57,9 +16,9 @@ def sample_annotator(self):
 
         return SampleAnnotator(
             adata=adata,
-            sample_name="sample1",
+            sample_name="sample_1",
             species="human",
-            tissue="brain",
+            tissue="In vitro neurons and fibroblasts",
             stage="adult",
             cluster_key="leiden",
             model="gpt-4o-mini",
@@ -112,5 +71,13 @@ def test_annotate_clusters_actual(self, sample_annotator):
         sample_annotator.get_cluster_markers(min_auc=0.6)
         sample_annotator.annotate_clusters(min_markers=1, expected_marker_genes=expected_marker_genes)
 
-        assert sample_annotator.annotation_dict["0"].cell_type == "Neuron"
-        assert sample_annotator.annotation_dict["1"].cell_type == "Fibroblast"
+        neuron_annotation_found = any(
+            neuron_synonym in sample_annotator.annotation_dict["0"].cell_type for neuron_synonym in neuronal_cell_types
+        )
+        fibroblast_annotation_found = any(
+            fibroblast_synonym in sample_annotator.annotation_dict["1"].cell_type
+            for fibroblast_synonym in fibroblast_cell_types
+        )
+
+        assert neuron_annotation_found
+        assert fibroblast_annotation_found
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -26,7 +26,7 @@ def setup_data():
     clust_mask = np.array([True, False, True, False])
 
     # Create raw count data with gene names
-    raw_counts = np.array([[1, 0], [0, 1], [1, 1], [0, 0]])
+    raw_counts = np.array([[1, 0], [0, 1], [1, 1], [4, 0]])
     adata = sc.AnnData(X=raw_counts, var=pd.DataFrame(index=genes))
     adata.raw = adata.copy()  # Set raw data
 
@@ -119,7 +119,7 @@ def test_query_openai(self, MockOpenAI):
         assert response.parsed_response == "parsed_response"
         mock_client.beta.chat.completions.parse.assert_called_once()
 
-    @pytest.mark.opanai()
+    @pytest.mark.openai()
     def test_query_openai_actual(self):
         response = _query_openai(
             agent_description="Test agent",
diff --git a/tests/utils.py b/tests/utils.py
@@ -0,0 +1,53 @@
+import numpy as np
+import pandas as pd
+import scanpy as sc
+from anndata import AnnData
+from numpy.random import binomial, negative_binomial
+
+# Reference marker genes per cell type; get_example_data also uses them (Neuron first) as the synthetic gene names
+expected_marker_genes = {
+    "Neuron": ["MAP2", "NEFL", "RBFOX3", "SYN1", "GAP43", "DCX", "TUBB3", "NEUROD1", "STMN2", "ENO2"],
+    "Fibroblast": ["COL1A1", "COL3A1", "VIM", "ACTA2", "FAP", "PDGFRA", "THY1", "FN1", "SPARC", "S100A4"],
+}
+
+# Accepted label substrings: the tests pass if a predicted cell type contains any of these (case-sensitive `in`)
+neuronal_cell_types = ["Neuron", "Neurons", "Neuronal cells", "neurons"]
+fibroblast_cell_types = ["Fibroblast", "Fibroblasts", "fibroblast cells"]
+
+
+def get_example_data(n_cells: int = 100, n_samples: int = 1) -> AnnData:
+    """Create example data for testing. Adapted from scanpy.
+
+    The data consists of two clusters with different marker genes. The first cluster is enriched for neuronal markers and the second cluster is enriched for fibroblast markers."""
+    gene_names = expected_marker_genes["Neuron"] + expected_marker_genes["Fibroblast"]
+    n_genes = len(gene_names)
+    adata = AnnData(np.multiply(binomial(1, 0.15, (n_cells, n_genes)), negative_binomial(2, 0.25, (n_cells, n_genes))))
+    adata.var_names = gene_names
+
+    # Create marker genes for the two clusters
+    n_group_1 = np.floor(0.3 * n_cells).astype(int)
+    n_group_2 = n_cells - n_group_1
+    n_marker_genes = int(n_genes / 2)
+
+    adata.X[:n_group_1, :10] = np.multiply(
+        binomial(1, 0.9, (n_group_1, n_marker_genes)), negative_binomial(1, 0.5, (n_group_1, n_marker_genes))
+    )
+    adata.X[n_group_1:, 10:] = np.multiply(
+        binomial(1, 0.9, (n_group_2, n_marker_genes)), negative_binomial(1, 0.5, (n_group_2, n_marker_genes))
+    )
+
+    # Create cluster according to groups
+    adata.obs["leiden"] = pd.Categorical(np.concatenate((n_group_1 * ["0"], n_group_2 * ["1"])))
+
+    # Add sample information if there are multiple samples
+    if n_samples > 1:
+        samples = np.random.choice([f"sample_{i}" for i in range(n_samples)], size=n_cells)
+        adata.obs["sample"] = samples
+
+    # filter, normalize and log transform the data
+    sc.pp.filter_cells(adata, min_counts=2)
+    adata.raw = adata.copy()
+    sc.pp.normalize_total(adata, target_sum=1e4)
+    sc.pp.log1p(adata)
+
+    return adata