Support re-indexed compound data (#196)

snwessel · web-flow · commit ac08320b1631 · 2025-03-12T12:02:42.000-04:00
diff --git a/frontend/packages/@depmap/data-explorer-2/src/utils/misc.ts b/frontend/packages/@depmap/data-explorer-2/src/utils/misc.ts
@@ -12,7 +12,7 @@ export function getDimensionTypeLabel(dimension_type: string) {
   }
 
   if (dimension_type === "compound_experiment") {
-    return "compound";
+    return "compound sample";
   }
 
   if (dimension_type === "msigdb_gene_set") {
diff --git a/portal-backend/depmap/compound/views/executive.py b/portal-backend/depmap/compound/views/executive.py
@@ -9,6 +9,7 @@
 import pandas as pd
 
 from depmap import data_access
+from depmap.data_access.models import MatrixDataset
 from depmap.entity.views.executive import (
     format_enrichment_box_for_dataset,
     format_generic_distribution_plot,
@@ -28,7 +29,7 @@
 from depmap.correlation.utils import get_all_correlations
 
 from depmap.dataset.models import BiomarkerDataset, DependencyDataset
-from depmap.compound.models import CompoundExperiment
+from depmap.compound.models import Compound, CompoundExperiment
 from depmap.predictability.models import PredictiveModel
 
 from depmap.download.utils import get_download_url
@@ -42,31 +43,49 @@ class DataAvailabilityDataset:
     label: str
     dose_range: str
     assay: str
-    dataset: DependencyEnum
+    # Candidate given IDs for the relevant dataset, listed in priority order.
+    # If a re-indexed dataset exists in breadbox, that should be displayed.
+    # Otherwise, just display the legacy version.
+    given_ids: list[str]
 
 
 # The set of information to show on the tile on the compound page
 data_availability_datasets = [
     DataAvailabilityDataset(
-        "CTRP", "1nM - 10μM", "CellTitreGlo", DependencyEnum.CTRP_AUC
+        label="CTRP", 
+        dose_range="1nM - 10μM", 
+        assay="CellTitreGlo", 
+        given_ids=["CTRP_AUC_collapsed", DependencyEnum.CTRP_AUC.name],
     ),
     DataAvailabilityDataset(
-        "GDSC1", "1nM - 10μM", "Resazurin or Syto60", DependencyEnum.GDSC1_AUC
+        label="GDSC1", 
+        dose_range="1nM - 10μM", 
+        assay="Resazurin or Syto60", 
+        given_ids=["GDSC1_AUC_collapsed", DependencyEnum.GDSC1_AUC.name],
     ),
     DataAvailabilityDataset(
-        "GDSC2", "1nM - 10μM", "CellTitreGlo", DependencyEnum.GDSC2_AUC
+        label="GDSC2", 
+        dose_range="1nM - 10μM", 
+        assay="CellTitreGlo", 
+        given_ids=["GDSC2_AUC_collapsed", DependencyEnum.GDSC2_AUC.name],
     ),
     DataAvailabilityDataset(
-        "Repurposing single point", "2.5μM", "PRISM", DependencyEnum.Rep_all_single_pt
+        label="Repurposing single point", 
+        dose_range="2.5μM", 
+        assay="PRISM", 
+        given_ids=["REPURPOSING_AUC_collapsed", DependencyEnum.Rep_all_single_pt.name],
     ),
     DataAvailabilityDataset(
-        "Repurposing multi-dose",
-        "1nM - 10μM",
-        "PRISM",
-        DependencyEnum.Repurposing_secondary_AUC,
+        label="Repurposing multi-dose",
+        dose_range="1nM - 10μM",
+        assay="PRISM",
+        given_ids=[DependencyEnum.Repurposing_secondary_AUC.name],
     ),
     DataAvailabilityDataset(
-        "OncRef", "1nM - 10μM", "PRISM", DependencyEnum.Prism_oncology_AUC
+        label="OncRef",
+        dose_range="1nM - 10μM", 
+        assay="PRISM", 
+        given_ids=["Prism_oncology_AUC_collapsed", DependencyEnum.Prism_oncology_AUC.name],
     ),
 ]
 
@@ -260,64 +279,57 @@ def get_top_correlated_expression(compound_experiment_and_datasets):
     return top_correlations
 
 
-def format_availability_tile(compound_id):
-    # find all the compound experiment IDs because the data is stored in the AUC matrices
-    # indexed by compound_experiment, not compound_id
-    compound_experiments_ids = [
-        ce.entity_id for ce in CompoundExperiment.get_all_by_compound_id(compound_id)
-    ]
-
+def format_availability_tile(compound: Compound):
+    """
+    Load high-level information about which datasets the given compound
+    appears in. This does NOT load the full list of datasets, but instead
+    returns a curated subset that users are most interested in. 
+    For example, we want to show whether there is "Repurposing" data, but don't need
+    to list all of the oncref datasets (AUC, IC50, etc.).
+    """
+    compound_id = compound.compound_id
+    # First, load ALL portal datasets containing the compound (for performance reasons).
+    # This is faster than iterating through the datasets and checking their full contents one-by-one.
+    all_compound_datasets = data_access.get_all_datasets_containing_compound(compound_id)
+    datasets_with_compound_by_id = {}
+    for dataset in all_compound_datasets:
+        if dataset.given_id:
+            datasets_with_compound_by_id[dataset.given_id] = dataset
+        else:
+            datasets_with_compound_by_id[dataset.id] = dataset
+
+    # Only return datasets which both 1) contain the compound and 2) exist in our hard-coded list
     results = []
-    for data_availability_dataset in data_availability_datasets:
-        cell_line_count = get_cell_line_count(
-            data_availability_dataset.dataset, compound_experiments_ids
-        )
-        if cell_line_count == 0:
-            continue
-        dataset = DependencyDataset.get_dataset_by_name(
-            data_availability_dataset.dataset.value
-        )
-        dataset_url = get_download_url(dataset.taiga_id)
-        results.append(
-            {
-                "dataset_name": data_availability_dataset.label,
-                "dose_range": data_availability_dataset.dose_range,
-                "assay": data_availability_dataset.assay,
-                "cell_lines": cell_line_count,
-                "dataset_url": dataset_url,
-            }
-        )
+    for dataset_config in data_availability_datasets:
+        # Use the highest priority dataset that exists
+        dataset: Optional[MatrixDataset] = None
+        for given_id in dataset_config.given_ids:
+            if dataset is None and given_id in datasets_with_compound_by_id:
+                dataset = datasets_with_compound_by_id[given_id]
+
+        if dataset is not None:
+            # Load data for this compound to determine how many cell lines have data for it
+            df = data_access.get_subsetted_df_by_labels_compound_friendly(dataset.id)
+            feature_data = df.loc[compound.label]
+            cell_line_count = feature_data.dropna().size
+
+            dataset_url = get_download_url(dataset.taiga_id)
+            results.append(
+                {
+                    "dataset_name": dataset_config.label,
+                    "dose_range": dataset_config.dose_range,
+                    "assay": dataset_config.assay,
+                    "cell_lines": cell_line_count,
+                    "dataset_url": dataset_url,
+                }
+            )
 
     # Currently no filtering needs to happen here because only one DependencyDataset
     # per dataset has both dose_range and assay in its corresponding metadata
     results.sort(key=lambda x: x["dataset_name"])
     return results
 
 
-def get_cell_line_count(dataset: DependencyEnum, entity_ids: List[int]):
-    # given a set of entity_ids, return the number of cell lines which have
-    # values for any of those entity_ids
-
-    if not data_access.has_config(dataset.value):
-        return 0
-
-    # map entity_ids to row_indices
-    row_summaries = data_access.get_all_row_indices_labels_entity_ids(dataset.value)
-    row_index_by_entity_id = {x.entity_id: x.index for x in row_summaries}
-    row_indices = []
-    for entity_id in entity_ids:
-        if entity_id in row_index_by_entity_id:
-            row_indices.append(row_index_by_entity_id[entity_id])
-
-    # get the corresponding data
-    df: pd.DataFrame = data_access.get_subsetted_df(
-        dataset_id=dataset.value, row_indices=row_indices, col_indices=None
-    )
-
-    # compute the number of columns which have at least one non-na
-    return sum((~df.applymap(pd.isna)).apply(any, axis=0))
-
-
 def format_corr_table(compound_label, top_correlations):
     table = []
     for _, tc in top_correlations.items():
diff --git a/portal-backend/depmap/data_access/__init__.py b/portal-backend/depmap/data_access/__init__.py
@@ -32,4 +32,4 @@
     get_context_dataset,
     get_custom_cell_lines_dataset,
     has_config,
-)
+)
diff --git a/portal-backend/depmap/tile/views.py b/portal-backend/depmap/tile/views.py
@@ -606,11 +606,10 @@ def get_correlations_html(
 def get_availability_html(
     compound, compound_experiment_and_datasets, query_params_dict={}
 ):
-    compound_id = compound.entity_id
     return render_template(
         "tiles/availability.html",
         name=compound.label,
-        availability=format_availability_tile(compound_id),
+        availability=format_availability_tile(compound),
     )
 
 
diff --git a/portal-backend/pyright-ratchet-errors.txt b/portal-backend/pyright-ratchet-errors.txt
@@ -160,7 +160,6 @@ executive.py: error: "name" is not a known member of "None" (reportOptionalMembe
 executive.py: error: "num_dependent_cell_lines" is not a known member of "None" (reportOptionalMemberAccess)
 executive.py: error: "num_lines_with_data" is not a known member of "None" (reportOptionalMemberAccess)
 executive.py: error: "plot_param" is possibly unbound (reportPossiblyUnboundVariable)
-executive.py: error: "taiga_id" is not a known member of "None" (reportOptionalMemberAccess)
 executive.py: error: Argument of type "Literal['All']" cannot be assigned to parameter "__value" of type "int" in function "__setitem__"
 executive.py: error: Argument of type "str | Any" cannot be assigned to parameter "__value" of type "list[Unknown]" in function "__setitem__"
 executive.py: error: Cannot access member "barh" for type "ndarray[Any, dtype[Any]]"
@@ -438,7 +437,6 @@ test_executive.py: error: Argument of type "list[DependencyDatasetFactory]" cann
 test_executive.py: error: Argument of type "list[Unknown] | None" cannot be assigned to parameter "__obj" of type "Sized" in function "len"
 test_executive.py: error: Cannot access member "dataset_id" for type "DependencyDatasetFactory"
 test_executive.py: error: Cannot access member "entity_id" for type "CompoundExperimentFactory"
-test_executive.py: error: Cannot access member "entity_id" for type "CompoundFactory"
 test_executive.py: error: Cannot access member "units" for type "LazyAttribute"
 test_executive.py: error: Object of type "None" cannot be used as iterable value (reportOptionalIterable)
 test_executive.py: error: Object of type "None" is not subscriptable (reportOptionalSubscript)
diff --git a/portal-backend/tests/depmap/compound/views/test_executive.py b/portal-backend/tests/depmap/compound/views/test_executive.py
@@ -5,6 +5,7 @@
     format_enrichment_boxes,
     format_top_corr_table,
 )
+from depmap.compound.models import Compound
 from depmap.context.models import ContextEnrichment
 from depmap.dataset.models import DependencyDataset
 from depmap.enums import BiomarkerEnum
@@ -139,7 +140,7 @@ def test_format_top_corr_table(tmpdir, empty_db_mock_downloads):
 
 
 def test_format_availability_tile(empty_db_mock_downloads):
-    compound = CompoundFactory()
+    compound: Compound = CompoundFactory() # pyright: ignore
     compound_experiment_1 = CompoundExperimentFactory(
         label="exp_label_1", compound=compound
     )
@@ -186,7 +187,7 @@ def test_format_availability_tile(empty_db_mock_downloads):
             "dataset_url": "/download/all/?release=test+name+version&file=test+file+name+2",
         },
     ]
-    availability = format_availability_tile(compound.entity_id)
+    availability = format_availability_tile(compound)
 
     assert expected == availability
 

Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@ export function getDimensionTypeLabel(dimension_type: string) {`
`12`	`12`	`}`
`13`	`13`
`14`	`14`	`if (dimension_type === "compound_experiment") {`
`15`		`- return "compound";`
	`15`	`+ return "compound sample";`
`16`	`16`	`}`
`17`	`17`
`18`	`18`	`if (dimension_type === "msigdb_gene_set") {`