Skip to content

Commit ac08320

Browse files
authored
Support re-indexed compound data (#196)
1 parent d8509aa commit ac08320

File tree

6 files changed

+79
-69
lines changed

6 files changed

+79
-69
lines changed

frontend/packages/@depmap/data-explorer-2/src/utils/misc.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ export function getDimensionTypeLabel(dimension_type: string) {
1212
}
1313

1414
if (dimension_type === "compound_experiment") {
15-
return "compound";
15+
return "compound sample";
1616
}
1717

1818
if (dimension_type === "msigdb_gene_set") {

portal-backend/depmap/compound/views/executive.py

Lines changed: 73 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import pandas as pd
1010

1111
from depmap import data_access
12+
from depmap.data_access.models import MatrixDataset
1213
from depmap.entity.views.executive import (
1314
format_enrichment_box_for_dataset,
1415
format_generic_distribution_plot,
@@ -28,7 +29,7 @@
2829
from depmap.correlation.utils import get_all_correlations
2930

3031
from depmap.dataset.models import BiomarkerDataset, DependencyDataset
31-
from depmap.compound.models import CompoundExperiment
32+
from depmap.compound.models import Compound, CompoundExperiment
3233
from depmap.predictability.models import PredictiveModel
3334

3435
from depmap.download.utils import get_download_url
@@ -42,31 +43,49 @@ class DataAvailabilityDataset:
4243
label: str
4344
dose_range: str
4445
assay: str
45-
dataset: DependencyEnum
46+
# There are multiple given IDs we may use to load the relevant dataset
47+
# If a re-indexed dataset exists in breadbox, that should be displayed.
48+
# Otherwise, just display the legacy version
49+
given_ids: list[str]
4650

4751

4852
# The set of information to show on the tile on the compound page
4953
data_availability_datasets = [
5054
DataAvailabilityDataset(
51-
"CTRP", "1nM - 10μM", "CellTitreGlo", DependencyEnum.CTRP_AUC
55+
label="CTRP",
56+
dose_range="1nM - 10μM",
57+
assay="CellTitreGlo",
58+
given_ids=["CTRP_AUC_collapsed", DependencyEnum.CTRP_AUC.name],
5259
),
5360
DataAvailabilityDataset(
54-
"GDSC1", "1nM - 10μM", "Resazurin or Syto60", DependencyEnum.GDSC1_AUC
61+
label="GDSC1",
62+
dose_range="1nM - 10μM",
63+
assay="Resazurin or Syto60",
64+
given_ids=["GDSC1_AUC_collapsed", DependencyEnum.GDSC1_AUC.name],
5565
),
5666
DataAvailabilityDataset(
57-
"GDSC2", "1nM - 10μM", "CellTitreGlo", DependencyEnum.GDSC2_AUC
67+
label="GDSC2",
68+
dose_range="1nM - 10μM",
69+
assay="CellTitreGlo",
70+
given_ids=["GDSC2_AUC_collapsed", DependencyEnum.GDSC2_AUC.name],
5871
),
5972
DataAvailabilityDataset(
60-
"Repurposing single point", "2.5μM", "PRISM", DependencyEnum.Rep_all_single_pt
73+
label="Repurposing single point",
74+
dose_range="2.5μM",
75+
assay="PRISM",
76+
given_ids=["REPURPOSING_AUC_collapsed", DependencyEnum.Rep_all_single_pt.name],
6177
),
6278
DataAvailabilityDataset(
63-
"Repurposing multi-dose",
64-
"1nM - 10μM",
65-
"PRISM",
66-
DependencyEnum.Repurposing_secondary_AUC,
79+
label="Repurposing multi-dose",
80+
dose_range="1nM - 10μM",
81+
assay="PRISM",
82+
given_ids=[DependencyEnum.Repurposing_secondary_AUC.name],
6783
),
6884
DataAvailabilityDataset(
69-
"OncRef", "1nM - 10μM", "PRISM", DependencyEnum.Prism_oncology_AUC
85+
label="OncRef",
86+
dose_range="1nM - 10μM",
87+
assay="PRISM",
88+
given_ids=["Prism_oncology_AUC_collapsed", DependencyEnum.Prism_oncology_AUC.name],
7089
),
7190
]
7291

@@ -260,64 +279,57 @@ def get_top_correlated_expression(compound_experiment_and_datasets):
260279
return top_correlations
261280

262281

263-
def format_availability_tile(compound_id):
264-
# find all the compound experiment IDs because the data is stored in the AUC matrices
265-
# indexed by compound_experiment, not compound_id
266-
compound_experiments_ids = [
267-
ce.entity_id for ce in CompoundExperiment.get_all_by_compound_id(compound_id)
268-
]
269-
282+
def format_availability_tile(compound: Compound):
283+
"""
284+
Load high-level information about which datasets the given compound
285+
appears in. This does NOT load the full list of datasets, but instead
286+
returns a curated subset that users are most interested in.
287+
For example, we want to show whether there is "Repurposing" data, but don't need
288+
to list all of the oncref datasets (AUC, IC50, etc.).
289+
"""
290+
compound_id = compound.compound_id
291+
# First, load ALL portal datasets containing the compound (for performance reasons).
292+
# This is faster than iterating through the datasets and checking their full contents one-by-one.
293+
all_compound_datasets = data_access.get_all_datasets_containing_compound(compound_id)
294+
datasets_with_compound_by_id = {}
295+
for dataset in all_compound_datasets:
296+
if dataset.given_id:
297+
datasets_with_compound_by_id[dataset.given_id] = dataset
298+
else:
299+
datasets_with_compound_by_id[dataset.id] = dataset
300+
301+
# Only return datasets which both 1) contain the compound and 2) exist in our hard-coded list
270302
results = []
271-
for data_availability_dataset in data_availability_datasets:
272-
cell_line_count = get_cell_line_count(
273-
data_availability_dataset.dataset, compound_experiments_ids
274-
)
275-
if cell_line_count == 0:
276-
continue
277-
dataset = DependencyDataset.get_dataset_by_name(
278-
data_availability_dataset.dataset.value
279-
)
280-
dataset_url = get_download_url(dataset.taiga_id)
281-
results.append(
282-
{
283-
"dataset_name": data_availability_dataset.label,
284-
"dose_range": data_availability_dataset.dose_range,
285-
"assay": data_availability_dataset.assay,
286-
"cell_lines": cell_line_count,
287-
"dataset_url": dataset_url,
288-
}
289-
)
303+
for dataset_config in data_availability_datasets:
304+
# Use the highest priority dataset that exists
305+
dataset: Optional[MatrixDataset] = None
306+
for given_id in dataset_config.given_ids:
307+
if dataset is None and given_id in datasets_with_compound_by_id:
308+
dataset = datasets_with_compound_by_id[given_id]
309+
310+
if dataset is not None:
311+
# Load data for this compound to determine how many cell lines have data for it
312+
df = data_access.get_subsetted_df_by_labels_compound_friendly(dataset.id)
313+
feature_data = df.loc[compound.label]
314+
cell_line_count = feature_data.dropna().size
315+
316+
dataset_url = get_download_url(dataset.taiga_id)
317+
results.append(
318+
{
319+
"dataset_name": dataset_config.label,
320+
"dose_range": dataset_config.dose_range,
321+
"assay": dataset_config.assay,
322+
"cell_lines": cell_line_count,
323+
"dataset_url": dataset_url,
324+
}
325+
)
290326

291327
# Currently no filtering needs to happen here because only one DependencyDataset
292328
# per dataset has both dose_range and assay in its corresponding metadata
293329
results.sort(key=lambda x: x["dataset_name"])
294330
return results
295331

296332

297-
def get_cell_line_count(dataset: DependencyEnum, entity_ids: List[int]):
298-
# given a set of entity_ids, return the number of cell lines which have
299-
# values for any of those entity_ids
300-
301-
if not data_access.has_config(dataset.value):
302-
return 0
303-
304-
# map entity_ids to row_indices
305-
row_summaries = data_access.get_all_row_indices_labels_entity_ids(dataset.value)
306-
row_index_by_entity_id = {x.entity_id: x.index for x in row_summaries}
307-
row_indices = []
308-
for entity_id in entity_ids:
309-
if entity_id in row_index_by_entity_id:
310-
row_indices.append(row_index_by_entity_id[entity_id])
311-
312-
# get the corresponding data
313-
df: pd.DataFrame = data_access.get_subsetted_df(
314-
dataset_id=dataset.value, row_indices=row_indices, col_indices=None
315-
)
316-
317-
# compute the number of columns which have at least one non-na
318-
return sum((~df.applymap(pd.isna)).apply(any, axis=0))
319-
320-
321333
def format_corr_table(compound_label, top_correlations):
322334
table = []
323335
for _, tc in top_correlations.items():

portal-backend/depmap/data_access/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,4 @@
3232
get_context_dataset,
3333
get_custom_cell_lines_dataset,
3434
has_config,
35-
)
35+
)

portal-backend/depmap/tile/views.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -606,11 +606,10 @@ def get_correlations_html(
606606
def get_availability_html(
607607
compound, compound_experiment_and_datasets, query_params_dict={}
608608
):
609-
compound_id = compound.entity_id
610609
return render_template(
611610
"tiles/availability.html",
612611
name=compound.label,
613-
availability=format_availability_tile(compound_id),
612+
availability=format_availability_tile(compound),
614613
)
615614

616615

portal-backend/pyright-ratchet-errors.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,6 @@ executive.py: error: "name" is not a known member of "None" (reportOptionalMembe
160160
executive.py: error: "num_dependent_cell_lines" is not a known member of "None" (reportOptionalMemberAccess)
161161
executive.py: error: "num_lines_with_data" is not a known member of "None" (reportOptionalMemberAccess)
162162
executive.py: error: "plot_param" is possibly unbound (reportPossiblyUnboundVariable)
163-
executive.py: error: "taiga_id" is not a known member of "None" (reportOptionalMemberAccess)
164163
executive.py: error: Argument of type "Literal['All']" cannot be assigned to parameter "__value" of type "int" in function "__setitem__"
165164
executive.py: error: Argument of type "str | Any" cannot be assigned to parameter "__value" of type "list[Unknown]" in function "__setitem__"
166165
executive.py: error: Cannot access member "barh" for type "ndarray[Any, dtype[Any]]"
@@ -438,7 +437,6 @@ test_executive.py: error: Argument of type "list[DependencyDatasetFactory]" cann
438437
test_executive.py: error: Argument of type "list[Unknown] | None" cannot be assigned to parameter "__obj" of type "Sized" in function "len"
439438
test_executive.py: error: Cannot access member "dataset_id" for type "DependencyDatasetFactory"
440439
test_executive.py: error: Cannot access member "entity_id" for type "CompoundExperimentFactory"
441-
test_executive.py: error: Cannot access member "entity_id" for type "CompoundFactory"
442440
test_executive.py: error: Cannot access member "units" for type "LazyAttribute"
443441
test_executive.py: error: Object of type "None" cannot be used as iterable value (reportOptionalIterable)
444442
test_executive.py: error: Object of type "None" is not subscriptable (reportOptionalSubscript)

portal-backend/tests/depmap/compound/views/test_executive.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
format_enrichment_boxes,
66
format_top_corr_table,
77
)
8+
from depmap.compound.models import Compound
89
from depmap.context.models import ContextEnrichment
910
from depmap.dataset.models import DependencyDataset
1011
from depmap.enums import BiomarkerEnum
@@ -139,7 +140,7 @@ def test_format_top_corr_table(tmpdir, empty_db_mock_downloads):
139140

140141

141142
def test_format_availability_tile(empty_db_mock_downloads):
142-
compound = CompoundFactory()
143+
compound: Compound = CompoundFactory() # pyright: ignore
143144
compound_experiment_1 = CompoundExperimentFactory(
144145
label="exp_label_1", compound=compound
145146
)
@@ -186,7 +187,7 @@ def test_format_availability_tile(empty_db_mock_downloads):
186187
"dataset_url": "/download/all/?release=test+name+version&file=test+file+name+2",
187188
},
188189
]
189-
availability = format_availability_tile(compound.entity_id)
190+
availability = format_availability_tile(compound)
190191

191192
assert expected == availability
192193

0 commit comments

Comments
 (0)