99import pandas as pd
1010
1111from depmap import data_access
12+ from depmap .data_access .models import MatrixDataset
1213from depmap .entity .views .executive import (
1314 format_enrichment_box_for_dataset ,
1415 format_generic_distribution_plot ,
2829from depmap .correlation .utils import get_all_correlations
2930
3031from depmap .dataset .models import BiomarkerDataset , DependencyDataset
31- from depmap .compound .models import CompoundExperiment
32+ from depmap .compound .models import Compound , CompoundExperiment
3233from depmap .predictability .models import PredictiveModel
3334
3435from depmap .download .utils import get_download_url
@@ -42,31 +43,49 @@ class DataAvailabilityDataset:
4243 label : str
4344 dose_range : str
4445 assay : str
45- dataset : DependencyEnum
46+ # There are multiple given IDs we may use to load the relevant dataset
47+ # If a re-indexed dataset exists in breadbox, that should be displayed.
48+ # Otherwise, just display the legacy version
49+ given_ids : list [str ]
4650
4751
# The set of information to show on the tile on the compound page.
# Each row below is (label, dose range, assay, candidate given IDs).
# The given IDs are listed in priority order: a re-indexed ("*_collapsed")
# dataset is preferred when one is listed, with the legacy DependencyEnum name
# as the fallback.
data_availability_datasets = [
    DataAvailabilityDataset(
        label=label, dose_range=dose_range, assay=assay, given_ids=given_ids
    )
    for label, dose_range, assay, given_ids in (
        (
            "CTRP",
            "1nM - 10μM",
            "CellTitreGlo",
            ["CTRP_AUC_collapsed", DependencyEnum.CTRP_AUC.name],
        ),
        (
            "GDSC1",
            "1nM - 10μM",
            "Resazurin or Syto60",
            ["GDSC1_AUC_collapsed", DependencyEnum.GDSC1_AUC.name],
        ),
        (
            "GDSC2",
            "1nM - 10μM",
            "CellTitreGlo",
            ["GDSC2_AUC_collapsed", DependencyEnum.GDSC2_AUC.name],
        ),
        # NOTE(review): this single-point entry is paired with an "*_AUC_collapsed"
        # ID while the multi-dose (AUC) entry below has no re-indexed ID at all.
        # Confirm that "REPURPOSING_AUC_collapsed" really belongs here.
        (
            "Repurposing single point",
            "2.5μM",
            "PRISM",
            ["REPURPOSING_AUC_collapsed", DependencyEnum.Rep_all_single_pt.name],
        ),
        (
            "Repurposing multi-dose",
            "1nM - 10μM",
            "PRISM",
            [DependencyEnum.Repurposing_secondary_AUC.name],
        ),
        (
            "OncRef",
            "1nM - 10μM",
            "PRISM",
            ["Prism_oncology_AUC_collapsed", DependencyEnum.Prism_oncology_AUC.name],
        ),
    )
]
7291
@@ -260,64 +279,57 @@ def get_top_correlated_expression(compound_experiment_and_datasets):
260279 return top_correlations
261280
262281
def format_availability_tile(compound: Compound):
    """
    Summarize which of the curated drug-screen datasets contain the compound.

    Only the entries of `data_availability_datasets` are considered, not every
    dataset in the portal. Each entry lists candidate given IDs in priority
    order; the first candidate found among the datasets containing the compound
    is the one reported.

    Returns a list of dicts with the keys "dataset_name", "dose_range", "assay",
    "cell_lines" and "dataset_url", sorted by "dataset_name". Entries for which
    no candidate dataset contains the compound are omitted.
    """
    # Fetch every dataset containing the compound in one call rather than probing
    # each configured dataset individually (done for performance, per the
    # original author's note). Key by given_id when set, otherwise by id.
    datasets_by_key = {
        (candidate.given_id if candidate.given_id else candidate.id): candidate
        for candidate in data_access.get_all_datasets_containing_compound(
            compound.compound_id
        )
    }

    tile_rows = []
    for config in data_availability_datasets:
        # Highest-priority candidate that actually contains the compound, if any
        matched = next(
            (
                datasets_by_key[key]
                for key in config.given_ids
                if key in datasets_by_key
            ),
            None,
        )
        if matched is None:
            continue

        # The number of non-missing values in the compound's row is reported as
        # the cell line count.
        matrix = data_access.get_subsetted_df_by_labels_compound_friendly(matched.id)
        compound_values = matrix.loc[compound.label]
        tile_rows.append(
            {
                "dataset_name": config.label,
                "dose_range": config.dose_range,
                "assay": config.assay,
                "cell_lines": compound_values.dropna().size,
                "dataset_url": get_download_url(matched.taiga_id),
            }
        )

    # No further filtering is applied: each configured entry contributes at most
    # one row.
    tile_rows.sort(key=lambda row: row["dataset_name"])
    return tile_rows
295331
296332
def get_cell_line_count(dataset: DependencyEnum, entity_ids: List[int]):
    """
    Return the number of columns in `dataset` that have a non-missing value for
    at least one of the given entity IDs.

    Returns 0 when the dataset has no config in data_access, or when none of
    the entity IDs maps to a row of the dataset.
    """
    if not data_access.has_config(dataset.value):
        return 0

    # Map entity_ids to row indices, skipping IDs that are absent from the dataset
    row_summaries = data_access.get_all_row_indices_labels_entity_ids(dataset.value)
    row_index_by_entity_id = {x.entity_id: x.index for x in row_summaries}
    row_indices = [
        row_index_by_entity_id[entity_id]
        for entity_id in entity_ids
        if entity_id in row_index_by_entity_id
    ]

    # With no matching rows the answer is 0 by definition; returning here also
    # avoids depending on how get_subsetted_df treats an empty row selection.
    if not row_indices:
        return 0

    # Get the corresponding data
    df: pd.DataFrame = data_access.get_subsetted_df(
        dataset_id=dataset.value, row_indices=row_indices, col_indices=None
    )

    # Count the columns with at least one non-NA value. This is the vectorized
    # equivalent of the element-wise applymap/apply formulation.
    return int(df.notna().any(axis=0).sum())
319-
320-
321333def format_corr_table (compound_label , top_correlations ):
322334 table = []
323335 for _ , tc in top_correlations .items ():
0 commit comments