Skip to content

Commit 55fb664

Browse files
committed
Merge branch 'master' into qa
2 parents 8310b58 + 322e49e commit 55fb664

36 files changed

Lines changed: 631 additions & 876 deletions

File tree

breadbox/Dockerfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ RUN pyenv install ${PYTHON_VERSION} && \
3434
# Create the installation directory
3535
RUN mkdir -p /install/breadbox
3636

37+
# Copy in a file containing the git SHA (created in build-docker-image.sh)
38+
COPY git-sha /install/git-sha
39+
3740
# Copy files:
3841
COPY . /install/breadbox
3942
WORKDIR /install/breadbox

breadbox/breadbox/depmap_compute_embed/context.py

Lines changed: 18 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,15 @@ def __init__(
4141
):
4242
"""
4343
A `context` dict should have:
44-
- a `dimension_type` such as "depmap_model" (which is not used in this evaluator)
4544
- an `expr` such as { "==": [ { "var": "var1" }, "Breast" ] }
46-
- a set of `vars`, each of which assigns a name to a slice query
45+
- a dictionary of `vars` which assigns names to slice queries, such as
46+
{
47+
"var1": {
48+
"dataset_id": "depmap_model_metadata",
49+
"identifier": "OncotreeLineage",
50+
"identifier_type": "column"
51+
}
52+
}
4753
"""
4854
self.expr = _encode_dots_in_vars(context["expr"])
4955
self.slice_query_vars = _escape_dots(context.get("vars", {}))
@@ -52,8 +58,8 @@ def __init__(
5258
self.get_slice_data = get_slice_data
5359

5460
# The cache is used so that slice values only need to be looked up once per context.
55-
# - The keys in this dictionary are slice queries encoded as a tuple
56-
# (ex. `("Chronos_combined", "SOX10", "feature_label")`).
61+
# - The keys in this dictionary match the values of any { "var": "<var_name>" }
62+
# expressions (and therefore should also match the keys of context["vars"]).
5763
# - The values are each an entire dictionary of slice values (indexed by given ID)
5864
self.cache = {}
5965

@@ -135,104 +141,13 @@ def __bool__(self):
135141
return True
136142

137143

138-
class LegacyContextEvaluator:
139-
"""
140-
DEPRECATED: Use `ContextEvaluator` for future development.
141-
This older version has a few differences from the new one:
142-
- Slices are specified using string slice IDs instead of slice queries
143-
- Matching on index is done with a field called "entity_label". For features,
144-
this is expected to match the feature label. For samples, this matches on sample ID (not label).
145-
This confusing behavior is part of why we're deprecating the old version.
146-
- the field "context_type" is used to specify the dimension type
147-
"""
148-
149-
def __init__(self, context: dict, get_slice_data: Callable[[str], dict[str, Any]]):
150-
"""
151-
A `context` dict should have:
152-
- a `context_type` such as "depmap_model"
153-
- an `expr` such as { "==": [ { "var": "slice/lineage/1/label" }, "Breast" ] }
154-
"""
155-
self.context_type = context["context_type"]
156-
self.expr = _encode_dots_in_vars(context["expr"])
157-
# Cache is keyed by Slice ID. Each value is an entire dictionary of slice values.
158-
self.cache = {}
159-
self.get_slice_data = get_slice_data
160-
161-
def is_match(self, dimension_label: str):
162-
"""
163-
This evaluates `expr` against a given `dimension_label`. It returns
164-
True/False depending on if `dimension_label` satifies the conditions of
165-
the expression, including any variables ("var" subexpressions) which
166-
are bound by using a magic dict that does lookups lazily.
167-
"""
168-
dictionary_override = _LegacyLazyContextDict(
169-
self.context_type, dimension_label, self.cache, self.get_slice_data
170-
)
171-
172-
try:
173-
return jsonLogic(self.expr, dictionary_override)
174-
except (TypeError, ValueError) as e:
175-
print("Exception evaluating", self.expr, "against", dimension_label)
176-
print(e)
177-
return False
178-
179-
180-
class _LegacyLazyContextDict(dict):
181-
"""
182-
The JsonLogic library wants to be passed a dictionary of values. However, we need to
183-
inject our own special cases and caching, so we override the dictionary class with
184-
special functionality. Interesting thread on overriding the Dict class:
185-
https://stackoverflow.com/questions/3387691/how-to-perfectly-override-a-dict
186-
But we don't need to "perfectly" override it; just well enough to trick the JsonLogic library.
187-
188-
This implementation uses and updates the cache that's been passed in to the constructor.
189-
"""
190-
191-
def __init__(
192-
self,
193-
context_type: str,
194-
dimension_label: str,
195-
cache: dict,
196-
get_slice_data: Callable[[str], dict[str, Any]],
197-
):
198-
self.context_type = context_type
199-
self.dimension_label = dimension_label
200-
self.cache = cache
201-
self.get_slice_data = get_slice_data
202-
203-
def __getitem__(self, prop):
204-
"""
205-
Given a slice ID, get the slice value which corresponds to the
206-
"dimension_label" that's already been passed into the constructor of this class.
207-
"""
208-
# Handle trivial case where we're just looking up a dimension's own
209-
# label. Note that this is called "entity_label" for historical
210-
# reasons.
211-
if prop == "entity_label":
212-
return self.dimension_label
213-
214-
if prop.startswith("slice/"):
215-
if prop not in self.cache:
216-
self.cache[prop] = self.get_slice_data(prop)
217-
218-
return self.cache[prop][self.dimension_label]
219-
220-
raise LookupError(
221-
f"Unable to find context property '{prop}'. Are you sure a corresponding "
222-
f"dataset exists and can be looked up by {self.context_type}?"
223-
)
224-
225-
# We don't want our virtual dictionary to appear empty.
226-
# Otherwise, the JsonLogic library will stomp it out with an empty default dict:
227-
# https://github.com/nadirizr/json-logic-py/blob/master/json_logic/__init__.py#L180
228-
def __bool__(self):
229-
return True
230-
231144

232145
def _encode_dots_in_vars(expr: dict):
233146
"""
234147
URL-encode any dots in variables. Otherwise, JsonLogic thinks they are property lookups.
235-
Example expression: { "==": [ { "var": "slice/lineage/1/label" }, "Breast" ] }
148+
This was important back when we would use slice IDs as var names. Example:
149+
{ "var": "slice/msi-0584.6%2Fmsi/CCLE (NGS)/label" }
150+
This is no longer much of an issue because the UI now generates simplified var names.
236151
"""
237152

238153
def walk(node, key):
@@ -249,7 +164,11 @@ def walk(node, key):
249164

250165

251166
def _escape_dots(d: dict) -> dict:
252-
"""Return a new dict with all dots in keys replaced by %2E."""
167+
"""
168+
Return a new dict with all dots in keys replaced by %2E. This is the complement to
169+
_encode_dots_in_vars, ensuring entries in the `slice_query_vars` dict will match
170+
those found in { "var": "..." } expressions.
171+
"""
253172
return {
254173
(k.replace(".", "%2E") if isinstance(k, str) else k): v for k, v in d.items()
255174
}

breadbox/breadbox/service/associations.py

Lines changed: 53 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
import numpy as np
33
from typing import List, Optional
4+
from breadbox.crud.dimension_ids import get_dimension_type_label_mapping_df
45

56
from breadbox.depmap_compute_embed.slice import SliceQuery
67
from breadbox.db.session import SessionWithUser
@@ -22,6 +23,7 @@
2223
import packed_cor_tables
2324

2425
log = logging.getLogger(__name__)
26+
from breadbox.utils.profiling import profiled_region
2527

2628

2729
def get_associations(
@@ -32,34 +34,20 @@ def get_associations(
3234
) -> Associations:
3335
dataset_id = slice_query.dataset_id
3436

35-
dataset = dataset_crud.get_dataset(db, db.user, dataset_id)
36-
if dataset is None:
37-
raise ResourceNotFoundError(f"Could not find dataset {dataset_id}")
37+
with profiled_region("in get_associations: get dataset"):
38+
dataset = dataset_crud.get_dataset(db, db.user, dataset_id)
39+
if dataset is None:
40+
raise ResourceNotFoundError(f"Could not find dataset {dataset_id}")
3841

39-
precomputed_assoc_tables = associations_crud.get_association_tables(
40-
db, dataset.id, association_datasets
41-
)
42+
with profiled_region("in get_associations: get_association_tables"):
43+
precomputed_assoc_tables = associations_crud.get_association_tables(
44+
db, dataset.id, association_datasets
45+
)
4246
datasets = []
4347
associated_dimensions = []
4448

45-
resolved_slice = slice_service.resolve_slice_to_components(db, slice_query,)
46-
47-
dim_label_cache = {}
48-
49-
def _get_dimension_label(dimension_type, given_id):
50-
# if the dimension type is None, we use the dataset's dimension given_id as the label
51-
if not dimension_type:
52-
return given_id
53-
if dimension_type not in dim_label_cache:
54-
dim_label_cache[dimension_type] = get_dimension_type_labels_by_id(
55-
db, dimension_type
56-
)
57-
labels_by_id = dim_label_cache[dimension_type]
58-
if given_id in labels_by_id:
59-
return labels_by_id[given_id]
60-
else:
61-
# there is a dimension type, and all valid given_ids are defined in that dimension type. If given_id is not included in the dimension type, we want act like that dimension doesn't exist
62-
return None
49+
with profiled_region("in get_associations: resolve_slice_to_components"):
50+
resolved_slice = slice_service.resolve_slice_to_components(db, slice_query,)
6351

6452
for precomputed_assoc_table in precomputed_assoc_tables:
6553
assert precomputed_assoc_table.dataset_1_id == dataset.id
@@ -85,38 +73,49 @@ def _get_dimension_label(dimension_type, given_id):
8573
filestore_location, precomputed_assoc_table.filename
8674
)
8775

88-
correlation_df = packed_cor_tables.read_cor_for_given_id(
89-
precomputed_assoc_table_path, resolved_slice.given_id
90-
)
91-
92-
for row in correlation_df.to_records():
93-
other_dimension_given_id = row["feature_given_id_1"]
94-
associated_label = _get_dimension_label(
95-
other_dimension_type, other_dimension_given_id
76+
with profiled_region("in get_associations: read_cor_for_given_id"):
77+
correlation_df = packed_cor_tables.read_cor_for_given_id(
78+
precomputed_assoc_table_path, resolved_slice.given_id
9679
)
97-
if associated_label is None:
98-
log.warning(
99-
f"Could not find {other_dimension_type} with id {other_dimension_given_id}"
100-
)
101-
continue
102-
103-
log10qvalue = row["log10qvalue"]
104-
105-
# if correlation is 1 then the qvalue can be 0 which results in log10 qvalue to be -inf
106-
# if we see this, bound it at -1e100 to avoid json serialization error
107-
if np.isinf(log10qvalue):
108-
log10qvalue = -1e100
109-
110-
associated_dimensions.append(
111-
Association(
112-
correlation=row["cor"],
113-
log10qvalue=log10qvalue,
114-
other_dataset_id=other_dataset.id,
115-
other_dataset_given_id=other_dataset.given_id,
116-
other_dimension_given_id=other_dimension_given_id,
117-
other_dimension_label=associated_label,
80+
81+
# look up all labels with a single query to produce a map that we'll use a little later.
82+
label_id_mapping_df = get_dimension_type_label_mapping_df(
83+
db,
84+
other_dimension_type,
85+
given_ids=correlation_df["feature_given_id_1"].tolist(),
86+
)
87+
label_by_given_id = {
88+
row["given_id"]: row["label"]
89+
for row in label_id_mapping_df.reset_index(drop=True).to_records()
90+
}
91+
92+
with profiled_region("in get_associations: create Association records"):
93+
for row in correlation_df.to_records():
94+
other_dimension_given_id = row["feature_given_id_1"]
95+
associated_label = label_by_given_id.get(other_dimension_given_id)
96+
if associated_label is None:
97+
log.warning(
98+
f"Could not find {other_dimension_type} with id {other_dimension_given_id}"
99+
)
100+
continue
101+
102+
log10qvalue = row["log10qvalue"]
103+
104+
# if correlation is 1 then the qvalue can be 0 which results in log10 qvalue to be -inf
105+
# if we see this, bound it at -1e100 to avoid json serialization error
106+
if np.isinf(log10qvalue):
107+
log10qvalue = -1e100
108+
109+
associated_dimensions.append(
110+
Association(
111+
correlation=row["cor"],
112+
log10qvalue=log10qvalue,
113+
other_dataset_id=other_dataset.id,
114+
other_dataset_given_id=other_dataset.given_id,
115+
other_dimension_given_id=other_dimension_given_id,
116+
other_dimension_label=associated_label,
117+
)
118118
)
119-
)
120119

121120
return Associations(
122121
dataset_name=dataset.name,

breadbox/breadbox/service/search.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,14 +157,20 @@ def row_generator():
157157
given_id=given_id,
158158
metadata_cache=metadata_cache,
159159
):
160+
label = cache_entry.get_label_for_given_id(given_id)
161+
if label is None:
162+
# if we don't have a label, this given_id didn't exist
163+
# in metadata, so move on
164+
continue
165+
160166
# if given_id in cache_entry.dimension_id_by_given_id:
161167
yield dict(
162168
property=record.property,
163169
value=record.value,
164170
group_id=dimension_type.dataset.group_id,
165171
dimension_type_name=dimension_type.name,
166172
dimension_given_id=given_id,
167-
label=cache_entry.get_label_for_given_id(given_id),
173+
label=label,
168174
)
169175

170176
dimension_search_index_row_count = 0
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
from contextlib import contextmanager
import time
import contextvars

# Current nesting depth of profiled regions; used to indent the output so
# nested regions are visually distinguishable. A ContextVar keeps the depth
# correct per async task / thread context.
profile_depth = contextvars.ContextVar("profile_depth", default=1)

# Global switch: when False (the default), profiled_region is a no-op.
PRINT_PROFILE = False


@contextmanager
def profiled_region(msg):
    """Time the enclosed block and print the elapsed wall-clock time.

    Output lines are prefixed with '>>>' repeated once per nesting level,
    e.g. '>>>>>> in get_associations: get dataset: 0.012 secs elapsed'.
    Does nothing (and adds near-zero overhead) unless PRINT_PROFILE is True.

    Args:
        msg: label printed alongside the elapsed time.
    """
    if not PRINT_PROFILE:
        yield
        return

    orig_depth = profile_depth.get()
    profile_depth.set(orig_depth + 1)
    start = time.perf_counter()
    try:
        yield
    finally:
        # Report and restore the depth even if the region raised; the original
        # code re-applied `orig_depth + 1` here, leaking one nesting level per
        # region, and skipped the report entirely on exceptions.
        elapsed = time.perf_counter() - start
        print(f"{'>>>' * orig_depth} {msg}: {elapsed:.3} secs elapsed")
        profile_depth.set(orig_depth)

breadbox/build-docker-image.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,18 @@ if [ "$1" = "" ]; then
66
fi
77
IMAGE_TAG="$1"
88

9+
if [ ! -e .git ] ; then
10+
echo "This command only works when run from the root of the git checkout. Change directory before running this command"
11+
exit 1
12+
fi
13+
914
set -ex
1015

16+
# save the current sha to help track what we built this docker image from
17+
git rev-parse HEAD > breadbox/git-sha
18+
1119
# Build Docker image
1220
export DOCKER_BUILDKIT=1
13-
#docker buildx build --platform=linux/amd64 \
1421
docker build \
1522
breadbox \
1623
-t "$IMAGE_TAG" \

breadbox/poetry.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

breadbox/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ sqlitedict="^2.1.0"
3333
google-cloud-storage = "^3.1.0"
3434
packed-cor-tables = {version = "^0.2.0", source = "public-python"}
3535
orjson = "^3.10.16"
36-
pypatch-and-run = {version = "^0.4.1", source = "public-python"}
36+
pypatch-and-run = {version = "^1.0.2", source = "public-python"}
3737
python-multipart = "^0.0.20" # fastapi 0.115.12 needs this
3838
psutil = "^7.1.0"
3939
#attrs = ">=21.3.0"

0 commit comments

Comments
 (0)