Skip to content

Commit 55fb664

Browse files
committed
Merge branch 'master' into qa
2 parents 8310b58 + 322e49e commit 55fb664

36 files changed

Lines changed: 631 additions & 876 deletions

File tree

breadbox/Dockerfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ RUN pyenv install ${PYTHON_VERSION} && \
3434
# Create the installation directory
3535
RUN mkdir -p /install/breadbox
3636

37+
# Copy in a file containing the git SHA (created in build-docker-image.sh)
38+
COPY git-sha /install/git-sha
39+
3740
# Copy files:
3841
COPY . /install/breadbox
3942
WORKDIR /install/breadbox

breadbox/breadbox/depmap_compute_embed/context.py

Lines changed: 18 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,15 @@ def __init__(
4141
):
4242
"""
4343
A `context` dict should have:
44-
- a `dimension_type` such as "depmap_model" (which is not used in this evaluator)
4544
- an `expr` such as { "==": [ { "var": "var1" }, "Breast" ] }
46-
- a set of `vars`, each of which assigns a name to a slice query
45+
- a dictionary of `vars` which assigns names to slice queries, such as
46+
{
47+
"var1": {
48+
"dataset_id": "depmap_model_metadata",
49+
"identifier": "OncotreeLineage",
50+
"identifier_type": "column"
51+
}
52+
}
4753
"""
4854
self.expr = _encode_dots_in_vars(context["expr"])
4955
self.slice_query_vars = _escape_dots(context.get("vars", {}))
@@ -52,8 +58,8 @@ def __init__(
5258
self.get_slice_data = get_slice_data
5359

5460
# The cache is used so that slice values only need to be looked up once per context.
55-
# - The keys in this dictionary are slice queries encoded as a tuple
56-
# (ex. `("Chronos_combined", "SOX10", "feature_label")`).
61+
# - The keys in this dictionary match the values of any { "var": "<var_name>" }
62+
# expressions (and therefore should also match the keys of context["vars"]).
5763
# - The values are each an entire dictionary of slice values (indexed by given ID)
5864
self.cache = {}
5965

@@ -135,104 +141,13 @@ def __bool__(self):
135141
return True
136142

137143

138-
class LegacyContextEvaluator:
139-
"""
140-
DEPRECATED: Use `ContextEvaluator` for future development.
141-
This older version has a few differences from the new one:
142-
- Slices are specified using string slice IDs instead of slice queries
143-
- Matching on index is done with a field called "entity_label". For features,
144-
this is expected to match the feature label. For samples, this matches on sample ID (not label).
145-
This confusing behavior is part of why we're deprecating the old version.
146-
- the field "context_type" is used to specify the dimension type
147-
"""
148-
149-
def __init__(self, context: dict, get_slice_data: Callable[[str], dict[str, Any]]):
150-
"""
151-
A `context` dict should have:
152-
- a `context_type` such as "depmap_model"
153-
- an `expr` such as { "==": [ { "var": "slice/lineage/1/label" }, "Breast" ] }
154-
"""
155-
self.context_type = context["context_type"]
156-
self.expr = _encode_dots_in_vars(context["expr"])
157-
# Cache is keyed by Slice ID. Each value is an entire dictionary of slice values.
158-
self.cache = {}
159-
self.get_slice_data = get_slice_data
160-
161-
def is_match(self, dimension_label: str):
162-
"""
163-
This evaluates `expr` against a given `dimension_label`. It returns
164-
True/False depending on if `dimension_label` satifies the conditions of
165-
the expression, including any variables ("var" subexpressions) which
166-
are bound by using a magic dict that does lookups lazily.
167-
"""
168-
dictionary_override = _LegacyLazyContextDict(
169-
self.context_type, dimension_label, self.cache, self.get_slice_data
170-
)
171-
172-
try:
173-
return jsonLogic(self.expr, dictionary_override)
174-
except (TypeError, ValueError) as e:
175-
print("Exception evaluating", self.expr, "against", dimension_label)
176-
print(e)
177-
return False
178-
179-
180-
class _LegacyLazyContextDict(dict):
181-
"""
182-
The JsonLogic library wants to be passed a dictionary of values. However, we need to
183-
inject our own special cases and caching, so we override the dictionary class with
184-
special functionality. Interesting thread on overriding the Dict class:
185-
https://stackoverflow.com/questions/3387691/how-to-perfectly-override-a-dict
186-
But we don't need to "perfectly" override it; just well enough to trick the JsonLogic library.
187-
188-
This implementation uses and updates the cache that's been passed in to the constructor.
189-
"""
190-
191-
def __init__(
192-
self,
193-
context_type: str,
194-
dimension_label: str,
195-
cache: dict,
196-
get_slice_data: Callable[[str], dict[str, Any]],
197-
):
198-
self.context_type = context_type
199-
self.dimension_label = dimension_label
200-
self.cache = cache
201-
self.get_slice_data = get_slice_data
202-
203-
def __getitem__(self, prop):
204-
"""
205-
Given a slice ID, get the slice value which corresponds to the
206-
"dimension_label" that's already been passed into the constructor of this class.
207-
"""
208-
# Handle trivial case where we're just looking up a dimension's own
209-
# label. Note that this is called "entity_label" for historical
210-
# reasons.
211-
if prop == "entity_label":
212-
return self.dimension_label
213-
214-
if prop.startswith("slice/"):
215-
if prop not in self.cache:
216-
self.cache[prop] = self.get_slice_data(prop)
217-
218-
return self.cache[prop][self.dimension_label]
219-
220-
raise LookupError(
221-
f"Unable to find context property '{prop}'. Are you sure a corresponding "
222-
f"dataset exists and can be looked up by {self.context_type}?"
223-
)
224-
225-
# We don't want our virtual dictionary to appear empty.
226-
# Otherwise, the JsonLogic library will stomp it out with an empty default dict:
227-
# https://github.com/nadirizr/json-logic-py/blob/master/json_logic/__init__.py#L180
228-
def __bool__(self):
229-
return True
230-
231144

232145
def _encode_dots_in_vars(expr: dict):
233146
"""
234147
URL-encode any dots in variables. Otherwise, JsonLogic thinks they are property lookups.
235-
Example expression: { "==": [ { "var": "slice/lineage/1/label" }, "Breast" ] }
148+
This was important back when we would use slice IDs as var names. Example:
149+
{ "var": "slice/msi-0584.6%2Fmsi/CCLE (NGS)/label" }
150+
This is no longer much of an issue because the UI now generates simplified var names.
236151
"""
237152

238153
def walk(node, key):
@@ -249,7 +164,11 @@ def walk(node, key):
249164

250165

251166
def _escape_dots(d: dict) -> dict:
252-
"""Return a new dict with all dots in keys replaced by %2E."""
167+
"""
168+
Return a new dict with all dots in keys replaced by %2E. This is the complement to
169+
_encode_dots_in_vars, ensuring entries in the `slice_query_vars` dict will match
170+
those found in { "var": "..." } expressions.
171+
"""
253172
return {
254173
(k.replace(".", "%2E") if isinstance(k, str) else k): v for k, v in d.items()
255174
}

breadbox/breadbox/service/associations.py

Lines changed: 53 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
import numpy as np
33
from typing import List, Optional
4+
from breadbox.crud.dimension_ids import get_dimension_type_label_mapping_df
45

56
from breadbox.depmap_compute_embed.slice import SliceQuery
67
from breadbox.db.session import SessionWithUser
@@ -22,6 +23,7 @@
2223
import packed_cor_tables
2324

2425
log = logging.getLogger(__name__)
26+
from breadbox.utils.profiling import profiled_region
2527

2628

2729
def get_associations(
@@ -32,34 +34,20 @@ def get_associations(
3234
) -> Associations:
3335
dataset_id = slice_query.dataset_id
3436

35-
dataset = dataset_crud.get_dataset(db, db.user, dataset_id)
36-
if dataset is None:
37-
raise ResourceNotFoundError(f"Could not find dataset {dataset_id}")
37+
with profiled_region("in get_associations: get dataset"):
38+
dataset = dataset_crud.get_dataset(db, db.user, dataset_id)
39+
if dataset is None:
40+
raise ResourceNotFoundError(f"Could not find dataset {dataset_id}")
3841

39-
precomputed_assoc_tables = associations_crud.get_association_tables(
40-
db, dataset.id, association_datasets
41-
)
42+
with profiled_region("in get_associations: get_association_tables"):
43+
precomputed_assoc_tables = associations_crud.get_association_tables(
44+
db, dataset.id, association_datasets
45+
)
4246
datasets = []
4347
associated_dimensions = []
4448

45-
resolved_slice = slice_service.resolve_slice_to_components(db, slice_query,)
46-
47-
dim_label_cache = {}
48-
49-
def _get_dimension_label(dimension_type, given_id):
50-
# if the dimension type is None, we use the dataset's dimension given_id as the label
51-
if not dimension_type:
52-
return given_id
53-
if dimension_type not in dim_label_cache:
54-
dim_label_cache[dimension_type] = get_dimension_type_labels_by_id(
55-
db, dimension_type
56-
)
57-
labels_by_id = dim_label_cache[dimension_type]
58-
if given_id in labels_by_id:
59-
return labels_by_id[given_id]
60-
else:
61-
# there is a dimension type, and all valid given_ids are defined in that dimension type. If given_id is not included in the dimension type, we want act like that dimension doesn't exist
62-
return None
49+
with profiled_region("in get_associations: resolve_slice_to_components"):
50+
resolved_slice = slice_service.resolve_slice_to_components(db, slice_query,)
6351

6452
for precomputed_assoc_table in precomputed_assoc_tables:
6553
assert precomputed_assoc_table.dataset_1_id == dataset.id
@@ -85,38 +73,49 @@ def _get_dimension_label(dimension_type, given_id):
8573
filestore_location, precomputed_assoc_table.filename
8674
)
8775

88-
correlation_df = packed_cor_tables.read_cor_for_given_id(
89-
precomputed_assoc_table_path, resolved_slice.given_id
90-
)
91-
92-
for row in correlation_df.to_records():
93-
other_dimension_given_id = row["feature_given_id_1"]
94-
associated_label = _get_dimension_label(
95-
other_dimension_type, other_dimension_given_id
76+
with profiled_region("in get_associations: read_cor_for_given_id"):
77+
correlation_df = packed_cor_tables.read_cor_for_given_id(
78+
precomputed_assoc_table_path, resolved_slice.given_id
9679
)
97-
if associated_label is None:
98-
log.warning(
99-
f"Could not find {other_dimension_type} with id {other_dimension_given_id}"
100-
)
101-
continue
102-
103-
log10qvalue = row["log10qvalue"]
104-
105-
# if correlation is 1 then the qvalue can be 0 which results in log10 qvalue to be -inf
106-
# if we see this, bound it at -1e100 to avoid json serialization error
107-
if np.isinf(log10qvalue):
108-
log10qvalue = -1e100
109-
110-
associated_dimensions.append(
111-
Association(
112-
correlation=row["cor"],
113-
log10qvalue=log10qvalue,
114-
other_dataset_id=other_dataset.id,
115-
other_dataset_given_id=other_dataset.given_id,
116-
other_dimension_given_id=other_dimension_given_id,
117-
other_dimension_label=associated_label,
80+
81+
# look up all labels with a single query to produce a map that we'll use a little later.
82+
label_id_mapping_df = get_dimension_type_label_mapping_df(
83+
db,
84+
other_dimension_type,
85+
given_ids=correlation_df["feature_given_id_1"].tolist(),
86+
)
87+
label_by_given_id = {
88+
row["given_id"]: row["label"]
89+
for row in label_id_mapping_df.reset_index(drop=True).to_records()
90+
}
91+
92+
with profiled_region("in get_associations: create Association records"):
93+
for row in correlation_df.to_records():
94+
other_dimension_given_id = row["feature_given_id_1"]
95+
associated_label = label_by_given_id.get(other_dimension_given_id)
96+
if associated_label is None:
97+
log.warning(
98+
f"Could not find {other_dimension_type} with id {other_dimension_given_id}"
99+
)
100+
continue
101+
102+
log10qvalue = row["log10qvalue"]
103+
104+
# if correlation is 1 then the qvalue can be 0 which results in log10 qvalue to be -inf
105+
# if we see this, bound it at -1e100 to avoid json serialization error
106+
if np.isinf(log10qvalue):
107+
log10qvalue = -1e100
108+
109+
associated_dimensions.append(
110+
Association(
111+
correlation=row["cor"],
112+
log10qvalue=log10qvalue,
113+
other_dataset_id=other_dataset.id,
114+
other_dataset_given_id=other_dataset.given_id,
115+
other_dimension_given_id=other_dimension_given_id,
116+
other_dimension_label=associated_label,
117+
)
118118
)
119-
)
120119

121120
return Associations(
122121
dataset_name=dataset.name,

breadbox/breadbox/service/search.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,14 +157,20 @@ def row_generator():
157157
given_id=given_id,
158158
metadata_cache=metadata_cache,
159159
):
160+
label = cache_entry.get_label_for_given_id(given_id)
161+
if label is None:
162+
# if we don't have a label, this given_id didn't exist
163+
# in metadata, so move on
164+
continue
165+
160166
# if given_id in cache_entry.dimension_id_by_given_id:
161167
yield dict(
162168
property=record.property,
163169
value=record.value,
164170
group_id=dimension_type.dataset.group_id,
165171
dimension_type_name=dimension_type.name,
166172
dimension_given_id=given_id,
167-
label=cache_entry.get_label_for_given_id(given_id),
173+
label=label,
168174
)
169175

170176
dimension_search_index_row_count = 0
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
from contextlib import contextmanager
import time
import contextvars

# Current nesting depth of profiled regions; used to indent the output so
# nested regions are visually distinguishable. A ContextVar keeps the depth
# correct per async task / thread context.
profile_depth = contextvars.ContextVar("profile_depth", default=1)

# Global switch: when False (the default), profiled_region is a no-op.
PRINT_PROFILE = False


@contextmanager
def profiled_region(msg):
    """Time the enclosed block and print the elapsed wall-clock time.

    Output lines are prefixed with '>>>' repeated once per nesting level,
    e.g. '>>>>>> in get_associations: get dataset: 0.012 secs elapsed'.
    Does nothing (and adds near-zero overhead) unless PRINT_PROFILE is True.

    Args:
        msg: label printed alongside the elapsed time.
    """
    if not PRINT_PROFILE:
        yield
        return

    orig_depth = profile_depth.get()
    profile_depth.set(orig_depth + 1)
    start = time.perf_counter()
    try:
        yield
    finally:
        # Report and restore the depth even if the region raised; the original
        # code re-applied `orig_depth + 1` here, leaking one nesting level per
        # region, and skipped the report entirely on exceptions.
        elapsed = time.perf_counter() - start
        print(f"{'>>>' * orig_depth} {msg}: {elapsed:.3} secs elapsed")
        profile_depth.set(orig_depth)

breadbox/build-docker-image.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,18 @@ if [ "$1" = "" ]; then
66
fi
77
IMAGE_TAG="$1"
88

9+
if [ ! -e .git ] ; then
10+
echo "This command only works when run from the root of the git checkout. Change directory before running this command"
11+
exit 1
12+
fi
13+
914
set -ex
1015

16+
# save the current sha to help track what we built this docker image from
17+
git rev-parse HEAD > breadbox/git-sha
18+
1119
# Build Docker image
1220
export DOCKER_BUILDKIT=1
13-
#docker buildx build --platform=linux/amd64 \
1421
docker build \
1522
breadbox \
1623
-t "$IMAGE_TAG" \

breadbox/poetry.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

breadbox/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ sqlitedict="^2.1.0"
3333
google-cloud-storage = "^3.1.0"
3434
packed-cor-tables = {version = "^0.2.0", source = "public-python"}
3535
orjson = "^3.10.16"
36-
pypatch-and-run = {version = "^0.4.1", source = "public-python"}
36+
pypatch-and-run = {version = "^1.0.2", source = "public-python"}
3737
python-multipart = "^0.0.20" # fastapi 0.115.12 needs this
3838
psutil = "^7.1.0"
3939
#attrs = ">=21.3.0"

0 commit comments

Comments
 (0)