Skip to content

Commit 4b711b4

Browse files
committed
Merge branch 'master' of github.com:broadinstitute/depmap-portal
2 parents f201d41 + 15eb80f commit 4b711b4

File tree

32 files changed

+3264
-160
lines changed

32 files changed

+3264
-160
lines changed

.github/workflows/record_pytest_durations.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ jobs:
1313
- name: Set up Python 3.9
1414
uses: actions/setup-python@7f4fc3e22c37d6ff65e88745f38bd3157c663f7c # v4.9.1
1515
with:
16-
python-version: 3.9
16+
python-version: 3.13
1717
- name: Cache pip
1818
uses: actions/cache@6f8efc29b200d32929f49075959781ed54ec270c # v3.5.0
1919
with:

breadbox-client/bump_version_and_publish.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,12 @@ def rule_from_conventional_commit_type(commit_type, is_breaking):
197197
elif commit_type in MINOR_CONVENTIONAL_COMMIT_TYPES:
198198
return lambda major, minor, patch: (major, minor+1, 0)
199199
elif commit_type in IGNORE_CONVENTIONAL_COMMIT_TYPES:
200-
return lambda major, minor, patch: (major, minor, patch)
200+
# Ignored types (build, chore, ci, docs, ...) must not yield a bump
201+
# rule at all. Returning a no-op lambda here causes get_bumps() to
202+
# yield these commits, which then makes the script "bump" the version
203+
# to the same value it already has -- tripping the assert in
204+
# update_version_in_files() when the regex substitution is a no-op.
205+
return None
201206
else:
202207
return None
203208

breadbox-client/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "breadbox-client"
3-
version = "4.9.0"
3+
version = "4.12.0"
44
description = "A client library for accessing Breadbox"
55

66
authors = []
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""Add release version models
2+
3+
Revision ID: a33ed87f86ff
4+
Revises: 0c0dd1a8925c
5+
Create Date: 2026-04-07 18:23:02.276428
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
11+
12+
# revision identifiers, used by Alembic.
13+
revision = "a33ed87f86ff"
14+
down_revision = "0c0dd1a8925c"
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
    """Create the release search index and the release_* tables.

    Creates, in order: the ``release_file_search_index`` FTS5 virtual table,
    then the ``release_version``, ``release_file`` and ``release_pipeline``
    tables together with their indexes.
    """
    # Hand-written step: the FTS5 virtual table is created with raw SQL.
    # IF NOT EXISTS keeps the statement from failing when the table is
    # already present.
    create_search_index_sql = """
        CREATE VIRTUAL TABLE IF NOT EXISTS release_file_search_index USING fts5(
            file_id,
            file_name,
            file_description,
            file_datatype,
            release_version_name,
            release_name,
            release_version_description,
            release_version_content_hash,
            tokenize='unicode61'
        );
    """
    op.execute(create_search_index_sql)

    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "release_version",
        sa.Column("version_name", sa.String(), nullable=False),
        sa.Column("version_date", sa.Date(), nullable=False),
        sa.Column("description", sa.String(), nullable=True),
        sa.Column("content_hash", sa.String(length=32), nullable=False),
        sa.Column("release_name", sa.String(), nullable=False),
        sa.Column("citation", sa.String(), nullable=True),
        sa.Column("funding", sa.String(), nullable=True),
        sa.Column("terms", sa.String(), nullable=True),
        sa.Column("id", sa.String(length=36), nullable=False),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_release_version")),
        # Uniqueness is enforced on the (version_name, release_name) pair.
        sa.UniqueConstraint(
            "version_name", "release_name", name=op.f("uq_release_version_version_name")
        ),
    )
    # One non-unique index per indexed release_version column, created in the
    # order content_hash, release_name, version_name.
    with op.batch_alter_table("release_version", schema=None) as batch_op:
        for column_name in ("content_hash", "release_name", "version_name"):
            batch_op.create_index(
                batch_op.f(f"ix_release_version_{column_name}"),
                [column_name],
                unique=False,
            )

    op.create_table(
        "release_file",
        sa.Column("release_version_id", sa.String(), nullable=False),
        sa.Column("file_name", sa.String(), nullable=False),
        sa.Column("datatype", sa.String(), nullable=False),
        sa.Column("size", sa.String(), nullable=True),
        sa.Column("description", sa.String(), nullable=True),
        sa.Column("bucket_url", sa.String(), nullable=True),
        sa.Column("taiga_id", sa.String(), nullable=True),
        sa.Column("canonical_taiga_id", sa.String(), nullable=True),
        sa.Column("md5_hash", sa.String(length=32), nullable=True),
        sa.Column("version", sa.Integer(), nullable=True),
        sa.Column("pipeline_name", sa.String(), nullable=True),
        sa.Column("is_main_file", sa.Boolean(), nullable=False),
        sa.Column("id", sa.String(length=36), nullable=False),
        # Files are removed together with their parent release_version row.
        sa.ForeignKeyConstraint(
            ["release_version_id"],
            ["release_version.id"],
            name=op.f("fk_release_file_release_version_id_release_version"),
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_release_file")),
    )
    with op.batch_alter_table("release_file", schema=None) as batch_op:
        batch_op.create_index(
            batch_op.f("ix_release_file_file_name"), ["file_name"], unique=False
        )

    op.create_table(
        "release_pipeline",
        sa.Column("release_version_id", sa.String(), nullable=False),
        sa.Column("pipeline_name", sa.String(), nullable=False),
        sa.Column("description", sa.String(), nullable=True),
        sa.Column("id", sa.String(length=36), nullable=False),
        # Pipelines are removed together with their parent release_version row.
        sa.ForeignKeyConstraint(
            ["release_version_id"],
            ["release_version.id"],
            name=op.f("fk_release_pipeline_release_version_id_release_version"),
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_release_pipeline")),
    )
    # ### end Alembic commands ###
113+
114+
115+
def downgrade():
    """Drop everything created by upgrade(), in reverse order of creation.

    Child tables (``release_pipeline``, ``release_file``) are dropped before
    ``release_version``, and the ``release_file_search_index`` table is
    dropped last.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table("release_pipeline")

    with op.batch_alter_table("release_file", schema=None) as batch_op:
        batch_op.drop_index(batch_op.f("ix_release_file_file_name"))
    op.drop_table("release_file")

    # Indexes are dropped in the reverse of the order upgrade() created them.
    with op.batch_alter_table("release_version", schema=None) as batch_op:
        for column_name in ("version_name", "release_name", "content_hash"):
            batch_op.drop_index(batch_op.f(f"ix_release_version_{column_name}"))
    op.drop_table("release_version")

    # IF EXISTS keeps the downgrade from failing when the search index table
    # is already gone.
    op.execute("DROP TABLE IF EXISTS release_file_search_index")
    # ### end Alembic commands ###

breadbox/breadbox/api/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
from .uploads import router as uploads_router
55
from .datasets import router as datasets_router
6+
from .release_versions import router as release_versions_router
7+
from .release_files import router as release_files_router
68
from .dataset_uploads import router as dataset_uploads_router
79
from .downloads import router as downloads_router
810
from .groups import router as groups_router
@@ -20,6 +22,8 @@
2022

2123
api_router = APIRouter(responses=ERROR_RESPONSES) # type: ignore
2224
api_router.include_router(datasets_router)
25+
api_router.include_router(release_versions_router)
26+
api_router.include_router(release_files_router)
2327
api_router.include_router(dataset_uploads_router)
2428
api_router.include_router(uploads_router)
2529
api_router.include_router(downloads_router)

breadbox/breadbox/api/datasets.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
DimensionDataResponse,
5858
SliceQueryIdentifierType,
5959
)
60+
from breadbox.schemas.context import SliceQueryRef
6061
from breadbox.service import dataset as dataset_service
6162
from breadbox.service import metadata as metadata_service
6263
from breadbox.service import slice as slice_service
@@ -69,6 +70,26 @@
6970
log = getLogger(__name__)
7071

7172

73+
def _to_internal_slice_query(ref: SliceQueryRef) -> SliceQuery:
    """Build the internal SliceQuery equivalent of a SliceQueryRef.

    Any nested ``reindex_through`` reference is converted recursively, so the
    returned SliceQuery carries the same chain as ``ref``.
    """
    # A falsy reindex_through marks the end of the chain.
    if ref.reindex_through:
        parent_query = _to_internal_slice_query(ref.reindex_through)
    else:
        parent_query = None

    return SliceQuery(
        dataset_id=ref.dataset_id,
        identifier=ref.identifier,
        # The internal query stores the enum member's value, not the member.
        identifier_type=ref.identifier_type.value,
        reindex_through=parent_query,
    )
83+
84+
85+
def _get_root_query(sq: SliceQuery) -> SliceQuery:
    """Return the last SliceQuery reached by following reindex_through links.

    Starting at ``sq``, each ``reindex_through`` link is followed until one is
    None. A query without a ``reindex_through`` is returned as-is.
    """
    node = sq
    while True:
        parent = node.reindex_through
        if parent is None:
            # No further link: this node is the root of the chain.
            return node
        node = parent
91+
92+
7293
@router.get(
7394
"/",
7495
operation_id="get_datasets",
@@ -411,7 +432,13 @@ def get_dimensions(
411432
response_model=DimensionDataResponse,
412433
)
413434
def get_dimension_data(
414-
# The request body should be a SliceQuery with the following three fields:
435+
# TODO: This endpoint inlines the SliceQuery fields as individual Body() params
436+
# instead of referencing the SliceQuery schema directly. This is historical drift.
437+
# We're keeping it this way for now to avoid breaking the auto-generated Breadbox
438+
# Python client and the Breadbox Facade, which may depend on the Body_get_dimension_data
439+
# schema name and structure. Once we've confirmed those consumers can handle the
440+
# change, we should refactor this to accept a single SliceQuery (or SliceQueryRef)
441+
# body parameter, which would also make the OpenAPI spec self-documenting.
415442
dataset_id: Annotated[str, Body(description="The UUID or given ID of a dataset.")],
416443
identifier: Annotated[
417444
str,
@@ -425,6 +452,12 @@ def get_dimension_data(
425452
description="Denotes the type of identifier being used and the axis being queried."
426453
),
427454
],
455+
reindex_through: Annotated[
456+
Optional[SliceQueryRef],
457+
Body(
458+
description="Optional chain of FK joins to reindex the result by a different dimension type."
459+
),
460+
] = None,
428461
db: SessionWithUser = Depends(get_db_with_user),
429462
settings: Settings = Depends(get_settings),
430463
):
@@ -435,11 +468,21 @@ def get_dimension_data(
435468
dataset_id=dataset_id,
436469
identifier=identifier,
437470
identifier_type=identifier_type.name,
471+
reindex_through=_to_internal_slice_query(reindex_through)
472+
if reindex_through
473+
else None,
438474
)
439475
slice_values_by_id = slice_service.get_slice_data(
440476
db, settings.filestore_location, parsed_slice_query
441477
)
442-
labels_by_id = metadata_service.get_labels_for_slice_type(db, parsed_slice_query)
478+
479+
# When reindex_through is present, the result is indexed by the root's entity IDs,
480+
# so labels must come from the root's dimension type, not the leaf's.
481+
label_query = parsed_slice_query
482+
if parsed_slice_query.reindex_through is not None:
483+
label_query = _get_root_query(parsed_slice_query)
484+
485+
labels_by_id = metadata_service.get_labels_for_slice_type(db, label_query)
443486

444487
# Only the values which have corresponding metadata should be returned
445488
all_dataset_given_ids = slice_values_by_id.index.to_list()
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from typing import List
2+
3+
from fastapi import APIRouter, Depends, Query
4+
5+
from breadbox.schemas.release_version import ReleaseFileSearchResponse
6+
from ..crud import release_version as release_version_crud
7+
from breadbox.api.dependencies import get_db_with_user
8+
from breadbox.db.session import SessionWithUser
9+
10+
# Separated from release-versions to reduce confusion about the level
11+
# of granularity of the full text search. Search returns release file
12+
# level data.
13+
router = APIRouter(prefix="/release-files", tags=["release-files"])
14+
15+
16+
@router.get(
    "/search",
    response_model=List[ReleaseFileSearchResponse],
    operation_id="search_release_files",
)
def search_release_files(
    q: str = Query(
        ...,
        min_length=1,
        description="Search query as the user types in the global searchbar.",
    ),
    limit: int = Query(
        50, ge=1, le=100, description="Number of results to return per page. Max 100.",
    ),  # validated bounds: 1 <= limit <= 100
    offset: int = Query(
        0,
        ge=0,
        description="Number of results to skip from the beginning (used for pagination).",
    ),
    db: SessionWithUser = Depends(get_db_with_user),
):
    """
    Search for individual files across all releases using the FTS5 index.
    Returns denormalized metadata for each matching file.

    If you have 150 results:

    Page 1: limit=50, offset=0 (Gets results 1-50)

    Page 2: limit=50, offset=50 (Gets results 51-100)

    Page 3: limit=50, offset=100 (Gets results 101-150)
    """
    # The query itself lives in the CRUD layer (noted in the original code as
    # using the SQLite FTS5 'MATCH' operator); this handler only forwards the
    # validated query parameters and returns the rows unchanged.
    return release_version_crud.search_release_files(
        q=q, limit=limit, offset=offset, db=db
    )

0 commit comments

Comments
 (0)