broadinstitute
diff --git a/‎.github/workflows/record_pytest_durations.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/record_pytest_durations.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎breadbox-client/bump_version_and_publish.py‎
Lines changed: 6 additions & 1 deletion b/‎breadbox-client/bump_version_and_publish.py‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎breadbox-client/pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎breadbox-client/pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎breadbox/alembic/versions/a33ed87f86ff_add_release_version_models.py‎
Lines changed: 129 additions & 0 deletions b/‎breadbox/alembic/versions/a33ed87f86ff_add_release_version_models.py‎
Lines changed: 129 additions & 0 deletions
diff --git a/‎breadbox/breadbox/api/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎breadbox/breadbox/api/__init__.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎breadbox/breadbox/api/datasets.py‎
Lines changed: 45 additions & 2 deletions b/‎breadbox/breadbox/api/datasets.py‎
Lines changed: 45 additions & 2 deletions
diff --git a/‎breadbox/breadbox/api/release_files.py‎
Lines changed: 54 additions & 0 deletions b/‎breadbox/breadbox/api/release_files.py‎
Lines changed: 54 additions & 0 deletions
@@ -13,7 +13,7 @@ jobs:
       - name: Set up Python 3.9
         uses: actions/setup-python@7f4fc3e22c37d6ff65e88745f38bd3157c663f7c # v4.9.1
         with:
-          python-version: 3.9
+          python-version: 3.13
       - name: Cache pip
         uses: actions/cache@6f8efc29b200d32929f49075959781ed54ec270c # v3.5.0
         with:
 
@@ -197,7 +197,12 @@ def rule_from_conventional_commit_type(commit_type, is_breaking):
     elif commit_type in MINOR_CONVENTIONAL_COMMIT_TYPES:
         return lambda major, minor, patch: (major, minor+1, 0)
     elif commit_type in IGNORE_CONVENTIONAL_COMMIT_TYPES:
-        return lambda major, minor, patch: (major, minor, patch)
+        # Ignored types (build, chore, ci, docs, ...) must not yield a bump
+        # rule at all. Returning a no-op lambda here causes get_bumps() to
+        # yield these commits, which then makes the script "bump" the version
+        # to the same value it already has -- tripping the assert in
+        # update_version_in_files() when the regex substitution is a no-op.
+        return None
     else:
         return None
 
 
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "breadbox-client"
-version = "4.9.0"
+version = "4.12.0"
 description = "A client library for accessing Breadbox"
 
 authors = []
 
@@ -0,0 +1,129 @@
+"""Add release version models
+
+Revision ID: a33ed87f86ff
+Revises: 0c0dd1a8925c
+Create Date: 2026-04-07 18:23:02.276428
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "a33ed87f86ff"
+down_revision = "0c0dd1a8925c"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Create the release-file search index and the three release tables.
+
+    Runs, in order:
+
+    * the ``release_file_search_index`` FTS5 virtual table (raw SQL),
+    * ``release_version`` plus indexes on ``content_hash``, ``release_name``
+      and ``version_name``,
+    * ``release_file`` plus an index on ``file_name``,
+    * ``release_pipeline``.
+
+    ``release_file`` and ``release_pipeline`` both reference
+    ``release_version.id`` with ``ON DELETE CASCADE``. The virtual table is
+    only created here; nothing in this migration populates it.
+    """
+    # Create the FTS5 virtual table used as the release-file search index.
+    # It is issued as raw SQL (outside the autogenerated section below), and
+    # IF NOT EXISTS keeps the statement from failing if the table is already
+    # present.
+    op.execute(
+        """
+        CREATE VIRTUAL TABLE IF NOT EXISTS release_file_search_index USING fts5(
+        file_id,
+        file_name,
+        file_description,
+        file_datatype,
+        release_version_name,
+        release_name,
+        release_version_description,
+        release_version_content_hash,
+        tokenize='unicode61'
+        );
+    """
+    )
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "release_version",
+        sa.Column("version_name", sa.String(), nullable=False),
+        sa.Column("version_date", sa.Date(), nullable=False),
+        sa.Column("description", sa.String(), nullable=True),
+        sa.Column("content_hash", sa.String(length=32), nullable=False),
+        sa.Column("release_name", sa.String(), nullable=False),
+        sa.Column("citation", sa.String(), nullable=True),
+        sa.Column("funding", sa.String(), nullable=True),
+        sa.Column("terms", sa.String(), nullable=True),
+        sa.Column("id", sa.String(length=36), nullable=False),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_release_version")),
+        # NOTE(review): the constraint covers (version_name, release_name),
+        # although its name mentions only version_name.
+        sa.UniqueConstraint(
+            "version_name", "release_name", name=op.f("uq_release_version_version_name")
+        ),
+    )
+    with op.batch_alter_table("release_version", schema=None) as batch_op:
+        batch_op.create_index(
+            batch_op.f("ix_release_version_content_hash"),
+            ["content_hash"],
+            unique=False,
+        )
+        batch_op.create_index(
+            batch_op.f("ix_release_version_release_name"),
+            ["release_name"],
+            unique=False,
+        )
+        batch_op.create_index(
+            batch_op.f("ix_release_version_version_name"),
+            ["version_name"],
+            unique=False,
+        )
+
+    op.create_table(
+        "release_file",
+        # NOTE(review): release_version_id is an unbounded String, while the
+        # referenced release_version.id is String(length=36) -- confirm this
+        # matches the ORM model.
+        sa.Column("release_version_id", sa.String(), nullable=False),
+        sa.Column("file_name", sa.String(), nullable=False),
+        sa.Column("datatype", sa.String(), nullable=False),
+        sa.Column("size", sa.String(), nullable=True),
+        sa.Column("description", sa.String(), nullable=True),
+        sa.Column("bucket_url", sa.String(), nullable=True),
+        sa.Column("taiga_id", sa.String(), nullable=True),
+        sa.Column("canonical_taiga_id", sa.String(), nullable=True),
+        sa.Column("md5_hash", sa.String(length=32), nullable=True),
+        sa.Column("version", sa.Integer(), nullable=True),
+        sa.Column("pipeline_name", sa.String(), nullable=True),
+        sa.Column("is_main_file", sa.Boolean(), nullable=False),
+        sa.Column("id", sa.String(length=36), nullable=False),
+        # Deleting a release_version row also deletes its release_file rows.
+        sa.ForeignKeyConstraint(
+            ["release_version_id"],
+            ["release_version.id"],
+            name=op.f("fk_release_file_release_version_id_release_version"),
+            ondelete="CASCADE",
+        ),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_release_file")),
+    )
+    with op.batch_alter_table("release_file", schema=None) as batch_op:
+        batch_op.create_index(
+            batch_op.f("ix_release_file_file_name"), ["file_name"], unique=False
+        )
+
+    op.create_table(
+        "release_pipeline",
+        sa.Column("release_version_id", sa.String(), nullable=False),
+        sa.Column("pipeline_name", sa.String(), nullable=False),
+        sa.Column("description", sa.String(), nullable=True),
+        sa.Column("id", sa.String(length=36), nullable=False),
+        # Deleting a release_version row also deletes its release_pipeline rows.
+        sa.ForeignKeyConstraint(
+            ["release_version_id"],
+            ["release_version.id"],
+            name=op.f("fk_release_pipeline_release_version_id_release_version"),
+            ondelete="CASCADE",
+        ),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_release_pipeline")),
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    """Reverse ``upgrade()``: drop the release tables, then the search index.
+
+    The tables that reference ``release_version`` (``release_pipeline`` and
+    ``release_file``) are dropped before ``release_version`` itself, and each
+    table's indexes are dropped before the table.
+    """
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table("release_pipeline")
+    with op.batch_alter_table("release_file", schema=None) as batch_op:
+        batch_op.drop_index(batch_op.f("ix_release_file_file_name"))
+
+    op.drop_table("release_file")
+    with op.batch_alter_table("release_version", schema=None) as batch_op:
+        batch_op.drop_index(batch_op.f("ix_release_version_version_name"))
+        batch_op.drop_index(batch_op.f("ix_release_version_release_name"))
+        batch_op.drop_index(batch_op.f("ix_release_version_content_hash"))
+
+    op.drop_table("release_version")
+    # Remove the FTS5 virtual table that upgrade() creates with raw SQL;
+    # IF EXISTS keeps the downgrade from failing if it is already gone.
+    op.execute("DROP TABLE IF EXISTS release_file_search_index")
+    # ### end Alembic commands ###
@@ -3,6 +3,8 @@
 
 from .uploads import router as uploads_router
 from .datasets import router as datasets_router
+from .release_versions import router as release_versions_router
+from .release_files import router as release_files_router
 from .dataset_uploads import router as dataset_uploads_router
 from .downloads import router as downloads_router
 from .groups import router as groups_router
@@ -20,6 +22,8 @@
 
 api_router = APIRouter(responses=ERROR_RESPONSES)  # type: ignore
 api_router.include_router(datasets_router)
+api_router.include_router(release_versions_router)
+api_router.include_router(release_files_router)
 api_router.include_router(dataset_uploads_router)
 api_router.include_router(uploads_router)
 api_router.include_router(downloads_router)
 
@@ -57,6 +57,7 @@
     DimensionDataResponse,
     SliceQueryIdentifierType,
 )
+from breadbox.schemas.context import SliceQueryRef
 from breadbox.service import dataset as dataset_service
 from breadbox.service import metadata as metadata_service
 from breadbox.service import slice as slice_service
@@ -69,6 +70,26 @@
 log = getLogger(__name__)
 
 
+def _to_internal_slice_query(ref: SliceQueryRef) -> SliceQuery:
+    """Convert a Pydantic SliceQueryRef to the internal SliceQuery dataclass.
+
+    The conversion is recursive: a nested ``reindex_through`` reference is
+    converted the same way, and an absent one becomes ``None``.
+    """
+    return SliceQuery(
+        dataset_id=ref.dataset_id,
+        identifier=ref.identifier,
+        # NOTE(review): this passes the enum's .value, whereas
+        # get_dimension_data builds the outermost query from
+        # identifier_type.name -- confirm both produce the same string.
+        identifier_type=ref.identifier_type.value,
+        reindex_through=_to_internal_slice_query(ref.reindex_through)
+        if ref.reindex_through
+        else None,
+    )
+
+
+def _get_root_query(sq: SliceQuery) -> SliceQuery:
+    """Chase reindex_through to find the root (innermost) step in the chain.
+
+    Returns ``sq`` itself when it has no ``reindex_through``.
+    """
+    current = sq
+    # Follow the links until a query without reindex_through is reached.
+    # There is no cycle guard, so the chain is assumed to be finite.
+    while current.reindex_through is not None:
+        current = current.reindex_through
+    return current
+
+
 @router.get(
     "/",
     operation_id="get_datasets",
@@ -411,7 +432,13 @@ def get_dimensions(
     response_model=DimensionDataResponse,
 )
 def get_dimension_data(
-    # The request body should be a SliceQuery with the following three fields:
+    # TODO: This endpoint inlines the SliceQuery fields as individual Body() params
+    # instead of referencing the SliceQuery schema directly. This is historical drift.
+    # We're keeping it this way for now to avoid breaking the auto-generated Breadbox
+    # Python client and the Breadbox Facade, which may depend on the Body_get_dimension_data
+    # schema name and structure. Once we've confirmed those consumers can handle the
+    # change, we should refactor this to accept a single SliceQuery (or SliceQueryRef)
+    # body parameter, which would also make the OpenAPI spec self-documenting.
     dataset_id: Annotated[str, Body(description="The UUID or given ID of a dataset.")],
     identifier: Annotated[
         str,
@@ -425,6 +452,12 @@ def get_dimension_data(
             description="Denotes the type of identifier being used and the axis being queried."
         ),
     ],
+    reindex_through: Annotated[
+        Optional[SliceQueryRef],
+        Body(
+            description="Optional chain of FK joins to reindex the result by a different dimension type."
+        ),
+    ] = None,
     db: SessionWithUser = Depends(get_db_with_user),
     settings: Settings = Depends(get_settings),
 ):
@@ -435,11 +468,21 @@ def get_dimension_data(
         dataset_id=dataset_id,
         identifier=identifier,
         identifier_type=identifier_type.name,
+        reindex_through=_to_internal_slice_query(reindex_through)
+        if reindex_through
+        else None,
     )
     slice_values_by_id = slice_service.get_slice_data(
         db, settings.filestore_location, parsed_slice_query
     )
-    labels_by_id = metadata_service.get_labels_for_slice_type(db, parsed_slice_query)
+
+    # When reindex_through is present, the result is indexed by the root's entity IDs,
+    # so labels must come from the root's dimension type, not the leaf's.
+    label_query = parsed_slice_query
+    if parsed_slice_query.reindex_through is not None:
+        label_query = _get_root_query(parsed_slice_query)
+
+    labels_by_id = metadata_service.get_labels_for_slice_type(db, label_query)
 
     # Only the values which have corresponding metadata should be returned
     all_dataset_given_ids = slice_values_by_id.index.to_list()
 
@@ -0,0 +1,54 @@
+from typing import List
+
+from fastapi import APIRouter, Depends, Query
+
+from breadbox.schemas.release_version import ReleaseFileSearchResponse
+from ..crud import release_version as release_version_crud
+from breadbox.api.dependencies import get_db_with_user
+from breadbox.db.session import SessionWithUser
+
+# Separated from release-versions to reduce confusion about the level
+# of granularity of the full text search. Search returns release file
+# level data.
+router = APIRouter(prefix="/release-files", tags=["release-files"])
+
+
+# GET <router prefix>/search: paginated search over release files. The
+# docstring below is left untouched because FastAPI publishes endpoint
+# docstrings as the operation description in the OpenAPI schema.
+@router.get(
+    "/search",
+    response_model=List[ReleaseFileSearchResponse],
+    operation_id="search_release_files",
+)
+def search_release_files(
+    # Required (the `...` default) and must be at least one character long.
+    q: str = Query(
+        ...,
+        min_length=1,
+        description="Search query as the user types in the global searchbar.",
+    ),
+    limit: int = Query(
+        50, ge=1, le=100, description="Number of results to return per page. Max 100.",
+    ),  # ge "greater than or equal to", le "less than or equal to"
+    offset: int = Query(
+        0,
+        ge=0,
+        description="Number of results to skip from the beginning (used for pagination).",
+    ),
+    db: SessionWithUser = Depends(get_db_with_user),
+):
+    """
+    Search for individual files across all releases using the FTS5 index.
+    Returns denormalized metadata for each matching file.
+    
+    If you have 150 results:
+
+    Page 1: limit=50, offset=0 (Gets results 1-50)
+
+    Page 2: limit=50, offset=50 (Gets results 51-100)
+
+    Page 3: limit=50, offset=100 (Gets results 101-150)
+    """
+    # This uses the SQLite FTS5 'MATCH' operator
+    # NOTE(review): q is forwarded to the CRUD layer unmodified -- confirm
+    # that layer quotes/escapes FTS5 query syntax in user-typed input.
+    results = release_version_crud.search_release_files(
+        db=db, q=q, limit=limit, offset=offset
+    )
+
+    return results