feature: add task filtering based on metadata (#449)

saikonen · web-flow · commit 97973df0e334 · 2025-03-17T12:12:35.000+02:00
diff --git a/services/data/postgres_async_db.py b/services/data/postgres_async_db.py
@@ -814,6 +814,57 @@ async def get_metadata(
         }
         return await self.get_records(filter_dict=filter_dict)
 
+    async def get_filtered_task_pathspecs(self, flow_id: str, run_id: str, step_name: str, field_name: str, pattern: str):
+        """
+        Return the distinct pathspecs of the tasks in a step whose metadata matches the given filters.
+
+        field_name, when truthy, must equal the metadata field name; pattern, when truthy, is a
+        regexp that the metadata value must match. Both filters are optional and are ANDed.
+
+        Returns a (DBResponse, pagination) tuple. On success the body is a list of
+        "flow_id/run/step_name/task" strings ordered by task_id, preferring run_id over
+        run_number and task_name over task_id. Any non-200 response is returned unchanged.
+        """
+        run_id_key, run_id_value = translate_run_key(run_id)
+        filter_dict = {"flow_id": flow_id, run_id_key: run_id_value, "step_name": step_name}
+        # Identifier filters whose value is None are left out of the WHERE clause.
+        active_filters = {k: v for k, v in filter_dict.items() if v is not None}
+        conditions = [f"{k} = %s" for k in active_filters]
+        values = list(active_filters.values())
+
+        if field_name:
+            conditions.append("field_name = %s")
+            values.append(field_name)
+        if pattern:
+            conditions.append("regexp_match(value, %s) IS NOT NULL")
+            values.append(pattern)
+
+        # We must return distinct task pathspecs, so we construct the select statement by hand
+        select_sql = """
+        SELECT DISTINCT {select_columns} FROM (
+            SELECT
+                {keys}
+            FROM {table_name}
+        ) T
+        WHERE {where}
+        ORDER BY task_id
+        """.format(
+            select_columns="flow_id, run_number, run_id, step_name, task_name, task_id",
+            keys=",".join(self.select_columns),
+            table_name=self.table_name,
+            where=" AND ".join(conditions),
+        ).strip()
+
+        db_response, pagination = await self.execute_sql(select_sql=select_sql, values=values, serialize=False)
+        # Flatten rows only on success. NOTE(review): assumes execute_sql reports failures via a non-200 response_code - confirm.
+        if db_response.response_code != 200:
+            return db_response, pagination
+        pathspecs = [
+            f"{flow}/{run_name or run_number}/{step}/{task_name or task_id}"
+            for flow, run_number, run_name, step, task_name, task_id in db_response.body
+        ]
+        return DBResponse(body=pathspecs, response_code=db_response.response_code), pagination
+
 
 class AsyncArtifactTablePostgres(AsyncPostgresTable):
     artifact_dict = {}
diff --git a/services/metadata_service/api/task.py b/services/metadata_service/api/task.py
@@ -19,6 +19,11 @@ def __init__(self, app):
             "/flows/{flow_id}/runs/{run_number}/steps/{step_name}/tasks",
             self.get_tasks,
         )
+        app.router.add_route(
+            "GET",
+            "/flows/{flow_id}/runs/{run_number}/steps/{step_name}/filtered_tasks",
+            self.get_filtered_tasks,
+        )
         app.router.add_route(
             "GET",
             "/flows/{flow_id}/runs/{run_number}/steps/{step_name}/tasks/{task_id}",
@@ -34,6 +39,7 @@ def __init__(self, app):
                              self.tasks_heartbeat)
         self._async_table = AsyncPostgresDB.get_instance().task_table_postgres
         self._async_run_table = AsyncPostgresDB.get_instance().run_table_postgres
+        self._async_metadata_table = AsyncPostgresDB.get_instance().metadata_table_postgres
         self._db = AsyncPostgresDB.get_instance()
 
     @format_response
@@ -76,6 +82,57 @@ async def get_tasks(self, request):
         db_response = await apply_run_tags_to_db_response(flow_id, run_number, self._async_run_table, db_response)
         return db_response
 
+    @format_response
+    @handle_exceptions
+    async def get_filtered_tasks(self, request):
+        """
+        ---
+        description: get all task ids that match the provided metadata field name and/or value.
+        tags:
+        - Tasks
+        parameters:
+        - name: "flow_id"
+          in: "path"
+          description: "flow_id"
+          required: true
+          type: "string"
+        - name: "run_number"
+          in: "path"
+          description: "run_number"
+          required: true
+          type: "string"
+        - name: "step_name"
+          in: "path"
+          description: "step_name"
+          required: true
+          type: "string"
+        - name: "metadata_field_name"
+          in: "query"
+          description: "Metadata field name to filter with"
+          type: "string"
+        - name: "pattern"
+          in: "query"
+          description: "A regexp pattern to filter the metadata values on"
+          type: "string"
+        produces:
+        - text/plain
+        responses:
+            "200":
+                description: successful operation. Return tasks
+            "405":
+                description: invalid HTTP Method
+        """
+        path_params = request.match_info
+        # Both filters are optional query parameters; a missing one is passed on as None.
+        db_response, _ = await self._async_metadata_table.get_filtered_task_pathspecs(
+            path_params.get("flow_id"),
+            path_params.get("run_number"),
+            path_params.get("step_name"),
+            request.query.get("metadata_field_name"),
+            request.query.get("pattern"),
+        )
+        return db_response
+
     @format_response
     @handle_exceptions
     async def get_task(self, request):
diff --git a/services/metadata_service/api/utils.py b/services/metadata_service/api/utils.py
@@ -59,6 +59,8 @@ def handle_exceptions(func):
     async def wrapper(*args, **kwargs):
         try:
             return await func(*args, **kwargs)
+        except web.HTTPClientError as ex:  # client error raised by the handler: answer with its own status/reason, not a generic 500
+            return ServiceResponse(ex.status_code, ex.reason)
         except Exception as err:
             return http_500(str(err))
 
diff --git a/services/metadata_service/tests/integration_tests/task_test.py b/services/metadata_service/tests/integration_tests/task_test.py
@@ -3,7 +3,7 @@
 from .utils import (
     cli, db,
     assert_api_get_response, assert_api_post_response, compare_partial,
-    add_flow, add_run, add_step, add_task, update_objects_with_run_tags
+    add_flow, add_run, add_step, add_task, add_metadata, update_objects_with_run_tags
 )
 import pytest
 
@@ -185,6 +185,99 @@ async def test_tasks_get(cli, db):
     # getting tasks for non-existent step should return empty list
     await assert_api_get_response(cli, "/flows/{flow_id}/runs/{run_number}/steps/nonexistent/tasks".format(**_first_task), status=200, data=[])
 
+async def test_filtered_tasks_get(cli, db):
+    # create a flow, run and step for the test
+    _flow = (await add_flow(db, "TestFlow", "test_user-1", ["a_tag", "b_tag"], ["runtime:test"])).body
+    _run = (await add_run(db, flow_id=_flow["flow_id"])).body
+    _step = (await add_step(db, flow_id=_run["flow_id"], run_number=_run["run_number"], step_name="first_step")).body
+
+    # add three tasks to the step; the third one gets no metadata and is not expected in any result below
+    _step_ids = {"flow_id": _step["flow_id"], "run_number": _step["run_number"], "step_name": _step["step_name"]}
+    _first_task = (await add_task(db, **_step_ids)).body
+    _second_task = (await add_task(db, **_step_ids)).body
+    _third_task = (await add_task(db, **_step_ids)).body
+
+    # add metadata to filter on: both tasks share field_b/value_b but hold different values for field_a
+    for _task, _field, _value in [
+        (_first_task, "field_a", "value_a"),
+        (_first_task, "field_b", "value_b"),
+        (_second_task, "field_a", "not_value_a"),
+        (_second_task, "field_b", "value_b"),
+    ]:
+        await add_metadata(db, flow_id=_task["flow_id"], run_number=_task["run_number"], step_name=_task["step_name"], task_id=_task["task_id"], metadata={"field_name": _field, "value": _value})
+
+    _url = "/flows/{flow_id}/runs/{run_number}/steps/{step_name}/filtered_tasks".format(**_first_task)
+    _first, _second = task_pathspec(_first_task), task_pathspec(_second_task)
+
+    # filtering with a shared key should return all relevant tasks
+    await assert_api_get_response(cli, _url + "?metadata_field_name=field_a", data=[_first, _second])
+
+    # filtering with a shared value should return all relevant tasks
+    await assert_api_get_response(cli, _url + "?pattern=value_b", data=[_first, _second])
+
+    # filtering with a regexp should return all relevant tasks
+    await assert_api_get_response(cli, _url + "?pattern=value_.*", data=[_first, _second])
+
+    # filtering with a shared key&value should return all relevant tasks
+    await assert_api_get_response(cli, _url + "?metadata_field_name=field_b&pattern=value_b", data=[_first, _second])
+
+    # filtering with a key&value held by a single task should return only that task
+    await assert_api_get_response(cli, _url + "?metadata_field_name=field_a&pattern=not_value_a", data=[_second])
+
+    # filtering with a mixed key&value should not return results
+    await assert_api_get_response(cli, _url + "?metadata_field_name=field_a&pattern=value_b", data=[])
+
+    # not providing filters should return every task that has metadata
+    await assert_api_get_response(cli, _url, data=[_first, _second])
+
+
+async def test_filtered_tasks_mixed_ids_get(cli, db):
+    # create a flow, run and step for the test
+    _flow = (await add_flow(db, "TestFlow", "test_user-1", ["a_tag", "b_tag"], ["runtime:test"])).body
+    _run = (await add_run(db, flow_id=_flow["flow_id"])).body
+    _step = (await add_step(db, flow_id=_run["flow_id"], run_number=_run["run_number"], step_name="first_step")).body
+
+    # add tasks to the step: the first one is created with a task name, the second one without
+    _step_ids = {"flow_id": _step["flow_id"], "run_number": _step["run_number"], "step_name": _step["step_name"]}
+    _first_task = (await add_task(db, **_step_ids, task_name="first-task-1")).body
+    # we need to refetch the task as the return does not contain the internal task ID we need for further record creation.
+    _first_task = (await db.task_table_postgres.get_task(flow_id=_step["flow_id"], run_id=_step["run_number"], step_name=_step["step_name"], task_id="first-task-1", expanded=True)).body
+    _second_task = (await add_task(db, **_step_ids)).body
+
+    # add metadata to filter on; only the named task passes its task_name along with the task_id
+    for _task, _extra, _field, _value in [
+        (_first_task, {"task_name": _first_task["task_name"]}, "field_a", "value_a"),
+        (_first_task, {"task_name": _first_task["task_name"]}, "field_b", "value_b"),
+        (_second_task, {}, "field_a", "not_value_a"),
+        (_second_task, {}, "field_b", "value_b"),
+    ]:
+        await add_metadata(db, flow_id=_task["flow_id"], run_number=_task["run_number"], step_name=_task["step_name"], task_id=_task["task_id"], **_extra, metadata={"field_name": _field, "value": _value})
+
+    _url = "/flows/{flow_id}/runs/{run_number}/steps/{step_name}/filtered_tasks".format(**_first_task)
+    _first, _second = task_pathspec(_first_task), task_pathspec(_second_task)
+
+    # filtering with a shared key should return all relevant tasks
+    await assert_api_get_response(cli, _url + "?metadata_field_name=field_a", data=[_first, _second])
+
+    # filtering with a shared value should return all relevant tasks
+    await assert_api_get_response(cli, _url + "?pattern=value_b", data=[_first, _second])
+
+    # filtering with a regexp should return all relevant tasks
+    await assert_api_get_response(cli, _url + "?pattern=value_.*", data=[_first, _second])
+
+    # filtering with a shared key&value should return all relevant tasks
+    await assert_api_get_response(cli, _url + "?metadata_field_name=field_b&pattern=value_b", data=[_first, _second])
+
+    # filtering with a key&value held by a single task should return only that task
+    await assert_api_get_response(cli, _url + "?metadata_field_name=field_a&pattern=not_value_a", data=[_second])
+
+    # filtering with a mixed key&value should not return results
+    await assert_api_get_response(cli, _url + "?metadata_field_name=field_a&pattern=value_b", data=[])
+
+    # not providing filters should return every task that has metadata
+    await assert_api_get_response(cli, _url, data=[_first, _second])
+
+
 
 async def test_task_get(cli, db):
     # create flow, run and step for test
@@ -206,3 +299,9 @@ async def test_task_get(cli, db):
     await assert_api_get_response(cli, "/flows/{flow_id}/runs/1234/steps/{step_name}/tasks/{task_id}".format(**_task), status=404)
     await assert_api_get_response(cli, "/flows/{flow_id}/runs/{run_number}/steps/nonexistent_step/tasks/{task_id}".format(**_task), status=404)
     await assert_api_get_response(cli, "/flows/{flow_id}/runs/{run_number}/steps/{step_name}/tasks/1234".format(**_task), status=404)
+
+
+# Helpers
+def task_pathspec(task_dict):
+    """Return the pathspec expected for task_dict, preferring run_id and task_name whenever they are set."""
+    return f"{task_dict['flow_id']}/{task_dict.get('run_id') or task_dict['run_number']}/{task_dict['step_name']}/{task_dict.get('task_name') or task_dict['task_id']}"