|
53 | 53 | from tests.cluster.controller.conftest import ( |
54 | 54 | FakeProvider, |
55 | 55 | hydrate_worker_attributes as _with_attrs, |
| 56 | + make_job_request, |
56 | 57 | query_job as _query_job, |
57 | 58 | query_job_row as _query_job_row, |
58 | 59 | query_task as _query_task, |
59 | 60 | query_task_with_attempts as _query_task_with_attempts, |
60 | 61 | query_tasks_for_job as _query_tasks_for_job, |
61 | 62 | query_worker as _query_worker, |
62 | 63 | schedulable_tasks as _schedulable_tasks, |
| 64 | + submit_job as _submit_job_tasks, |
63 | 65 | worker_running_tasks as _worker_running_tasks, |
64 | 66 | ) |
65 | 67 |
|
@@ -1436,6 +1438,49 @@ def test_holder_task_worker_death_no_failure_record(state): |
1436 | 1438 | assert task_row_can_be_scheduled(holder_task), "holder task must be schedulable again" |
1437 | 1439 |
|
1438 | 1440 |
|
| 1441 | +def test_get_running_tasks_for_poll_excludes_reservation_holders(state): |
| 1442 | + """get_running_tasks_for_poll must filter reservation-holder tasks. |
| 1443 | +
|
| 1444 | + Regression: the ping/poll loop feeds its output directly into |
| 1445 | + PollTasksRequest.expected_tasks. Holders are virtual — they never reach |
| 1446 | + the worker's _tasks dict — so including them makes the worker reconcile, |
| 1447 | + miss, and return WORKER_FAILED("Task not found on worker") every cycle. |
| 1448 | + That drains the holder's preemption budget and (with the ASSIGNED→ |
| 1449 | + WORKER_FAILED health hook) reaps the claimed worker every few minutes. |
| 1450 | +
|
| 1451 | + This produced an observed ~51 attempts/hour per holder in production. |
| 1452 | + """ |
| 1453 | + request = _make_job_request_with_reservation( |
| 1454 | + reservation_entries=[_make_reservation_entry(_cpu_device())], |
| 1455 | + ) |
| 1456 | + parent_job_id = _submit_job(state, "res-job", request) |
| 1457 | + holder_job_id = parent_job_id.child(RESERVATION_HOLDER_JOB_NAME) |
| 1458 | + |
| 1459 | + holder_tasks = _query_tasks_for_job(state, holder_job_id) |
| 1460 | + assert len(holder_tasks) == 1 |
| 1461 | + holder_task = holder_tasks[0] |
| 1462 | + |
| 1463 | + real_request = make_job_request("real-job") |
| 1464 | + (real_task,) = _submit_job_tasks(state, "real-job", real_request) |
| 1465 | + |
| 1466 | + worker_id = _register_worker(state, "w1") |
| 1467 | + state.queue_assignments( |
| 1468 | + [ |
| 1469 | + Assignment(task_id=holder_task.task_id, worker_id=worker_id), |
| 1470 | + Assignment(task_id=real_task.task_id, worker_id=worker_id), |
| 1471 | + ] |
| 1472 | + ) |
| 1473 | + |
| 1474 | + running, _addresses = state.get_running_tasks_for_poll() |
| 1475 | + |
| 1476 | + task_ids = {entry.task_id for entry in running.get(worker_id, [])} |
| 1477 | + assert real_task.task_id in task_ids, "real task must still appear for polling" |
| 1478 | + assert holder_task.task_id not in task_ids, ( |
| 1479 | + "reservation holder must be excluded — worker has no in-memory state " |
| 1480 | + "for virtual holders, so polling them produces bogus WORKER_FAILEDs" |
| 1481 | + ) |
| 1482 | + |
| 1483 | + |
1439 | 1484 | def test_holder_task_removed_from_worker_when_parent_succeeds(state): |
1440 | 1485 | """Holder task is cleaned from worker.running_tasks when the parent job succeeds. |
1441 | 1486 |
|
|
0 commit comments