Skip to content

Commit b1605da

Browse files
committed
fix: improve error handling on task creation failure
This prints the label of the task we failed to create for convenience, as well as defers failing the Decision task until after we've attempted all tasks. This way we'll see all the scope errors at once rather than needing to fix them one at a time.
1 parent ad24419 commit b1605da

File tree

2 files changed

+85
-13
lines changed

2 files changed

+85
-13
lines changed

src/taskgraph/create.py

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,18 @@
2020
testing = False
2121

2222

23+
class CreateTasksException(Exception):
24+
"""Exception raised when one or more tasks could not be created."""
25+
26+
def __init__(self, errors: dict[str, Exception]):
27+
message = ""
28+
for label, exc in errors.items():
29+
message += f"\nERROR: Could not create '{label}':\n\n"
30+
message += "\n".join(f" {line}" for line in str(exc).splitlines()) + "\n"
31+
32+
super().__init__(message)
33+
34+
2335
def create_tasks(graph_config, taskgraph, label_to_taskid, params, decision_task_id):
2436
taskid_to_label = {t: l for l, t in label_to_taskid.items()}
2537

@@ -50,33 +62,48 @@ def create_tasks(graph_config, taskgraph, label_to_taskid, params, decision_task
5062
session = get_session()
5163
with futures.ThreadPoolExecutor(concurrency) as e:
5264
fs = {}
65+
fs_to_task = {}
66+
skipped = set()
67+
errors = {}
5368

5469
# We can't submit a task until its dependencies have been submitted.
5570
# So our strategy is to walk the graph and submit tasks once all
5671
# their dependencies have been submitted.
5772
tasklist = set(taskgraph.graph.visit_postorder())
5873
alltasks = tasklist.copy()
5974

60-
def schedule_tasks():
61-
# bail out early if any futures have failed
62-
if any(f.done() and f.exception() for f in fs.values()):
63-
return
75+
def handle_exception(fut):
76+
if exc := fut.exception():
77+
task_id, label = fs_to_task[fut]
78+
skipped.add(task_id)
79+
errors[label] = exc
6480

81+
def schedule_tasks():
6582
to_remove = set()
6683
new = set()
6784

6885
def submit(task_id, label, task_def):
6986
fut = e.submit(create_task, session, task_id, label, task_def)
7087
new.add(fut)
7188
fs[task_id] = fut
89+
fs_to_task[fut] = (task_id, label)
90+
fut.add_done_callback(handle_exception)
7291

7392
for task_id in tasklist:
7493
task_def = taskgraph.tasks[task_id].task
75-
# If we haven't finished submitting all our dependencies yet,
76-
# come back to this later.
7794
# Some dependencies aren't in our graph, so make sure to filter
7895
# those out
7996
deps = set(task_def.get("dependencies", [])) & alltasks
97+
98+
# If one of the dependencies didn't get created, then
99+
# don't attempt to submit as it would fail.
100+
if any(d in skipped for d in deps):
101+
skipped.add(task_id)
102+
to_remove.add(task_id)
103+
continue
104+
105+
# If we haven't finished submitting all our dependencies yet,
106+
# come back to this later.
80107
if any((d not in fs or not fs[d].done()) for d in deps):
81108
continue
82109

@@ -90,16 +117,18 @@ def submit(task_id, label, task_def):
90117
submit(slugid(), taskid_to_label[task_id], task_def)
91118
tasklist.difference_update(to_remove)
92119

93-
# as each of those futures complete, try to schedule more tasks
120+
# As each of those futures complete, try to schedule more tasks.
94121
for f in futures.as_completed(new):
95122
schedule_tasks()
96123

97-
# start scheduling tasks and run until everything is scheduled
124+
# Start scheduling tasks and run until everything is scheduled.
98125
schedule_tasks()
99126

100-
# check the result of each future, raising an exception if it failed
101-
for f in futures.as_completed(fs.values()):
102-
f.result()
127+
# Wait for all futures to complete.
128+
futures.wait(fs.values())
129+
130+
if errors:
131+
raise CreateTasksException(errors)
103132

104133

105134
def create_task(session, task_id, label, task_def):

test/test_create.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@
88
from unittest import mock
99

1010
import responses
11-
from taskcluster.exceptions import TaskclusterRestFailure
1211

1312
from taskgraph import create
1413
from taskgraph.config import GraphConfig
14+
from taskgraph.create import CreateTasksException
1515
from taskgraph.graph import Graph
1616
from taskgraph.task import Task
1717
from taskgraph.taskgraph import TaskGraph
@@ -151,11 +151,54 @@ def test_create_tasks_fails_if_create_fails(self):
151151
graph = Graph(nodes={"tid-a"}, edges=set())
152152
taskgraph = TaskGraph(tasks, graph)
153153

154-
with self.assertRaises(TaskclusterRestFailure):
154+
with self.assertRaises(CreateTasksException):
155155
create.create_tasks(
156156
GRAPH_CONFIG,
157157
taskgraph,
158158
label_to_taskid,
159159
{"level": "4"},
160160
decision_task_id="decisiontask",
161161
)
162+
163+
@responses.activate
164+
@mock.patch.dict(
165+
"os.environ",
166+
{"TASKCLUSTER_ROOT_URL": "https://tc.example.com"},
167+
clear=True,
168+
)
169+
def test_create_tasks_collects_multiple_errors(self):
170+
"create_tasks collects all errors from multiple failing tasks"
171+
mock_taskcluster_api(
172+
error_status=409,
173+
error_message={
174+
"tid-a": "scope error for task a",
175+
"tid-b": "scope error for task b",
176+
},
177+
error_task_ids={"tid-a", "tid-b"},
178+
)
179+
180+
tasks = {
181+
"tid-a": Task(
182+
kind="test", label="a", attributes={}, task={"payload": "hello world"}
183+
),
184+
"tid-b": Task(
185+
kind="test", label="b", attributes={}, task={"payload": "hello world"}
186+
),
187+
}
188+
label_to_taskid = {"a": "tid-a", "b": "tid-b"}
189+
graph = Graph(nodes={"tid-a", "tid-b"}, edges=set())
190+
taskgraph = TaskGraph(tasks, graph)
191+
192+
with self.assertRaises(CreateTasksException) as cm:
193+
create.create_tasks(
194+
GRAPH_CONFIG,
195+
taskgraph,
196+
label_to_taskid,
197+
{"level": "4"},
198+
decision_task_id="decisiontask",
199+
)
200+
201+
# Verify both errors are in the exception message
202+
exception_message = str(cm.exception)
203+
self.assertIn("Could not create 'a'", exception_message)
204+
self.assertIn("Could not create 'b'", exception_message)

0 commit comments

Comments
 (0)