Skip to content

Commit b2ee04c

Browse files
Avoid race condition when reporting job statuses after retriggering (#3121)
Avoid race condition when reporting job statuses after retriggering Retriggering starts a new job run that shares the same status check with the one that is about to be canceled. Cancel request is sent to Copr/Koji/TF, status check is updated by the new job run, but then a build end event or TF webhook arrives and the status check is updated with a wrong status (canceled in this case, but this could happen on e.g. a build failure as well). Prevent this by checking if there is a newer run of the same job before reporting status in the handlers. Reviewed-by: gemini-code-assist[bot] Reviewed-by: Maja Massarini
2 parents 8c3cf40 + 784c97c commit b2ee04c

8 files changed

Lines changed: 322 additions & 21 deletions

File tree

packit_service/models.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2374,6 +2374,34 @@ def get_by_id(cls, id_: int) -> Optional["CoprBuildTargetModel"]:
23742374
with sa_session_transaction() as session:
23752375
return session.query(CoprBuildTargetModel).filter_by(id=id_).first()
23762376

2377+
@classmethod
2378+
def has_newer_run(cls, run: "CoprBuildTargetModel") -> bool:
2379+
"""Check if a newer build exists for the same target+identifier
2380+
on the same project object (e.g. same PR).
2381+
2382+
Used to avoid overwriting a newer build's check status when
2383+
processing results for an older (e.g. canceled) build.
2384+
"""
2385+
project_event = run.get_project_event_model()
2386+
if not project_event or not run.submitted_time:
2387+
return False
2388+
with sa_session_transaction() as session:
2389+
return session.query(
2390+
session.query(CoprBuildTargetModel)
2391+
.join(CoprBuildGroupModel)
2392+
.join(PipelineModel)
2393+
.join(ProjectEventModel)
2394+
.filter(
2395+
ProjectEventModel.type == project_event.type,
2396+
ProjectEventModel.event_id == project_event.event_id,
2397+
CoprBuildTargetModel.identifier == run.identifier,
2398+
CoprBuildTargetModel.target == run.target,
2399+
CoprBuildTargetModel.submitted_time > run.submitted_time,
2400+
CoprBuildTargetModel.id != run.id,
2401+
)
2402+
.exists()
2403+
).scalar()
2404+
23772405
@classmethod
23782406
def get_all(cls) -> Iterable["CoprBuildTargetModel"]:
23792407
with sa_session_transaction() as session:
@@ -3014,6 +3042,33 @@ def get_by_id(cls, id_: int) -> Optional["KojiBuildTargetModel"]:
30143042
with sa_session_transaction() as session:
30153043
return session.query(KojiBuildTargetModel).filter_by(id=id_).first()
30163044

3045+
@classmethod
3046+
def has_newer_run(cls, run: "KojiBuildTargetModel") -> bool:
3047+
"""Check if a newer build exists for the same target
3048+
on the same project object (e.g. same PR).
3049+
3050+
Used to avoid overwriting a newer build's check status when
3051+
processing results for an older (e.g. canceled) build.
3052+
"""
3053+
project_event = run.get_project_event_model()
3054+
if not project_event or not run.submitted_time:
3055+
return False
3056+
with sa_session_transaction() as session:
3057+
return session.query(
3058+
session.query(KojiBuildTargetModel)
3059+
.join(KojiBuildGroupModel)
3060+
.join(PipelineModel)
3061+
.join(ProjectEventModel)
3062+
.filter(
3063+
ProjectEventModel.type == project_event.type,
3064+
ProjectEventModel.event_id == project_event.event_id,
3065+
KojiBuildTargetModel.target == run.target,
3066+
KojiBuildTargetModel.submitted_time > run.submitted_time,
3067+
KojiBuildTargetModel.id != run.id,
3068+
)
3069+
.exists()
3070+
).scalar()
3071+
30173072
@classmethod
30183073
def get_all(cls) -> Iterable["KojiBuildTargetModel"]:
30193074
with sa_session_transaction() as session:
@@ -3855,6 +3910,34 @@ def get_by_pipeline_id(cls, pipeline_id: str) -> Optional["TFTTestRunTargetModel
38553910
with sa_session_transaction() as session:
38563911
return session.query(TFTTestRunTargetModel).filter_by(pipeline_id=pipeline_id).first()
38573912

3913+
@classmethod
3914+
def has_newer_run(cls, run: "TFTTestRunTargetModel") -> bool:
3915+
"""Check if a newer test run exists for the same target+identifier
3916+
on the same project object (e.g. same PR).
3917+
3918+
Used to avoid overwriting a newer run's check status when
3919+
processing results for an older (e.g. canceled) run.
3920+
"""
3921+
project_event = run.get_project_event_model()
3922+
if not project_event or not run.submitted_time:
3923+
return False
3924+
with sa_session_transaction() as session:
3925+
return session.query(
3926+
session.query(TFTTestRunTargetModel)
3927+
.join(TFTTestRunGroupModel)
3928+
.join(PipelineModel)
3929+
.join(ProjectEventModel)
3930+
.filter(
3931+
ProjectEventModel.type == project_event.type,
3932+
ProjectEventModel.event_id == project_event.event_id,
3933+
TFTTestRunTargetModel.identifier == run.identifier,
3934+
TFTTestRunTargetModel.target == run.target,
3935+
TFTTestRunTargetModel.submitted_time > run.submitted_time,
3936+
TFTTestRunTargetModel.id != run.id,
3937+
)
3938+
.exists()
3939+
).scalar()
3940+
38583941
@classmethod
38593942
def get_all_by_status(
38603943
cls,

packit_service/worker/handlers/copr.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,7 @@ def _run(self) -> TaskResults:
316316
if self.build.status in [
317317
BuildStatus.success,
318318
BuildStatus.failure,
319+
BuildStatus.canceled,
319320
]:
320321
msg = (
321322
f"Copr build {self.copr_event.build_id} is already"
@@ -346,18 +347,25 @@ def _run(self) -> TaskResults:
346347
packit_dashboard_url = get_copr_build_info_url(self.build.id)
347348
# if SRPM build failed it has been reported already so skip reporting
348349
if self.build.get_srpm_build().status != BuildStatus.failure:
349-
self.copr_build_helper.report_status_to_all_for_chroot(
350-
state=BaseCommitStatus.failure,
351-
description=failed_msg,
352-
url=packit_dashboard_url,
353-
chroot=self.copr_event.chroot,
354-
)
355-
self.measure_time_after_reporting()
356-
self.copr_build_helper.notify_about_failure_if_configured(
357-
packit_dashboard_url=packit_dashboard_url,
358-
external_dashboard_url=self.build.web_url,
359-
logs_url=self.build.build_logs_url,
360-
)
350+
if CoprBuildTargetModel.has_newer_run(self.build):
351+
logger.info(
352+
f"Skipping status report for Copr build "
353+
f"{self.copr_event.build_id} ({self.build.target}): "
354+
f"a newer build exists for the same target."
355+
)
356+
else:
357+
self.copr_build_helper.report_status_to_all_for_chroot(
358+
state=BaseCommitStatus.failure,
359+
description=failed_msg,
360+
url=packit_dashboard_url,
361+
chroot=self.copr_event.chroot,
362+
)
363+
self.measure_time_after_reporting()
364+
self.copr_build_helper.notify_about_failure_if_configured(
365+
packit_dashboard_url=packit_dashboard_url,
366+
external_dashboard_url=self.build.web_url,
367+
logs_url=self.build.build_logs_url,
368+
)
361369
self.build.set_status(BuildStatus.failure)
362370
report_long_runtime("Copr build failed end", 120, run_start_time)
363371
return TaskResults(success=False, details={"msg": failed_msg})

packit_service/worker/handlers/koji.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -262,26 +262,34 @@ def _run(self) -> TaskResults:
262262
else:
263263
self.push_metrics()
264264
self.build.set_status(new_commit_status.value)
265-
self.report(description, new_commit_status, dashboard_url, koji_web_url)
266265
koji_build_logs = self.koji_task_event.get_koji_build_rpm_tasks_logs_urls(
267266
self.service_config.koji_logs_url,
268267
)
269-
270268
self.build.set_build_logs_urls(koji_build_logs)
271269
self.build.set_web_url(koji_web_url)
272270

271+
newer_run_exists = KojiBuildTargetModel.has_newer_run(self.build)
272+
if newer_run_exists:
273+
logger.info(
274+
f"Skipping status report for Koji build "
275+
f"{self.koji_task_event.task_id} ({self.build.target}): "
276+
f"a newer build exists for the same target."
277+
)
278+
else:
279+
self.report(description, new_commit_status, dashboard_url, koji_web_url)
280+
273281
if self.koji_task_event.state == KojiTaskState.failed:
274282
logger.info(
275283
f"Failed Koji scratch build (parent taskID = {self.koji_task_event.task_id})"
276284
)
277285
self.trigger_log_detective_if_configured()
278-
# Convert dict of logs URLs to a string representation
279-
logs_url_str = ", ".join(koji_build_logs.values()) if koji_build_logs else ""
280-
self.notify_about_failure_if_configured(
281-
packit_dashboard_url=dashboard_url,
282-
external_dashboard_url=koji_web_url,
283-
logs_url=logs_url_str,
284-
)
286+
if not newer_run_exists:
287+
logs_url_str = ", ".join(koji_build_logs.values()) if koji_build_logs else ""
288+
self.notify_about_failure_if_configured(
289+
packit_dashboard_url=dashboard_url,
290+
external_dashboard_url=koji_web_url,
291+
logs_url=logs_url_str,
292+
)
285293

286294
msg = (
287295
f"Build on {self.build.target} in koji changed state "

packit_service/worker/handlers/testing_farm.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,15 @@ def _run(self) -> TaskResults:
712712
)
713713
self.pushgateway.test_run_finished_time.observe(test_run_time)
714714

715+
if TFTTestRunTargetModel.has_newer_run(test_run_model):
716+
logger.info(
717+
f"Skipping status report for TF run {self.pipeline_id} "
718+
f"({test_run_model.target}/{test_run_model.identifier}): "
719+
f"a newer run exists for the same target."
720+
)
721+
test_run_model.set_status(self.result, created=self.created)
722+
return TaskResults(success=True, details={})
723+
715724
test_run_model.set_web_url(self.log_url)
716725
url = get_testing_farm_info_url(test_run_model.id) if test_run_model else None
717726
self.testing_farm_job_helper.report_status_to_tests_for_test_target(

tests/conftest.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ def __init__(self, **kwargs):
239239
},
240240
],
241241
task_accepted_time=datetime.now(),
242+
submitted_time=datetime.now(),
242243
build_start_time=None,
243244
build_logs_url="https://log-url",
244245
)
@@ -316,6 +317,7 @@ def koji_build_pr():
316317
web_url="https://some-url",
317318
target="some-target",
318319
status="some-status",
320+
submitted_time=datetime.now(),
319321
runs=runs,
320322
)
321323
koji_build_model._srpm_build_for_mocking = srpm_build

tests/integration/test_listen_to_fedmsg.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,6 +1207,7 @@ def test_copr_build_end_testing_farm_manual_trigger(
12071207
copr_build_pr,
12081208
)
12091209
flexmock(CoprBuildTargetModel).should_receive("get_by_id").and_return(copr_build_pr)
1210+
flexmock(CoprBuildTargetModel).should_receive("has_newer_run").and_return(False)
12101211
copr_build_pr.should_call("set_status").with_args(build_status).once()
12111212
copr_build_pr.should_receive("set_end_time").once()
12121213
copr_build_pr.should_receive("get_package_name").and_return(None)
@@ -2504,6 +2505,7 @@ def test_koji_build_start(koji_build_scratch_start, pc_koji_build_pr, koji_build
25042505
flexmock(KojiBuildTargetModel).should_receive("get_by_task_id").and_return(
25052506
koji_build_pr,
25062507
)
2508+
flexmock(KojiBuildTargetModel).should_receive("has_newer_run").and_return(False)
25072509
url = get_koji_build_info_url(1)
25082510
flexmock(requests).should_receive("get").and_return(requests.Response())
25092511
flexmock(requests.Response).should_receive("raise_for_status").and_return(None)
@@ -2566,6 +2568,7 @@ def test_koji_build_end(koji_build_scratch_end, pc_koji_build_pr, koji_build_pr)
25662568
flexmock(KojiBuildTargetModel).should_receive("get_by_task_id").and_return(
25672569
koji_build_pr,
25682570
)
2571+
flexmock(KojiBuildTargetModel).should_receive("has_newer_run").and_return(False)
25692572
url = get_koji_build_info_url(1)
25702573
flexmock(requests).should_receive("get").and_return(requests.Response())
25712574
flexmock(requests.Response).should_receive("raise_for_status").and_return(None)

tests/unit/test_copr_build.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1109,3 +1109,68 @@ def test_get_running_jobs_rebuild_failed_passes_targets(github_pr_event):
11091109
targets={"fedora-rawhide-x86_64", "fedora-40-x86_64"},
11101110
).and_return(sentinel).once()
11111111
assert list(helper.get_running_jobs()) == sentinel
1112+
1113+
1114+
def test_copr_build_end_skips_already_canceled():
1115+
from packit_service.worker.handlers.copr import CoprBuildEndHandler
1116+
1117+
handler = flexmock(
1118+
build=flexmock(status=BuildStatus.canceled),
1119+
copr_event=flexmock(
1120+
build_id="12345",
1121+
chroot="fedora-rawhide-x86_64",
1122+
build=flexmock(status=BuildStatus.canceled),
1123+
),
1124+
)
1125+
result = CoprBuildEndHandler._run(handler)
1126+
assert result["success"]
1127+
1128+
1129+
def test_copr_build_end_skips_reporting_for_superseded_build():
1130+
from packit_service.worker.handlers.copr import CoprBuildEndHandler
1131+
1132+
srpm_build = flexmock(status=BuildStatus.success, url="https://some.url/my.srpm")
1133+
build = flexmock(
1134+
id=1,
1135+
status=BuildStatus.pending,
1136+
target="fedora-rawhide-x86_64",
1137+
task_accepted_time=None,
1138+
web_url="https://copr.url/build/1",
1139+
build_logs_url="https://copr.url/logs/1",
1140+
build_id="12345",
1141+
)
1142+
build.should_receive("get_srpm_build").and_return(srpm_build)
1143+
build.should_receive("set_status").with_args(BuildStatus.failure).once()
1144+
1145+
copr_event = flexmock(
1146+
build_id="12345",
1147+
chroot="fedora-rawhide-x86_64",
1148+
status=0,
1149+
timestamp=None,
1150+
)
1151+
1152+
copr_build_helper = flexmock()
1153+
copr_build_helper.should_receive("report_status_to_all_for_chroot").never()
1154+
copr_build_helper.should_receive("notify_about_failure_if_configured").never()
1155+
1156+
pushgateway = flexmock(
1157+
copr_builds_finished=flexmock(inc=lambda: None),
1158+
)
1159+
pushgateway.should_receive("push").and_return()
1160+
1161+
handler = flexmock(
1162+
build=build,
1163+
copr_event=copr_event,
1164+
copr_build_helper=copr_build_helper,
1165+
pushgateway=pushgateway,
1166+
)
1167+
handler.should_receive("set_end_time").once()
1168+
handler.should_receive("set_srpm_url").once()
1169+
1170+
flexmock(CoprBuildTargetModel).should_receive("has_newer_run").with_args(build).and_return(
1171+
True,
1172+
).once()
1173+
1174+
result = CoprBuildEndHandler._run(handler)
1175+
assert not result["success"]
1176+
assert result["details"]["msg"] == "RPMs failed to be built."

0 commit comments

Comments
 (0)