Skip to content

Commit ec666a3

Browse files
committed
fix(KONFLUX-13012): clean up orphan IRs before creating new one
When the managed task is retried (e.g. due to timeout), the previous InternalRequest and its PipelineRun on the internal cluster continue running. The retry creates a new InternalRequest, leading to concurrent operations that cause duplicate Pulp pushes and CGW "Record already present" errors. Before creating a new InternalRequest, the script now deletes any existing InternalRequests with the same PipelineRun UID label. Combined with the finalizer change in internal-services(PR 709), this ensures the associated PipelineRun is cancelled before the new one starts. Assisted-by: Cursor AI Signed-off-by: Scott Wickersham <swickers@redhat.com>
1 parent fbf0f92 commit ec666a3

2 files changed

Lines changed: 237 additions & 1 deletion

File tree

utils/internal-request

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
# Note: This script is intended to be used with a specific Kubernetes API
4949
# that includes the 'InternalRequest' resource type.
5050

51-
set -e
51+
set -eo pipefail
5252

5353
# Set defaults
5454
SYNC=true
@@ -258,12 +258,64 @@ if [[ -n ${LABELS[@]} ]]; then
258258
--argjson labels "$LABEL_JSON" \
259259
'.metadata.labels += $labels' <<< $PAYLOAD)
260260
fi
261+
# Always stamp the pipeline name onto the IR so the cleanup selector can scope
262+
# deletions to this specific pipeline, preventing accidental removal of IRs
263+
# created by other tasks running in parallel within the same PipelineRun.
264+
PIPELINE_NAME_LABEL="internal-services.appstudio.openshift.io/pipeline-name"
265+
PAYLOAD=$(jq \
266+
--arg key "$PIPELINE_NAME_LABEL" \
267+
--arg value "$PIPELINE" \
268+
'.metadata.labels += {($key): $value}' <<< $PAYLOAD)
261269
if [[ -n "${SERVICEACCOUNT}" ]]; then
262270
PAYLOAD=$(jq \
263271
--arg serviceAccount "$SERVICEACCOUNT" \
264272
'.spec.serviceAccount = $serviceAccount' <<< $PAYLOAD)
265273
fi
266274

275+
# Clean up any existing InternalRequests from prior attempts within the same
276+
# PipelineRun. This handles cases where a managed task retry or timeout left
277+
# an orphaned InternalRequest whose PipelineRun may still be running on the
278+
# internal cluster. The internal-services controller will cancel the associated
279+
# PipelineRun when it detects the InternalRequest deletion.
280+
#
281+
# The selector uses both pipelinerun-uid (supplied by the caller) and the
282+
# pipeline-name that was auto-stamped on the IR above. Combining them scopes
283+
# the cleanup to only IRs from this specific pipeline invocation, so parallel
284+
# tasks in the same PipelineRun that call different pipelines are never
285+
# accidentally deleted.
286+
PIPELINERUN_UID_LABEL="internal-services.appstudio.openshift.io/pipelinerun-uid"
287+
PIPELINERUN_UID_VALUE=""
288+
for label in "${LABELS[@]}"; do
289+
KEY=$(echo "$label" | cut -d'=' -f1)
290+
VALUE=$(echo "$label" | cut -d'=' -f2-)
291+
if [ "$KEY" = "$PIPELINERUN_UID_LABEL" ]; then
292+
PIPELINERUN_UID_VALUE="$VALUE"
293+
break
294+
fi
295+
done
296+
297+
if [ -n "$PIPELINERUN_UID_VALUE" ]; then
298+
LABEL_SELECTOR="${PIPELINERUN_UID_LABEL}=${PIPELINERUN_UID_VALUE},${PIPELINE_NAME_LABEL}=${PIPELINE}"
299+
EXISTING_IRS=$(kubectl get internalrequest \
300+
-l "$LABEL_SELECTOR" \
301+
-o json | jq -r '.items[].metadata.name')
302+
303+
if [ -n "$EXISTING_IRS" ]; then
304+
echo "Found existing InternalRequests from prior attempts. Cleaning up..."
305+
while IFS= read -r IR_NAME; do
306+
echo "Deleting InternalRequest ${IR_NAME}..."
307+
kubectl delete internalrequest "$IR_NAME" --wait=true --timeout=60s
308+
done <<< "$EXISTING_IRS"
309+
# Sleep to give the internal-services controller time to react to the IR
310+
# deletion and cancel the associated PipelineRun before we create a new one.
311+
# This is a best-effort heuristic; --wait=true above ensures the IR is gone
312+
# from the API server, but cancellation propagation to the internal cluster
313+
# is asynchronous and its duration is not observable from here.
314+
echo "Cleanup complete. Waiting for PipelineRun cancellation to propagate..."
315+
sleep 5
316+
fi
317+
fi
318+
267319
# Create InternalRequest using kubectl
268320
RESOURCE=$(echo "$PAYLOAD" | kubectl create -f - -o json)
269321
INTERNAL_REQUEST_NAME=$(echo "$RESOURCE" | jq -r '.metadata.name')
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
"""Tests for the internal-request utility script."""
2+
3+
from __future__ import annotations
4+
5+
import os
6+
import stat
7+
import subprocess
8+
from pathlib import Path
9+
10+
SCRIPT_PATH = Path(__file__).resolve().parents[1] / "internal-request"
11+
12+
13+
def _write_executable(path: Path, content: str) -> None:
14+
path.write_text(content, encoding="utf-8")
15+
path.chmod(path.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
16+
17+
18+
def _run_internal_request(
19+
tmp_path: Path,
20+
labels: list[str] | None = None,
21+
existing_irs: list[str] | None = None,
22+
delete_fails: bool = False,
23+
):
24+
bin_dir = tmp_path / "bin"
25+
bin_dir.mkdir()
26+
kubectl_log = tmp_path / "kubectl.log"
27+
sleep_log = tmp_path / "sleep.log"
28+
29+
# Build a JSON array of IR objects for the mock get response.
30+
ir_names = existing_irs or []
31+
items_json = ", ".join(f'{{"metadata": {{"name": "{name}"}}}}' for name in ir_names)
32+
mock_get_json = f'{{"items": [{items_json}]}}'
33+
34+
delete_exit = "1" if delete_fails else "0"
35+
36+
_write_executable(
37+
bin_dir / "kubectl",
38+
f"""#!/usr/bin/env bash
39+
set -euo pipefail
40+
echo "$*" >> "$KUBECTL_LOG"
41+
if [[ "${{1:-}}" == "get" && "${{2:-}}" == "internalrequest" ]]; then
42+
echo '{mock_get_json}'
43+
exit 0
44+
fi
45+
if [[ "${{1:-}}" == "delete" && "${{2:-}}" == "internalrequest" ]]; then
46+
exit {delete_exit}
47+
fi
48+
if [[ "${{1:-}}" == "create" ]]; then
49+
cat >/dev/null
50+
echo '{{"metadata":{{"name":"new-ir"}}}}'
51+
exit 0
52+
fi
53+
exit 1
54+
""",
55+
)
56+
57+
_write_executable(
58+
bin_dir / "sleep",
59+
"""#!/usr/bin/env bash
60+
set -euo pipefail
61+
echo "$*" >> "$SLEEP_LOG"
62+
""",
63+
)
64+
65+
cmd = [
66+
"bash",
67+
str(SCRIPT_PATH),
68+
"--pipeline",
69+
"test-pipeline",
70+
"-p",
71+
"taskGitUrl=https://github.com/konflux-ci/release-service-catalog",
72+
"-p",
73+
"taskGitRevision=main",
74+
"-s",
75+
"false",
76+
]
77+
for label in labels or []:
78+
cmd.extend(["-l", label])
79+
80+
env = os.environ.copy()
81+
env["PATH"] = f"{bin_dir}:{env['PATH']}"
82+
env["KUBECTL_LOG"] = str(kubectl_log)
83+
env["SLEEP_LOG"] = str(sleep_log)
84+
85+
result = subprocess.run(cmd, capture_output=True, text=True, check=False, env=env)
86+
87+
kubectl_calls = kubectl_log.read_text(encoding="utf-8").splitlines()
88+
sleep_calls = (
89+
sleep_log.read_text(encoding="utf-8").splitlines() if sleep_log.exists() else []
90+
)
91+
return result, kubectl_calls, sleep_calls
92+
93+
94+
PIPELINERUN_UID_LABEL = "internal-services.appstudio.openshift.io/pipelinerun-uid"
95+
PIPELINE_NAME_LABEL = "internal-services.appstudio.openshift.io/pipeline-name"
96+
# The test helper always passes --pipeline test-pipeline
97+
TEST_PIPELINE = "test-pipeline"
98+
99+
100+
def test_internal_request_cleans_up_existing_requests(tmp_path):
101+
"""Delete existing IRs and sleep before creating a new one."""
102+
result, kubectl_calls, sleep_calls = _run_internal_request(
103+
tmp_path=tmp_path,
104+
labels=[f"{PIPELINERUN_UID_LABEL}=uid-123"],
105+
existing_irs=["old-ir-1", "old-ir-2"],
106+
)
107+
108+
assert result.returncode == 0, result.stderr
109+
selector = (
110+
f"get internalrequest -l "
111+
f"{PIPELINERUN_UID_LABEL}=uid-123,{PIPELINE_NAME_LABEL}={TEST_PIPELINE}"
112+
)
113+
assert any(call.startswith(selector) for call in kubectl_calls)
114+
assert any(c.startswith("delete internalrequest old-ir-1") for c in kubectl_calls)
115+
assert any(c.startswith("delete internalrequest old-ir-2") for c in kubectl_calls)
116+
assert "5" in sleep_calls
117+
assert any(call.startswith("create -f - -o json") for call in kubectl_calls)
118+
119+
120+
def test_internal_request_skips_cleanup_when_no_existing_requests(tmp_path):
121+
"""Skip delete and sleep when no existing IRs match the selector."""
122+
result, kubectl_calls, sleep_calls = _run_internal_request(
123+
tmp_path=tmp_path,
124+
labels=[f"{PIPELINERUN_UID_LABEL}=uid-123"],
125+
existing_irs=[],
126+
)
127+
128+
assert result.returncode == 0, result.stderr
129+
assert any(call.startswith("get internalrequest -l") for call in kubectl_calls)
130+
assert not any(call.startswith("delete internalrequest") for call in kubectl_calls)
131+
assert sleep_calls == []
132+
assert any(call.startswith("create -f - -o json") for call in kubectl_calls)
133+
134+
135+
def test_internal_request_skips_cleanup_without_pipelinerun_uid_label(tmp_path):
136+
"""Skip cleanup when the pipelinerun-uid label is absent from the IR labels."""
137+
result, kubectl_calls, sleep_calls = _run_internal_request(
138+
tmp_path=tmp_path,
139+
labels=["some-other-label=foo"],
140+
existing_irs=["old-ir-1"],
141+
)
142+
143+
assert result.returncode == 0, result.stderr
144+
assert not any(call.startswith("get internalrequest -l") for call in kubectl_calls)
145+
assert not any(call.startswith("delete internalrequest") for call in kubectl_calls)
146+
assert sleep_calls == []
147+
assert any(call.startswith("create -f - -o json") for call in kubectl_calls)
148+
149+
150+
def test_internal_request_does_not_delete_parallel_task_irs(tmp_path):
151+
"""Include pipeline-name in the selector to avoid matching IRs from other parallel tasks.
152+
153+
IRs that call a different --pipeline must never be deleted.
154+
"""
155+
result, kubectl_calls, sleep_calls = _run_internal_request(
156+
tmp_path=tmp_path,
157+
labels=[f"{PIPELINERUN_UID_LABEL}=uid-123"],
158+
existing_irs=["old-ir-1"],
159+
)
160+
161+
assert result.returncode == 0, result.stderr
162+
selector = (
163+
f"get internalrequest -l "
164+
f"{PIPELINERUN_UID_LABEL}=uid-123,{PIPELINE_NAME_LABEL}={TEST_PIPELINE}"
165+
)
166+
assert any(call.startswith(selector) for call in kubectl_calls)
167+
assert not any(
168+
call == f"get internalrequest -l {PIPELINERUN_UID_LABEL}=uid-123 -o json"
169+
for call in kubectl_calls
170+
), "Selector must not use pipelinerun-uid alone"
171+
172+
173+
def test_internal_request_fails_when_delete_fails(tmp_path):
174+
"""Exit non-zero and skip IR creation when deletion of an existing IR fails."""
175+
result, kubectl_calls, sleep_calls = _run_internal_request(
176+
tmp_path=tmp_path,
177+
labels=[f"{PIPELINERUN_UID_LABEL}=uid-123"],
178+
existing_irs=["old-ir-1"],
179+
delete_fails=True,
180+
)
181+
182+
assert result.returncode != 0, "Expected non-zero exit when delete fails"
183+
assert any(c.startswith("delete internalrequest old-ir-1") for c in kubectl_calls)
184+
assert not any(call.startswith("create -f - -o json") for call in kubectl_calls)

0 commit comments

Comments
 (0)