Skip to content

Commit ebb5317

Browse files
committed
fix: stabilize periodic e2e test execution under parallel load
Stabilize periodic e2e: 6Gi memory, cap 9 parallel suites, retry component init Signed-off-by: Elena German <elgerman@redhat.com> Assisted-by: Claude
1 parent 669cb85 commit ebb5317

3 files changed

Lines changed: 100 additions & 29 deletions

File tree

integration-tests/README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,13 @@ These integration tests are automatically executed in CI/CD pipelines:
173173
- **Pull Request Triggers** - Tests run when changes are made to relevant pipeline files or the `integration-tests/` directory
174174
- **E2E Pipeline** - Uses `integration-tests/pipelines/e2e-tests-staging-pipeline.yaml`
175175
- **Konflux E2E Pipeline** - Uses `integration-tests/pipelines/konflux-e2e-tests-pipeline.yaml`
176+
- **Periodic E2E Pipeline** - Uses `integration-tests/pipelines/e2e-tests-periodic-pipeline.yaml`
177+
- Runs all integration suites in one Tekton step
178+
- **Memory**: 6Gi on the `run-test` step (raised from 2Gi to avoid OOM during parallel setup)
179+
- **Concurrency**: at most **9** suites at a time by default (`MAX_PARALLEL` pipeline param, overridable per PipelineRun)
180+
- Component initialization polling retries when `kubectl get` fails transiently under load (see `lib/test-functions.sh`)
181+
182+
Local `./run-test.sh` runs one suite at a time; only the periodic pipeline runs many suites in parallel.
176183

177184
## Troubleshooting
178185

@@ -182,7 +189,8 @@ These integration tests are automatically executed in CI/CD pipelines:
182189
2. **Cluster Access** - Ensure KUBECONFIG is properly configured
183190
3. **Secret Errors** - Check vault password file exists and is correct
184191
4. **Resource Conflicts** - Use cleanup scripts to remove stale resources
185-
5. **PaC token unrecognizable error** - The following error:
192+
5. **OOM or `kubectl create` exit 137 in periodic e2e** - Usually too many suites starting at once or insufficient step memory. The periodic pipeline caps parallelism and sets 6Gi; if failures persist, check Tekton step logs for `Killed` during tenant resource setup.
193+
6. **PaC token unrecognizable error** - The following error:
186194
```bash
187195
Initialization check attempt 6/60...
188196
⚠️ Warning: Could not get component PR from annotations: {"pac":{"state":"error","error-id":74,"error-message":"74: Access token is unrecognizable by GitHub"},"message":"done"}

integration-tests/lib/test-functions.sh

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,25 @@ create_kubernetes_resources() {
334334
echo "Kubernetes resources applied."
335335
}
336336

337+
#######################################
# Print the build.appstudio.openshift.io/status annotation of a Component.
# Globals:   tenant_namespace (read) - namespace the Component lives in
# Arguments: $1 - component name
# Outputs:   the annotation value on stdout; an empty line when the
#            annotation is not set on the Component
# Returns:   0 on success; 1 if `kubectl get` fails or prints nothing;
#            otherwise the exit status of jq
#######################################
fetch_component_build_status_annotation() {
  local comp_name="$1"
  local component_json=""

  # Test the assignment directly instead of wrapping it in `set +e`/`set -e`:
  # a command used as an `if` condition never trips errexit, and the caller's
  # shell options are left exactly as they were (the old toggle force-enabled
  # errexit on return). kubectl's stderr is discarded; callers print their own
  # retry message when this function fails.
  if ! component_json=$(kubectl get component/"${comp_name}" \
    -n "${tenant_namespace}" -ojson 2>/dev/null); then
    return 1
  fi

  # A successful kubectl call with no output is treated as a failure as well.
  if [ -z "${component_json}" ]; then
    return 1
  fi

  jq -r --arg k "build.appstudio.openshift.io/status" \
    '.metadata.annotations[$k] // ""' <<< "${component_json}"
}
355+
337356
# Function to wait for component initialization and get PR details
338357
# Modifies global variables: component_pr, pr_number
339358
# Relies on global variables: component_name, tenant_namespace
@@ -348,9 +367,15 @@ wait_for_component_initialization() {
348367
while [ $attempt -le $max_attempts ]; do
349368
echo "Initialization check attempt ${attempt}/${max_attempts}..."
350369

351-
# Try to get component annotations
352-
component_annotations=$(kubectl get component/"${component_name}" -n "${tenant_namespace}" -ojson 2>/dev/null | \
353-
jq -r --arg k "build.appstudio.openshift.io/status" '.metadata.annotations[$k] // ""')
370+
if ! component_annotations=$(fetch_component_build_status_annotation "${component_name}"); then
371+
log_warning "Could not reach component ${component_name} (kubectl get failed); retrying..."
372+
if [ $attempt -lt $max_attempts ]; then
373+
echo "Waiting 10 seconds before retry..."
374+
sleep 10
375+
fi
376+
attempt=$((attempt + 1))
377+
continue
378+
fi
354379

355380
if [ -n "${component_annotations}" ]; then
356381
# component_pr is made global by not declaring it local
@@ -742,9 +767,15 @@ wait_for_single_component_initialization() {
742767
while [ $attempt -le $max_attempts ]; do
743768
echo "Initialization check attempt ${attempt}/${max_attempts} for ${comp_name}..."
744769

745-
# Try to get component annotations
746-
component_annotations=$(kubectl get component/"${comp_name}" -n "${tenant_namespace}" -ojson 2>/dev/null | \
747-
jq -r --arg k "build.appstudio.openshift.io/status" '.metadata.annotations[$k] // ""')
770+
if ! component_annotations=$(fetch_component_build_status_annotation "${comp_name}"); then
771+
echo "⚠️ Could not reach component ${comp_name} (kubectl get failed); retrying..."
772+
if [ $attempt -lt $max_attempts ]; then
773+
echo "Waiting 10 seconds before retry..."
774+
sleep 10
775+
fi
776+
attempt=$((attempt + 1))
777+
continue
778+
fi
748779

749780
if [ -n "${component_annotations}" ]; then
750781
# component_pr is made global by not declaring it local

integration-tests/pipelines/e2e-tests-periodic-pipeline.yaml

Lines changed: 54 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ spec:
1717
- name: KUBECONFIG_SECRET_NAME
1818
default: 'kubeconfig-secret'
1919
type: string
20+
- name: MAX_PARALLEL
21+
default: '9'
22+
type: string
23+
description: Maximum integration test suites to run concurrently in the periodic run-test step
2024
tasks:
2125
- name: get-snapshot-data
2226
params:
@@ -49,6 +53,8 @@ spec:
4953
value: $(params.GITHUB_TOKEN_SECRET_NAME)
5054
- name: KUBECONFIG_SECRET_NAME
5155
value: $(params.KUBECONFIG_SECRET_NAME)
56+
- name: MAX_PARALLEL
57+
value: $(params.MAX_PARALLEL)
5258
taskSpec:
5359
params:
5460
- name: STEP_IMAGE
@@ -61,6 +67,8 @@ spec:
6167
- name: VAULT_PASSWORD_SECRET_NAME
6268
- name: GITHUB_TOKEN_SECRET_NAME
6369
- name: KUBECONFIG_SECRET_NAME
70+
- name: MAX_PARALLEL
71+
type: string
6472
results:
6573
- name: TEST_OUTPUT
6674
description: Test output
@@ -69,9 +77,9 @@ spec:
6977
image: $(params.STEP_IMAGE)
7078
computeResources:
7179
limits:
72-
memory: 2Gi
80+
memory: 6Gi
7381
requests:
74-
memory: 2Gi
82+
memory: 6Gi
7583
env:
7684
- name: VAULT_PASSWORD
7785
valueFrom:
@@ -171,29 +179,53 @@ spec:
171179
export RELEASE_CATALOG_GIT_URL
172180
export RELEASE_CATALOG_GIT_REVISION
173181
174-
# Run all testcases in parallel and collect their statuses
182+
# Cap parallel suites to limit kubectl/API and memory spikes (step memory: 6Gi).
MAX_PARALLEL="$(params.MAX_PARALLEL)"
# The launch loop only starts a suite while fewer than MAX_PARALLEL are running.
# A zero, empty, or non-numeric value would therefore start nothing and leave
# the scheduler spinning forever, so reject it and use the pipeline default.
if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]{1,6}$ ]] || [ "${MAX_PARALLEL}" -eq 0 ]; then
  echo "Invalid MAX_PARALLEL '${MAX_PARALLEL}'; falling back to 9" >&2
  MAX_PARALLEL=9
fi
175184
declare -A pids
176-
# declare -A testcase_names
177-
for testcase in "${ALL_TESTCASES[@]}"; do
178-
echo "Running test case: $testcase"
179-
"/home/e2e/tests/run-test.sh" "$testcase" &
180-
pid=$!
181-
pids["$pid"]=$testcase
182-
# testcase_names["$testcase"]=$pid
183-
done
185+
testcase_index=0
184186
185-
# Wait for all tests to complete, track success and failure
186-
for pid in "${!pids[@]}"; do
187-
testcase="${pids[$pid]}"
188-
if wait "$pid"; then
189-
success_test_cases+=("$testcase")
190-
success_test_count=$((success_test_count + 1))
191-
echo "Test case $testcase succeeded"
192-
else
193-
failed_test_cases+=("$testcase")
194-
failed_test_count=$((failed_test_count + 1))
195-
echo "Test case $testcase failed"
187+
#######################################
# Record the outcome of every background test suite that has already exited.
# Globals:   pids (read/modified) - associative array mapping PID -> test
#              case name; entries for finished jobs are removed
#            success_test_cases, success_test_count (appended/incremented
#              for jobs that exited with status 0)
#            failed_test_cases, failed_test_count (appended/incremented
#              for jobs that exited with a non-zero status)
# Arguments: none
# Outputs:   one "Test case <name> succeeded|failed" line per reaped job
#######################################
reap_finished_test_jobs() {
  local pid testcase wait_status
  for pid in "${!pids[@]}"; do
    # Still running: leave it in the table for a later pass.
    if kill -0 "${pid}" 2>/dev/null; then
      continue
    fi
    testcase="${pids[$pid]}"
    # Collect the exit status with `||` rather than toggling `set +e`/`set -e`,
    # so a failing suite cannot abort the script and the caller's errexit
    # setting is left untouched.
    wait_status=0
    wait "${pid}" || wait_status=$?
    if [ "${wait_status}" -eq 0 ]; then
      success_test_cases+=("${testcase}")
      success_test_count=$((success_test_count + 1))
      echo "Test case ${testcase} succeeded"
    else
      failed_test_cases+=("${testcase}")
      failed_test_count=$((failed_test_count + 1))
      echo "Test case ${testcase} failed"
    fi
    unset 'pids[$pid]'
  done
}
210+
211+
while [ "${testcase_index}" -lt "${overall_test_count}" ] || [ "${#pids[@]}" -gt 0 ]; do
212+
while [ "${testcase_index}" -lt "${overall_test_count}" ] && [ "${#pids[@]}" -lt "${MAX_PARALLEL}" ]; do
213+
testcase="${ALL_TESTCASES[$testcase_index]}"
214+
testcase_index=$((testcase_index + 1))
215+
echo "Running test case: ${testcase} (${testcase_index}/${overall_test_count}," \
216+
"max ${MAX_PARALLEL} concurrent)"
217+
"/home/e2e/tests/run-test.sh" "${testcase}" &
218+
pids[$!]="${testcase}"
219+
done
220+
221+
if [ "${#pids[@]}" -eq 0 ]; then
222+
continue
196223
fi
224+
225+
set +e
226+
wait -n 2>/dev/null
227+
set -e
228+
reap_finished_test_jobs
197229
done
198230
if [[ "$failed_test_count" -gt 0 ]]; then
199231
exit 1

0 commit comments

Comments
 (0)