Skip to content

Commit 7bc4f0b

Browse files
author
Nissan Pow
committed
ci: reduce resource pressure on GHA runners
- Lower minikube memory from 6GB to 4GB (leaves 3GB for host processes)
- Reduce parallel workers: argo 2→1, airflow 2→1, sfn 4→2
- Add pytest-rerunfailures with --reruns 1 for transient infra flakes
- Add background pod cleanup loop during argo/airflow test runs
1 parent 4e77e43 commit 7bc4f0b

1 file changed

Lines changed: 14 additions & 6 deletions

File tree

.github/workflows/ux-tests.yml

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,13 @@ jobs:
102102
workers: 4
103103
- backend: argo-kubernetes
104104
services: "minio,postgresql,metadata-service,argo-workflows"
105-
workers: 2
105+
workers: 1
106106
- backend: airflow-kubernetes
107107
services: "minio,postgresql,metadata-service,airflow"
108-
workers: 2
108+
workers: 1
109109
- backend: sfn-batch
110110
services: "minio,postgresql,metadata-service,localbatch,ddb-local,sfn-local"
111-
workers: 4
111+
workers: 2
112112

113113
runs-on: ubuntu-latest
114114

@@ -130,15 +130,15 @@ jobs:
130130
- name: Install Metaflow and test dependencies
131131
run: |
132132
pip install --upgrade pip
133-
pip install -e ".[dev]" pytest-xdist pytest-timeout pytest-cov
133+
pip install -e ".[dev]" pytest-xdist pytest-timeout pytest-cov pytest-rerunfailures
134134
pip install "git+https://github.com/npow/localbatch.git@main#egg=localbatch"
135135
136136
- name: Set up minikube
137137
uses: medyagh/setup-minikube@aba8d5ff1666c72adf94ccd078b2ca12e7756382
138138
with:
139139
driver: docker
140140
cpus: 2
141-
memory: 6144
141+
memory: 4096
142142

143143
- name: Restore minikube image cache
144144
id: image-cache
@@ -236,11 +236,17 @@ jobs:
236236
if: matrix.backend == 'airflow-kubernetes'
237237
run: devtools/ci/wait-airflow-api.sh
238238

239-
- name: Clean up completed pods (argo/airflow only)
239+
- name: Clean up completed pods and start background cleanup
240240
if: matrix.backend == 'argo-kubernetes' || matrix.backend == 'airflow-kubernetes'
241241
run: |
242242
kubectl delete pods --field-selector=status.phase=Succeeded --all-namespaces 2>/dev/null || true
243243
kubectl delete pods --field-selector=status.phase=Failed --all-namespaces 2>/dev/null || true
244+
# Periodically clean up completed pods during test runs to free cluster resources
245+
while true; do
246+
sleep 60
247+
kubectl delete pods --field-selector=status.phase=Succeeded --all-namespaces 2>/dev/null || true
248+
kubectl delete pods --field-selector=status.phase=Failed --all-namespaces 2>/dev/null || true
249+
done &
244250
245251
- name: Run UX tests — ${{ matrix.backend }}
246252
run: |
@@ -253,6 +259,8 @@ jobs:
253259
-v \
254260
--tb=short \
255261
--timeout=1800 \
262+
--reruns 1 \
263+
--reruns-delay 10 \
256264
--cov=metaflow \
257265
--cov-report=term-missing \
258266
--cov-report=xml:coverage.xml \

0 commit comments

Comments
 (0)