Skip to content

Commit 0990f76

Browse files
committed
feat: add FL stack (coordinator/client) with 100% test coverage, strict CI, and NetworkPolicies
1 parent 96e6aca commit 0990f76

File tree

18 files changed

+1789
-0
lines changed

18 files changed

+1789
-0
lines changed

.github/workflows/ci-fl.yml

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
# CI for the federated-learning stack: coordinator service, client service,
# the fl-coordinator Helm chart, and this workflow file itself.
name: FL CI

on:
  # Pushes only trigger the workflow when FL-related paths change.
  push:
    branches: [master, main]
    paths:
      - "services/fl-coordinator/**"
      - "services/fl-client/**"
      - "helm-charts/fl-coordinator/**"
      - ".github/workflows/ci-fl.yml"
  # Pull requests have no path filter, so every PR targeting these branches
  # runs the workflow.
  pull_request:
    branches: [master, main]

# Least privilege: the jobs in this workflow only check out and build code,
# so the workflow token is restricted to read-only repository contents.
permissions:
  contents: read
13+
14+
jobs:
15+
# ── FL Coordinator ──────────────────────────────────────────────────────────
16+
test-fl-coordinator:
  # Tests and static checks for the coordinator service. Every `run` step
  # executes inside services/fl-coordinator (see `defaults` below).
  runs-on: ubuntu-latest
  name: Test FL Coordinator
  defaults:
    run:
      working-directory: services/fl-coordinator
  steps:
    - uses: actions/checkout@v4

    - uses: actions/setup-python@v5
      with:
        cache: pip
        python-version: "3.11"

    - name: Install dependencies
      run: pip install -r requirements.txt pytest-cov

    # The step fails unless coverage of the `main` module is 100%.
    - name: Run tests with coverage
      run: |
        python -m pytest tests/ -v --tb=short \
          --cov=main --cov-report=term-missing \
          --cov-fail-under=100

    # Both lint rules and formatting are enforced on main.py.
    - name: Lint
      run: |
        pip install ruff
        ruff check main.py
        ruff format --check main.py

    - name: Type check
      run: |
        pip install mypy
        mypy --strict main.py

    - name: Security scan
      run: |
        pip install bandit
        bandit -r main.py -ll
54+
55+
# ── FL Client ───────────────────────────────────────────────────────────────
56+
test-fl-client:
  # Tests and static checks for the client service. Every `run` step
  # executes inside services/fl-client (see `defaults` below).
  runs-on: ubuntu-latest
  name: Test FL Client
  defaults:
    run:
      working-directory: services/fl-client
  steps:
    - uses: actions/checkout@v4

    - uses: actions/setup-python@v5
      with:
        cache: pip
        python-version: "3.11"

    - name: Install dependencies
      run: pip install -r requirements.txt pytest-cov

    # The step fails unless coverage of the `main` module is 100%.
    - name: Run tests with coverage
      run: |
        python -m pytest tests/ -v --tb=short \
          --cov=main --cov-report=term-missing \
          --cov-fail-under=100

    # Both lint rules and formatting are enforced on main.py.
    - name: Lint
      run: |
        pip install ruff
        ruff check main.py
        ruff format --check main.py

    - name: Type check
      run: |
        pip install mypy
        mypy --strict main.py

    - name: Security scan
      run: |
        pip install bandit
        bandit -r main.py -ll
94+
95+
# ── Helm Chart ──────────────────────────────────────────────────────────────
96+
validate-helm:
  # Lints the fl-coordinator chart and renders it with the same overrides,
  # so template errors are caught before anything is deployed.
  name: Validate Helm Chart
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4

    - name: Install Helm
      uses: azure/setup-helm@v4

    - name: Lint fl-coordinator chart
      run: |
        helm lint helm-charts/fl-coordinator \
          --set coordinator.image=ghcr.io/aliipou/fl-coordinator \
          --set coordinator.tag=latest \
          --set coordinator.minClients=2

    - name: Dry-run template render
      run: |
        # Render to a file first so that a failing `helm template` fails the
        # step. Previously the render was piped into kubectl and the whole
        # pipeline ended in `|| true`, so rendering errors could never fail CI.
        rendered="${RUNNER_TEMP}/fl-coordinator.yaml"
        helm template fl-coordinator helm-charts/fl-coordinator \
          --set coordinator.image=ghcr.io/aliipou/fl-coordinator \
          --set coordinator.tag=latest \
          --set coordinator.minClients=2 \
          > "${rendered}"

        # The kubectl dry-run stays best-effort, as in the original step
        # (NOTE(review): presumably because the runner has no cluster to talk
        # to - confirm). Its output is now kept and surfaced as a warning
        # instead of being sent to /dev/null.
        if ! kubectl apply --dry-run=client -f "${rendered}"; then
          echo "::warning::kubectl client dry-run failed; see the step log for details"
        fi
119+
120+
# ── Docker Images ───────────────────────────────────────────────────────────
121+
build-images:
  # Builds both service images, tagged with the commit SHA. Runs only after
  # both test jobs have succeeded; nothing is pushed to a registry here.
  name: Build Docker Images
  runs-on: ubuntu-latest
  needs: [test-fl-coordinator, test-fl-client]
  steps:
    - uses: actions/checkout@v4

    - name: Build FL Coordinator
      run: |
        docker build -t fl-coordinator:${{ github.sha }} services/fl-coordinator/

    - name: Build FL Client
      run: |
        docker build -t fl-client:${{ github.sha }} services/fl-client/

    # `docker image inspect` exits non-zero when the reference is missing.
    # `docker images <ref>` just prints an empty table and exits 0, so it
    # could never fail this step.
    - name: Confirm images exist
      run: |
        docker image inspect fl-coordinator:${{ github.sha }} > /dev/null
        docker image inspect fl-client:${{ github.sha }} > /dev/null
        docker images fl-coordinator:${{ github.sha }}
        docker images fl-client:${{ github.sha }}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Chart metadata for the fl-coordinator Helm chart.
apiVersion: v2
type: application
name: fl-coordinator
description: Federated Learning coordinator for multi-tenant Kubernetes SaaS

# Chart version, and the version of the application the chart deploys.
version: "1.0.0"
appVersion: "1.0.0"

keywords:
  - federated-learning
  - kubernetes
  - multi-tenancy

maintainers:
  - email: ali.pourrahim@centria.fi
    name: Ali Pourrahim
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# Deployment for the FL coordinator.
#
# Templated string values are quoted so that an empty or number/boolean-like
# value still renders as a YAML string. Port numbers are deliberately left
# unquoted because these fields take integers.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: fl-coordinator
  namespace: {{ .Values.namespace | quote }}
  labels:
    app: fl-coordinator
    component: federated-learning
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: {{ .Values.coordinator.replicaCount }}
  selector:
    matchLabels:
      app: fl-coordinator
  template:
    metadata:
      labels:
        app: fl-coordinator
        component: federated-learning
      annotations:
        # Allow Prometheus to scrape the /metrics endpoint automatically
        prometheus.io/scrape: "true"
        prometheus.io/port: {{ .Values.coordinator.port | quote }}
        prometheus.io/path: "/metrics"
    spec:
      # Pod-level identity: run as the unprivileged UID/GID 1001.
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
        fsGroup: 1001
      containers:
        - name: fl-coordinator
          image: "{{ .Values.coordinator.image }}:{{ .Values.coordinator.tag }}"
          imagePullPolicy: {{ .Values.coordinator.pullPolicy | quote }}
          ports:
            - containerPort: {{ .Values.coordinator.port }}
              name: http
          env:
            # Environment values must be strings, hence the quote filter.
            - name: FL_MIN_CLIENTS
              value: {{ .Values.coordinator.minClients | quote }}
            - name: FL_SHARED_SECRET
              # Secret loaded from Kubernetes Secret — never hardcoded in values
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.sharedSecretName | quote }}
                  key: {{ .Values.sharedSecretKey | quote }}
          resources:
            requests:
              cpu: {{ .Values.coordinator.resources.requests.cpu | quote }}
              memory: {{ .Values.coordinator.resources.requests.memory | quote }}
            limits:
              cpu: {{ .Values.coordinator.resources.limits.cpu | quote }}
              memory: {{ .Values.coordinator.resources.limits.memory | quote }}
          # Liveness and readiness both probe /health on the coordinator port;
          # readiness polls more often and uses a shorter timeout.
          livenessProbe:
            httpGet:
              path: /health
              port: {{ .Values.coordinator.port }}
            initialDelaySeconds: 10
            periodSeconds: 30
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /health
              port: {{ .Values.coordinator.port }}
            initialDelaySeconds: 5
            periodSeconds: 10
            timeoutSeconds: 3
            failureThreshold: 3
          # Container hardening: no privilege escalation, read-only root
          # filesystem, and all Linux capabilities dropped.
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# NetworkPolicy for the FL coordinator.
2+
#
3+
# Default posture: deny all ingress.
4+
# Explicit allow rules:
5+
# 1. FL clients in namespaces labelled fl-enabled=true
6+
# 2. Prometheus scraping from the monitoring namespace
7+
#
8+
# FL clients can only reach the coordinator — they cannot reach other tenant
9+
# namespaces because the per-tenant default-deny policy blocks that egress.
10+
11+
---
12+
# Allow fl-client pods in opted-in tenant namespaces to reach the coordinator
13+
# Ingress policy for the coordinator pods. Because policyTypes lists Ingress,
# only the sources named in the rules below can reach pods labelled
# app=fl-coordinator.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: fl-coordinator-ingress-from-clients
  # Quoted so an empty or numeric-looking namespace still renders as a string.
  namespace: {{ .Values.namespace | quote }}
spec:
  podSelector:
    matchLabels:
      app: fl-coordinator
  policyTypes:
    - Ingress
  ingress:
    # Rule 1: fl-client pods in opted-in tenant namespaces. namespaceSelector
    # and podSelector sit in the same `from` entry, so both must match.
    - from:
        - namespaceSelector:
            matchLabels:
              fl-enabled: {{ .Values.tenantFlEnabledLabel | quote }}
          podSelector:
            matchLabels:
              component: fl-client
      ports:
        - protocol: TCP
          port: {{ .Values.coordinator.port }}
    # Rule 2: Prometheus scraping. Any pod in a namespace labelled
    # purpose=monitoring may connect to the coordinator port.
    - from:
        - namespaceSelector:
            matchLabels:
              purpose: monitoring
      ports:
        - protocol: TCP
          port: {{ .Values.coordinator.port }}
43+
{{- /*
  Per-tenant fl-client egress policies.

  The optional values key `tenantNamespaces` is a list of tenant namespace
  names; one NetworkPolicy is rendered into each of them. When the key is
  unset or empty, a single policy is rendered into .Values.namespace, which
  is exactly what this template produced before the list was supported.
*/}}
{{- $tenantNamespaces := .Values.tenantNamespaces | default (list .Values.namespace) }}
{{- range $tenantNs := $tenantNamespaces }}
---
# fl-client egress: only to the coordinator, not to other tenant namespaces.
# A NetworkPolicy only selects pods in its own namespace, so one copy of this
# policy is rendered per tenant namespace.
# The tenant namespace must be labelled: fl-enabled=true
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: fl-client-egress-to-coordinator
  namespace: {{ $tenantNs | quote }}
spec:
  podSelector:
    matchLabels:
      component: fl-client
  policyTypes:
    - Egress
  egress:
    # Only allow egress to the fl-coordinator pod. The coordinator namespace
    # is matched by label, so it must carry
    # purpose=federated-learning-control-plane.
    - to:
        - namespaceSelector:
            matchLabels:
              purpose: federated-learning-control-plane
          podSelector:
            matchLabels:
              app: fl-coordinator
      ports:
        - protocol: TCP
          port: {{ $.Values.coordinator.port }}
    # Allow DNS resolution (required for service discovery). This rule has no
    # `to` selector, so port 53 is reachable on any destination.
    - ports:
        - protocol: UDP
          port: 53
        - protocol: TCP
          port: 53
{{- end }}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# ClusterIP Service in front of the coordinator pods. The Service port and
# the target port are both the configured coordinator port.
apiVersion: v1
kind: Service
metadata:
  name: fl-coordinator
  # Quoted so an empty or numeric-looking namespace still renders as a string.
  namespace: {{ .Values.namespace | quote }}
  labels:
    app: fl-coordinator
    component: federated-learning
spec:
  type: ClusterIP
  # Routes to the pods created by the fl-coordinator Deployment.
  selector:
    app: fl-coordinator
  ports:
    - name: http
      protocol: TCP
      port: {{ .Values.coordinator.port }}
      targetPort: {{ .Values.coordinator.port }}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# fl-coordinator Helm values
# Deploy the FedAvg aggregation server into the fl-system namespace.

coordinator:
  # Container image for the FL coordinator service
  image: ghcr.io/aliipou/fl-coordinator
  tag: latest
  # `latest` is a mutable tag: with IfNotPresent a node keeps running whatever
  # image it cached first and never picks up a newer push. Pull on every pod
  # start while the default tag is mutable; IfNotPresent is fine once `tag`
  # is pinned to an immutable version or digest.
  pullPolicy: Always

  # Minimum number of tenant clients that must submit before aggregation fires
  minClients: 2

  # Port the coordinator listens on (also used by Service and probes)
  port: 8080

  resources:
    requests:
      cpu: "100m"
      memory: "128Mi"
    limits:
      cpu: "500m"
      memory: "512Mi"

  # Replica count — keep at 1 (state is in-memory; HA requires external state store)
  replicaCount: 1

# Name of the Kubernetes Secret that holds the FL shared secret, and the key
# inside it that is exposed to the container as FL_SHARED_SECRET.
# Created externally: kubectl create secret generic fl-shared-secret \
#   --from-literal=secret=$(openssl rand -hex 32) -n fl-system
sharedSecretName: fl-shared-secret
sharedSecretKey: secret

# Namespace where the coordinator and the fl-shared-secret live
namespace: fl-system

# Value of the `fl-enabled` label that a tenant namespace must carry to be
# allowed to reach the coordinator.
# Tenant namespaces must be labelled: fl-enabled=true
tenantFlEnabledLabel: "true"

services/fl-client/Dockerfile

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Runtime image for the FL client: a single-file Python application that is
# run as an unprivileged user.
FROM python:3.11-slim

# Send Python output straight to the container log without buffering.
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# Unprivileged account with a fixed UID, referenced numerically by USER below.
RUN adduser --uid 1001 --disabled-password --gecos "" appuser

# Dependencies are installed before the source is copied, so the pip layer is
# rebuilt only when requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY main.py .

USER 1001

CMD ["python", "main.py"]

0 commit comments

Comments
 (0)