51 changes: 51 additions & 0 deletions .github/workflows/integration-tests.yml
@@ -22,3 +22,54 @@ jobs:
          docker compose -f docker-compose.yml -f docker-compose.integration.yml up --build --exit-code-from integration-tests integration-tests || status=$?
          docker compose -f docker-compose.yml -f docker-compose.integration.yml down -v
          exit $status

  k8s-smoke:
    runs-on: ubuntu-latest
    needs: integration
    steps:
      - uses: actions/checkout@v4

      - name: Create kind cluster
        uses: helm/kind-action@v1
        with:
          wait: 120s
          cluster_name: otel-lgtm

      - name: Build demo images for local overlay
        run: |
          docker build -t space-app:latest app
          docker build -t loadgen:latest loadgen
          kind load docker-image space-app:latest --name otel-lgtm
          kind load docker-image loadgen:latest --name otel-lgtm

      - name: Deploy observability stack to kind
        run: |
          kubectl kustomize k8s/base >/dev/null
          kubectl kustomize k8s/overlays/local >/dev/null
          kubectl apply -k k8s/overlays/local

      - name: Wait for deployments to become ready
        run: |
          kubectl wait --namespace observability --for=condition=Available deployment --all --timeout=5m

      - name: Smoke test FastAPI service
        run: |
          set -euo pipefail
          # --rm --attach makes kubectl wait for the container and propagate curl's
          # exit code; without --attach the step would succeed as soon as the pod
          # was created, regardless of whether curl failed. --rm also removes the
          # pod afterwards, so no explicit delete is needed.
          kubectl run curl --namespace observability --restart=Never --rm --attach --image=curlimages/curl:8.5.0 --command -- curl -fsS http://space-app:8000/ || {
            kubectl logs --namespace observability deploy/space-app || true
            kubectl logs --namespace observability deploy/otelcol || true
            exit 1
          }

      - name: Collect diagnostics on failure
        if: failure()
        run: |
          kubectl get pods -n observability
          kubectl describe pods -n observability
          kubectl logs -n observability deploy/otelcol

      - name: Tear down stack
        if: always()
        run: |
          kubectl delete -k k8s/overlays/local
17 changes: 17 additions & 0 deletions Makefile
@@ -1,6 +1,7 @@
SHELL := /bin/bash
COMPOSE ?= docker compose
COMPOSE_FILES := -f docker-compose.yml -f docker-compose.integration.yml
KUBECTL ?= kubectl

.PHONY: integration-test
integration-test:
@@ -12,3 +13,19 @@ integration-test:

.PHONY: test
test: integration-test

.PHONY: k8s-apply-local
k8s-apply-local:
	$(KUBECTL) apply -k k8s/overlays/local

.PHONY: k8s-delete-local
k8s-delete-local:
	$(KUBECTL) delete -k k8s/overlays/local

.PHONY: k8s-apply-production
k8s-apply-production:
	$(KUBECTL) apply -k k8s/overlays/production

.PHONY: k8s-delete-production
k8s-delete-production:
	$(KUBECTL) delete -k k8s/overlays/production
115 changes: 113 additions & 2 deletions README.md
@@ -48,12 +48,14 @@ flowchart LR

## Steps to Run Application

### Prerequisites
### Docker/Podman Compose

#### Prerequisites

- Docker and Docker Compose (or Podman and Podman Compose)
- Git

### Steps to Run
#### Steps to Run

This setup works with both Docker Compose and Podman Compose. Use `docker-compose` or `podman-compose` commands as appropriate for your environment.

@@ -120,6 +122,115 @@
podman-compose down
```

### Kubernetes via Kustomize

The repository also provides a Kubernetes deployment that mirrors the compose stack. All manifests live under `k8s/` and are structured as a reusable base plus environment-specific overlays.

#### Directory layout

- `k8s/base` – Deployments, Services, PersistentVolumeClaims, ConfigMaps, and the `grafana-admin` Secret that together stand up Grafana, Loki, Tempo, Prometheus, the OpenTelemetry Collector, the FastAPI app, and the load generator.
- `k8s/base/files` – Checked-in copies of the configuration files used by compose. Keep these files in sync with the originals when you change Loki/Tempo/Prometheus/Grafana settings.
- `k8s/overlays/local` – Targets local development clusters. It swaps the app/load generator images to the locally built tags and disables image pulls, making it ideal for `kind`, `k3d`, or Minikube.
- `k8s/overlays/production` – Provides templates for cloud clusters. It adds resource requests/limits, sets a sample storage class, promotes Grafana to a `LoadBalancer` Service, and defines placeholder Ingress objects for TLS termination.
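
As an illustration, the local overlay's `kustomization.yaml` could look roughly like the sketch below (field values are assumptions; the file under `k8s/overlays/local` is authoritative):

```yaml
# k8s/overlays/local/kustomization.yaml (illustrative sketch)
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: observability
resources:
  - ../../base
# Point the app and load generator at the locally built tags
images:
  - name: space-app
    newTag: latest
  - name: loadgen
    newTag: latest
# Skip registry pulls so kind/k3d/Minikube use the loaded images
patches:
  - patch: |-
      - op: replace
        path: /spec/template/spec/containers/0/imagePullPolicy
        value: Never
    target:
      kind: Deployment
      name: space-app
```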

#### Managing Grafana credentials

The base manifest generates a `grafana-admin` Secret with the same admin/admin defaults as compose. Before deploying to a shared environment, replace it:

```bash
kubectl create secret generic grafana-admin \
--namespace observability \
--from-literal=GF_SECURITY_ADMIN_USER=your-admin \
--from-literal=GF_SECURITY_ADMIN_PASSWORD='strong-password' \
--dry-run=client -o yaml | kubectl apply -f -
```

You can also manage the Secret declaratively with a `secretGenerator` entry in an overlay's `kustomization.yaml`, setting `generatorOptions: disableNameSuffixHash: true` so the generated Secret keeps the fixed `grafana-admin` name that the Deployment references.
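
A declarative sketch of that approach (an assumed overlay fragment, mirroring the imperative command above):

```yaml
# Hypothetical overlay kustomization.yaml fragment
secretGenerator:
  - name: grafana-admin
    namespace: observability
    literals:
      - GF_SECURITY_ADMIN_USER=your-admin
      - GF_SECURITY_ADMIN_PASSWORD=strong-password
generatorOptions:
  disableNameSuffixHash: true  # keep the fixed name the Deployment references
```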

#### Local clusters (kind, k3d, Minikube)

1. Install `kubectl` and a local Kubernetes distribution (`kind`, `k3d`, or `minikube`).
2. Build the application images and tag them as expected by the overlay:

```bash
docker build -t space-app:latest app
docker build -t loadgen:latest loadgen
```

3. Load the images into your cluster (examples shown for `kind` and Minikube):

```bash
kind load docker-image space-app:latest
kind load docker-image loadgen:latest
# or for Minikube
minikube image load space-app:latest
minikube image load loadgen:latest
```

4. Apply the manifests:

```bash
make k8s-apply-local
# equivalent to: kubectl apply -k k8s/overlays/local
```

5. Wait for workloads to become ready:

```bash
kubectl get pods -n observability
```

6. Port-forward to reach the services from your workstation:

```bash
kubectl port-forward -n observability svc/grafana 3000:3000
kubectl port-forward -n observability svc/space-app 8000:8000
kubectl port-forward -n observability svc/prometheus 9090:9090
```

7. Tear the stack down when finished:

```bash
make k8s-delete-local
```

#### Production and cloud clusters (GKE, EKS, AKS, bare metal)

1. Copy `k8s/overlays/production` and adjust it to match your infrastructure:
- Update `patches/storage-class.yaml` with the correct `storageClassName` for your cluster.
- Swap the annotations in `patches/grafana-service.yaml` for the load balancer you use (AWS, GCP, MetalLB, etc.).
- Edit `ingress.yaml` with the hostnames/TLS secrets that your ingress controller expects.
- Override the container images to point at the registry where you publish the FastAPI app and load generator (for example via `kustomize edit set image`).
2. Rotate the Grafana admin credentials as shown above or manage them through your preferred secret store.
3. Deploy with:

```bash
make k8s-apply-production
# or: kubectl apply -k k8s/overlays/production
```

4. Integrate the overlay with GitOps or CI pipelines as needed. The manifests are compatible with both `kubectl` and Argo CD/Flux.

To clean up the production overlay from a cluster, run `make k8s-delete-production`.
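
The image override from step 1 can also be pinned declaratively in the copied overlay rather than via `kustomize edit set image`. A sketch, with placeholder registry and tags:

```yaml
# Fragment of a copied production kustomization.yaml
# (registry.example.com and v1.0.0 are placeholders)
images:
  - name: space-app
    newName: registry.example.com/space-app
    newTag: v1.0.0
  - name: loadgen
    newName: registry.example.com/loadgen
    newTag: v1.0.0
```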

#### Helpful commands

- Preview the rendered manifests before applying:

```bash
kubectl kustomize k8s/overlays/local | less
kubectl kustomize k8s/overlays/production | less
```

- Check the health of the running stack:

```bash
kubectl get pods,svc,pvc -n observability
kubectl logs -n observability deploy/otelcol
```

If you change any of the configuration files under `grafana/`, `otel-collector/`, `tempo/`, `loki/`, or `prometheus/`, copy the edits into `k8s/base/files` to keep the Kubernetes ConfigMaps aligned with the compose setup.
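
One way the base can wire those checked-in files into ConfigMaps is a `configMapGenerator`; the fragment below is a sketch (the actual base may declare the ConfigMaps differently):

```yaml
# Hypothetical fragment of k8s/base/kustomization.yaml
configMapGenerator:
  - name: grafana-provisioning-datasources
    files:
      - files/grafana/provisioning/datasources/datasources.yml
  - name: grafana-dashboards
    files:
      - files/grafana/dashboards/otel_mvp.json
```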

## Additional Notes

- The load generator service will automatically start generating traffic to the FastAPI application
51 changes: 51 additions & 0 deletions k8s/base/files/grafana/dashboards/otel_mvp.json
@@ -0,0 +1,51 @@
{
  "schemaVersion": 39,
  "title": "OTel MVP – Logs • Traces • Metrics",
  "panels": [
    {
      "type": "timeseries",
      "title": "Requests /s",
      "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
      "targets": [
        {
          "datasource": {"type": "prometheus", "uid": "prometheus"},
          "expr": "sum(rate(app_requests_total[1m]))",
          "legendFormat": "req/s"
        }
      ]
    },
    {
      "type": "timeseries",
      "title": "Latency p95 (ms)",
      "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
      "targets": [
        {
          "datasource": {"type": "prometheus", "uid": "prometheus"},
          "expr": "histogram_quantile(0.95, sum(rate(app_request_duration_ms_bucket[5m])) by (le))",
          "legendFormat": "p95"
        }
      ]
    },
    {
      "type": "stat",
      "title": "Errors /min",
      "gridPos": {"x": 0, "y": 8, "w": 6, "h": 6},
      "targets": [
        {
          "datasource": {"type": "prometheus", "uid": "prometheus"},
          "expr": "sum(rate(app_request_errors_total[1m])) * 60"
        }
      ]
    },
    {
      "type": "logs",
      "title": "Logs (space-app)",
      "gridPos": {"x": 6, "y": 8, "w": 18, "h": 12},
      "datasource": {"type": "loki", "uid": "loki"},
      "targets": [
        {"expr": "{service_name=\"space-app\"}"}
      ]
    }
  ],
  "time": {"from": "now-2h", "to": "now"}
}
50 changes: 50 additions & 0 deletions k8s/base/files/grafana/provisioning/alerting/alerts.yml
@@ -0,0 +1,50 @@
apiVersion: 1

contactPoints:
  - orgId: 1
    name: console
    receivers:
      - uid: console
        type: webhook
        settings:
          url: http://localhost:65535/nowhere

policies:
  - orgId: 1
    receiver: console

rules:
  - orgId: 1
    name: High error rate (space-app)
    folder: Alerts
    interval: 30s
    condition: B
    data:
      - refId: A
        datasourceUid: prometheus
        relativeTimeRange:
          from: 600
          to: 0
        model:
          expr: sum(rate(app_request_errors_total[1m]))
          interval: ""
          legendFormat: "errors/s"
          refId: A
      - refId: B
        datasourceUid: __expr__
        model:
          type: threshold
          expression: A
          conditions:
            - evaluator:
                params: [0.0167]
                type: gt
              operator:
                type: and
              reducer:
                type: last
              type: query
    for: 2m
    annotations:
      summary: "Error rate >= 1/min over 2m"
      description: "space-app is erroring. Check /error endpoint & logs."
9 changes: 9 additions & 0 deletions k8s/base/files/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,9 @@
apiVersion: 1
providers:
  - name: otel-mvp
    orgId: 1
    type: file
    disableDeletion: false
    editable: true
    options:
      path: /var/lib/grafana/dashboards
37 changes: 37 additions & 0 deletions k8s/base/files/grafana/provisioning/datasources/datasources.yml
@@ -0,0 +1,37 @@
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    uid: prometheus

  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    uid: loki
    jsonData:
      derivedFields:
        - name: traceID
          matcherRegex: "trace_id=(\\w+)"
          datasourceUid: tempo
          urlDisplayLabel: "View trace"

  - name: Tempo
    type: tempo
    access: proxy
    url: http://tempo:3200
    uid: tempo
    jsonData:
      nodeGraph:
        enabled: true
      tracesToLogsV2:
        datasourceUid: loki
        spanStartTimeShift: "-10m"
        spanEndTimeShift: "10m"
        filterByTraceID: true
      useGRPC: false  # force HTTP instead of gRPC
      # (optional) httpMethod: GET