diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index e926cd3..c4b1384 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -22,3 +22,56 @@ jobs:
           docker compose -f docker-compose.yml -f docker-compose.integration.yml up --build --exit-code-from integration-tests integration-tests || status=$?
           docker compose -f docker-compose.yml -f docker-compose.integration.yml down -v
           exit $status
+
+  k8s-smoke:
+    runs-on: ubuntu-latest
+    needs: integration
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Create kind cluster
+        uses: helm/kind-action@v1
+        with:
+          wait: 120s
+          cluster_name: otel-lgtm
+
+      - name: Build demo images for local overlay
+        run: |
+          docker build -t space-app:latest app
+          docker build -t loadgen:latest loadgen
+          kind load docker-image space-app:latest --name otel-lgtm
+          kind load docker-image loadgen:latest --name otel-lgtm
+
+      - name: Deploy observability stack to kind
+        run: |
+          kubectl kustomize k8s/base >/dev/null
+          kubectl kustomize k8s/overlays/local >/dev/null
+          kubectl apply -k k8s/overlays/local
+
+      - name: Wait for deployments to become ready
+        run: |
+          kubectl wait --namespace observability --for=condition=Available deployment --all --timeout=5m
+
+      - name: Smoke test FastAPI service
+        run: |
+          set -euo pipefail
+          # --attach waits for the pod and propagates curl's exit code; without it
+          # `kubectl run` succeeds as soon as the pod object is created and the || branch can never fire.
+          kubectl run curl --namespace observability --restart=Never --attach --image=curlimages/curl:8.5.0 --command -- curl -fsS http://space-app:8000/ || {
+            kubectl logs --namespace observability deploy/space-app || true
+            kubectl logs --namespace observability deploy/otelcol || true
+            exit 1
+          }
+          kubectl delete pod curl --namespace observability --wait=true
+
+      - name: Collect diagnostics on failure
+        if: failure()
+        run: |
+          kubectl get pods -n observability
+          kubectl describe pods -n observability
+          kubectl logs -n observability deploy/otelcol
+
+      - name: Tear down stack
+        if: always()
+        run: |
+          kubectl delete -k k8s/overlays/local
diff --git 
a/Makefile b/Makefile index 89d64c2..d77a7d9 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ SHELL := /bin/bash COMPOSE ?= docker compose COMPOSE_FILES := -f docker-compose.yml -f docker-compose.integration.yml +KUBECTL ?= kubectl .PHONY: integration-test integration-test: @@ -12,3 +13,19 @@ integration-test: .PHONY: test test: integration-test + +.PHONY: k8s-apply-local +k8s-apply-local: + $(KUBECTL) apply -k k8s/overlays/local + +.PHONY: k8s-delete-local +k8s-delete-local: + $(KUBECTL) delete -k k8s/overlays/local + +.PHONY: k8s-apply-production +k8s-apply-production: + $(KUBECTL) apply -k k8s/overlays/production + +.PHONY: k8s-delete-production +k8s-delete-production: + $(KUBECTL) delete -k k8s/overlays/production diff --git a/README.md b/README.md index 8a3af2c..b4ddf48 100644 --- a/README.md +++ b/README.md @@ -48,12 +48,14 @@ flowchart LR ## Steps to Run Application -### Prerequisites +### Docker/Podman Compose + +#### Prerequisites - Docker and Docker Compose (or Podman and Podman Compose) - Git -### Steps to Run +#### Steps to Run This setup works with both Docker Compose and Podman Compose. Use `docker-compose` or `podman-compose` commands as appropriate for your environment. @@ -120,6 +122,199 @@ This setup works with both Docker Compose and Podman Compose. Use `docker-compos podman-compose down ``` +### Kubernetes via Kustomize + +The repository also provides a Kubernetes deployment that mirrors the compose stack. All manifests live under `k8s/` and are structured as a reusable base plus environment-specific overlays. + +#### Directory layout + +- `k8s/base` – Deployments, Services, PersistentVolumeClaims, ConfigMaps, and the `grafana-admin` Secret that together stand up Grafana, Loki, Tempo, Prometheus, the OpenTelemetry Collector, the FastAPI app, and the load generator. +- `k8s/base/files` – Checked-in copies of the configuration files used by compose. 
Keep these files in sync with the originals when you change Loki/Tempo/Prometheus/Grafana settings. +- `k8s/overlays/local` – Targets local development clusters. It swaps the app/load generator images to the locally built tags and disables image pulls, making it ideal for `kind`, `k3d`, or Minikube. +- `k8s/overlays/production` – Provides templates for cloud clusters. It adds resource requests/limits, sets a sample storage class, promotes Grafana to a `LoadBalancer` Service, and defines placeholder Ingress objects for TLS termination. +- `docs/k8s-manifests.md` – Deep dive into every manifest with links back to the official Kubernetes documentation for further reading. + +#### Managing Grafana credentials + +The base manifest generates a `grafana-admin` Secret with the same admin/admin defaults as compose. Before deploying to a shared environment, replace it: + +```bash +kubectl create secret generic grafana-admin \ + --namespace observability \ + --from-literal=GF_SECURITY_ADMIN_USER=your-admin \ + --from-literal=GF_SECURITY_ADMIN_PASSWORD='strong-password' \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +You can also use `kustomize edit set secret --disable-name-suffix-hash grafana-admin ...` inside an overlay if you prefer the Secret to be managed declaratively. + +#### macOS quickstart (kind + Docker Desktop or Podman) + +These steps were tested end-to-end on a macOS host using `kind` v0.26.0 and Podman 5.5.2. Substitute Docker Desktop if that is your preferred container runtime. + +1. **Install prerequisites** + - [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl-macos/) for cluster interaction. + - [kind](https://kind.sigs.k8s.io/docs/user/quick-start/#installation) to provision a local Kubernetes cluster in containers. + - Either [Docker Desktop](https://docs.docker.com/desktop/install/mac-install/) or [Podman Desktop](https://podman.io/docs/installation#macos) as the container engine. 
When using Podman, make sure `podman machine` is running (`podman machine start`). + +2. **Clone the repository and move into it** + + ```bash + git clone https://github.com/hyzhak/otel-lgtm-mvp.git + cd otel-lgtm-mvp + ``` + +3. **Build the demo images** + + ```bash + # Docker Desktop + docker build -t space-app:latest app + docker build -t loadgen:latest loadgen + + # Podman (tested) + podman build -t space-app:latest app + podman build -t loadgen:latest loadgen + ``` + +4. **Create the kind cluster** + + ```bash + kind create cluster --name otel-lgtm --wait 2m + ``` + + `kind` automatically detects Docker, Podman, or Nerdctl. If you want to force a specific runtime set `KIND_EXPERIMENTAL_PROVIDER=docker|podman|nerdctl` before running the command (see the [kind quick-start guide](https://kind.sigs.k8s.io/docs/user/quick-start/)). + +5. **Load the local images into the cluster** + - When Docker is the active runtime, `kind load docker-image` works directly: + + ```bash + kind load docker-image space-app:latest --name otel-lgtm + kind load docker-image loadgen:latest --name otel-lgtm + ``` + + - With Podman rootless, push-style loading is not yet implemented, so tag the images for the Docker registry namespace and import an archive (workaround documented in the [kind Podman guide](https://kind.sigs.k8s.io/docs/user/rootless/)): + + ```bash + podman tag space-app:latest docker.io/library/space-app:latest + podman tag loadgen:latest docker.io/library/loadgen:latest + podman save --format docker-archive -o space-app.tar docker.io/library/space-app:latest + podman save --format docker-archive -o loadgen.tar docker.io/library/loadgen:latest + KIND_EXPERIMENTAL_PROVIDER=podman kind load image-archive space-app.tar --name otel-lgtm + KIND_EXPERIMENTAL_PROVIDER=podman kind load image-archive loadgen.tar --name otel-lgtm + ``` + +6. 
**Deploy the stack** + + ```bash + make k8s-apply-local + kubectl wait --namespace observability --for=condition=Available deployment --all --timeout=5m + ``` + +7. **Access the services** + - Forward ports from the cluster and open the dashboards locally: + + ```bash + kubectl port-forward -n observability svc/grafana 3000:3000 + kubectl port-forward -n observability svc/space-app 8000:8000 + ``` + + - Visit `http://localhost:3000` (Grafana) and `http://localhost:8000` (FastAPI). You can also run `open http://localhost:3000` on macOS. + +8. **Clean up** + + ```bash + make k8s-delete-local + # Docker Desktop + kind delete cluster --name otel-lgtm + # Podman provider + KIND_EXPERIMENTAL_PROVIDER=podman kind delete cluster --name otel-lgtm + rm -f space-app.tar loadgen.tar # remove the temporary archives if you created them + ``` + +#### Local clusters (kind, k3d, Minikube) + +1. Install `kubectl` and a local Kubernetes distribution (`kind`, `k3d`, or `minikube`). +2. Build the application images and tag them as expected by the overlay: + + ```bash + docker build -t space-app:latest app + docker build -t loadgen:latest loadgen + ``` + +3. Load the images into your cluster (examples shown for `kind` and Minikube): + + ```bash + kind load docker-image space-app:latest + kind load docker-image loadgen:latest + # or for Minikube + minikube image load space-app:latest + minikube image load loadgen:latest + ``` + +4. Apply the manifests: + + ```bash + make k8s-apply-local + # equivalent to: kubectl apply -k k8s/overlays/local + ``` + +5. Wait for workloads to become ready: + + ```bash + kubectl get pods -n observability + ``` + +6. Port-forward to reach the services from your workstation: + + ```bash + kubectl port-forward -n observability svc/grafana 3000:3000 + kubectl port-forward -n observability svc/space-app 8000:8000 + kubectl port-forward -n observability svc/prometheus 9090:9090 + ``` + +7. 
Tear the stack down when finished: + + ```bash + make k8s-delete-local + ``` + +#### Production and cloud clusters (GKE, EKS, AKS, bare metal) + +1. Copy `k8s/overlays/production` and adjust it to match your infrastructure: + - Update `patches/storage-class.yaml` with the correct `storageClassName` for your cluster. + - Swap the annotations in `patches/grafana-service.yaml` for the load balancer you use (AWS, GCP, MetalLB, etc.). + - Edit `ingress.yaml` with the hostnames/TLS secrets that your ingress controller expects. + - Override the container images to point at the registry where you publish the FastAPI app and load generator (for example via `kustomize edit set image`). +2. Rotate the Grafana admin credentials as shown above or manage them through your preferred secret store. +3. Deploy with: + + ```bash + make k8s-apply-production + # or: kubectl apply -k k8s/overlays/production + ``` + +4. Integrate the overlay with GitOps or CI pipelines as needed. The manifests are compatible with both `kubectl` and Argo CD/Flux. + +To clean up the production overlay from a cluster, run `make k8s-delete-production`. + +#### Helpful commands + +- Preview the rendered manifests before applying: + + ```bash + kubectl kustomize k8s/overlays/local | less + kubectl kustomize k8s/overlays/production | less + ``` + +- Check the health of the running stack: + + ```bash + kubectl get pods,svc,pvc -n observability + kubectl logs -n observability deploy/otelcol + ``` + +If you change any of the configuration files under `grafana/`, `otel-collector/`, `tempo/`, `loki/`, or `prometheus/`, copy the edits into `k8s/base/files` to keep the Kubernetes ConfigMaps aligned with the compose setup. 
+ ## Additional Notes - The load generator service will automatically start generating traffic to the FastAPI application diff --git a/docs/k8s-manifests.md b/docs/k8s-manifests.md new file mode 100644 index 0000000..6783bed --- /dev/null +++ b/docs/k8s-manifests.md @@ -0,0 +1,111 @@ +# Kubernetes Manifests Reference + +This document explains how the manifests under `k8s/` assemble the OpenTelemetry demo stack, with links to the relevant official documentation for each Kubernetes feature that is used. If you are new to Kubernetes, use the linked resources to dive deeper into the concepts before modifying the manifests. + +## Layout overview + +| Path | Purpose | Key docs | +| ---- | ------- | -------- | +| `k8s/base/kustomization.yaml` | Defines the reusable base using [Kustomize](https://kubernetes.io/docs/tasks/manage-kubernetes-objects/kustomization/). | [Kustomize overview](https://kubernetes.io/docs/tasks/manage-kubernetes-objects/kustomization/) | +| `k8s/base/*.yaml` | Deployments, Services, and PersistentVolumeClaims for Grafana, Loki, Tempo, Prometheus, the OpenTelemetry Collector, the FastAPI app, and the load generator. | [Deployments](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/), [Services](https://kubernetes.io/docs/concepts/services-networking/service/), [PersistentVolumeClaims](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#persistentvolumeclaims) | +| `k8s/base/files/` | ConfigMap content checked into source control so the Kubernetes stack mirrors docker-compose. | [ConfigMaps](https://kubernetes.io/docs/concepts/configuration/configmap/) | +| `k8s/overlays/local` | Development overlay that replaces container pull behaviour for locally built images. | [Image pull policy](https://kubernetes.io/docs/concepts/containers/images/#image-pull-policy) | +| `k8s/overlays/production` | Production-oriented overlay that adds storage classes, resource limits, LoadBalancer Services, and Ingress resources. 
| [Storage classes](https://kubernetes.io/docs/concepts/storage/storage-classes/), [Resource requests & limits](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/), [Services type LoadBalancer](https://kubernetes.io/docs/concepts/services-networking/service/#loadbalancer), [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) | + +## Namespace + +`k8s/base/namespace.yaml` creates the `observability` namespace and labels it with `app.kubernetes.io/part-of=otel-lgtm-mvp`. Namespaces logically separate workloads inside a cluster—see the [Namespaces documentation](https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/) for more background. + +## Configuration data + +`k8s/base/kustomization.yaml` uses `configMapGenerator` to capture the Grafana provisioning bundles, Prometheus scrape configuration, Loki configuration, Tempo configuration, and OpenTelemetry Collector pipeline under version control. The `grafana-admin` Secret is created with `secretGenerator` so credentials can be overridden easily. Review: + +- [ConfigMap design](https://kubernetes.io/docs/concepts/configuration/configmap/) +- [Secret management](https://kubernetes.io/docs/concepts/configuration/secret/) + +The generated ConfigMaps are mounted read-only inside pods to prevent accidental drift from the source repository. + +## Persistent storage + +Four components persist state: Grafana dashboards, Loki data, Tempo data, and Prometheus time series. Each component declares a `PersistentVolumeClaim` with `ReadWriteOnce` access (suitable for single-node clusters) and a modest storage request. See the [PersistentVolume documentation](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) for details on how the cluster satisfies these claims. Production overlays can override `storageClassName` to match the storage backend provided by your cloud or on-premises installation. 
+ +## Workloads and services + +Each component is deployed via a `Deployment` and fronted by a `ClusterIP` Service for stable in-cluster discovery. + +### Grafana (`k8s/base/grafana.yaml`) + +- Deployment runs `grafana/grafana:12.1.1` with HTTP health probes (`/api/health`), referencing [container probes](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/). +- Mounts ConfigMaps for provisioning and dashboards as read-only volumes and attaches the `grafana-storage` PersistentVolumeClaim for stateful data. +- Environment variables mirror the docker-compose `.env` defaults. Credentials are sourced from the `grafana-admin` Secret. +- Exposes port `3000` via a ClusterIP Service. See [Service basics](https://kubernetes.io/docs/concepts/services-networking/service/#defining-a-service). + +### Loki (`k8s/base/loki.yaml`) + +- Single-replica Deployment using `grafana/loki:3.5.0` with ConfigMap-backed configuration and a PVC for object storage substitutes. +- The Service exposes Loki on port `3100`. Loki uses local filesystem storage, matching the compose demo; consult the [Loki documentation](https://grafana.com/docs/loki/latest/) when adjusting the config file under `k8s/base/files/loki/`. + +### Tempo (`k8s/base/tempo.yaml`) + +- Runs `grafana/tempo:2.8.2` with HTTP ingestion on port `3200` and persistent storage at `/var/tempo`. +- The metrics generator includes `external_labels` to tag exported metrics with `cluster: demo` so dashboards can distinguish environments. +- For configuration details, reference the [Tempo documentation](https://grafana.com/docs/tempo/latest/). + +### Prometheus (`k8s/base/prometheus.yaml`) + +- Deploys `prom/prometheus:v2.53.5` with a single scrape config aimed at the OpenTelemetry Collector and a one-day retention window. +- Stores data in the `prom-data` PVC. 
+- See the [Prometheus Helm chart values](https://prometheus.io/docs/prometheus/latest/getting_started/) and Kubernetes [Prometheus operator docs](https://github.com/prometheus-operator/prometheus-operator) for further tuning ideas. + +### OpenTelemetry Collector (`k8s/base/otelcol.yaml`) + +- Uses `otel/opentelemetry-collector-contrib:0.133.0` to receive OTLP traffic on gRPC/HTTP and expose Prometheus metrics on port `8889`. +- The configuration shipped in `k8s/base/files/otel-collector/otelcol-config.yml` matches the docker-compose example. Review the [OpenTelemetry Collector configuration guide](https://opentelemetry.io/docs/collector/configuration/) prior to editing pipelines. + +### Demo application (`k8s/base/space-app.yaml`) + +- Deployment references a published container image (`ghcr.io/hyzhak/otel-lgtm-mvp/space-app:latest`) by default, with probes hitting `/` to check health. +- Resource attributes define the service namespace/version, mirroring the compose stack. The local overlay swaps the imagePullPolicy to `Never` so locally built images can be used without pushing to a registry. + +### Load generator (`k8s/base/loadgen.yaml`) + +- Runs a slim Python container that continuously exercises the demo API. Resource requests keep CPU and memory footprint low. +- For resource sizing background, see [Managing Resources for Containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/). + +## Overlays + +### Local overlay (`k8s/overlays/local`) + +- Inherits the base resources and replaces `imagePullPolicy` with `Never` for the app and load generator, so you can build images locally and run them without a registry. +- The overlay also rewrites the image names to bare tags, matching the names used in `docker build` / `podman build` (configured via the `images` section of the kustomization file). 
+ +### Production overlay (`k8s/overlays/production`) + +- Adds `storageClassName: gp3` (edit to match your storage provisioner) for all PVCs. See [Storage classes](https://kubernetes.io/docs/concepts/storage/storage-classes/) to choose the correct value for your environment. +- Patches Deployments with resource requests/limits to aid scheduling and enforce quotas. Review the [requests and limits guide](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/). +- Promotes Grafana to a `LoadBalancer` Service using AWS annotations as an example; adapt those annotations to your cloud provider (refer to the [Service type LoadBalancer](https://kubernetes.io/docs/concepts/services-networking/service/#loadbalancer) documentation). +- `ingress.yaml` defines two Ingress resources with placeholder hostnames and TLS secrets. Replace them with your actual DNS names as documented in the [Ingress basics](https://kubernetes.io/docs/concepts/services-networking/ingress/) guide. + +## Supporting configuration files + +The files under `k8s/base/files/` are copied directly from the docker-compose stack: + +- Grafana provisioning (`grafana/provisioning/*`) – see the [Grafana provisioning reference](https://grafana.com/docs/grafana/latest/administration/provisioning/). +- Grafana dashboards (`grafana/dashboards/otel_mvp.json`). +- Loki configuration (`loki/loki-config.yml`) – consult the [Loki configuration reference](https://grafana.com/docs/loki/latest/configuration/). +- Tempo configuration (`tempo/tempo-config.yml`). +- Prometheus scrape config (`prometheus/prometheus.yml`) – refer to the [Prometheus configuration docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/). +- OpenTelemetry Collector pipeline (`otel-collector/otelcol-config.yml`). + +Keeping these files in one place guarantees that the Kubernetes deployment and docker-compose stack share identical settings. 
+
+## Helper commands
+
+The `Makefile` exposes the most common commands:
+
+- `make k8s-apply-local` / `make k8s-delete-local`
+- `make k8s-apply-production` / `make k8s-delete-production`
+
+Each target wraps the corresponding `kubectl apply -k` or `kubectl delete -k` command described in the [kustomize CLI reference](https://kubectl.docs.kubernetes.io/references/kustomize/kustomize/).
+
+Use `kubectl kustomize k8s/overlays/<overlay>` (for example `k8s/overlays/local`) to preview changes before applying them, and remember to update both the docker-compose files and `k8s/base/files` when you modify service configurations.
diff --git a/k8s/base/files/grafana/dashboards/otel_mvp.json b/k8s/base/files/grafana/dashboards/otel_mvp.json
new file mode 100644
index 0000000..dd32963
--- /dev/null
+++ b/k8s/base/files/grafana/dashboards/otel_mvp.json
@@ -0,0 +1,51 @@
+{
+  "schemaVersion": 39,
+  "title": "OTel MVP – Logs • Traces • Metrics",
+  "panels": [
+    {
+      "type": "timeseries",
+      "title": "Requests /s",
+      "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
+      "targets": [
+        {
+          "datasource": {"type": "prometheus", "uid": "prometheus"},
+          "expr": "sum(rate(app_requests_total[1m]))",
+          "legendFormat": "req/s"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Latency p95 (ms)",
+      "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
+      "targets": [
+        {
+          "datasource": {"type": "prometheus", "uid": "prometheus"},
+          "expr": "histogram_quantile(0.95, sum(rate(app_request_duration_ms_bucket[5m])) by (le))",
+          "legendFormat": "p95"
+        }
+      ]
+    },
+    {
+      "type": "stat",
+      "title": "Errors /min",
+      "gridPos": {"x": 0, "y": 8, "w": 6, "h": 6},
+      "targets": [
+        {
+          "datasource": {"type": "prometheus", "uid": "prometheus"},
+          "expr": "sum(rate(app_request_errors_total[1m])) * 60"
+        }
+      ]
+    },
+    {
+      "type": "logs",
+      "title": "Logs (space-app)",
+      "gridPos": {"x": 6, "y": 8, "w": 18, "h": 12},
+      "datasource": {"type": "loki", "uid": "loki"},
+      "targets": [
+        {"expr": "{service_name=\"space-app\"}"}
+      ]
+    }
+  ],
+  "time": {"from": "now-2h", 
"to": "now"} +} \ No newline at end of file diff --git a/k8s/base/files/grafana/provisioning/alerting/alerts.yml b/k8s/base/files/grafana/provisioning/alerting/alerts.yml new file mode 100644 index 0000000..8014559 --- /dev/null +++ b/k8s/base/files/grafana/provisioning/alerting/alerts.yml @@ -0,0 +1,50 @@ +apiVersion: 1 + +contactPoints: + - orgId: 1 + name: console + receivers: + - uid: console + type: webhook + settings: + url: http://localhost:65535/nowhere + +policies: + - orgId: 1 + receiver: console + +rules: + - orgId: 1 + name: High error rate (space-app) + folder: Alerts + interval: 30s + condition: B + data: + - refId: A + datasourceUid: prometheus + relativeTimeRange: + from: 600 + to: 0 + model: + expr: sum(rate(app_request_errors_total[1m])) + interval: "" + legendFormat: "errors/s" + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: threshold + expression: A + conditions: + - evaluator: + params: [0.0167] + type: gt + operator: + type: and + reducer: + type: last + type: query + for: 2m + annotations: + summary: "Error rate >= 1/min over 2m" + description: "space-app is erroring. Check /error endpoint & logs." 
\ No newline at end of file diff --git a/k8s/base/files/grafana/provisioning/dashboards/dashboards.yml b/k8s/base/files/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000..833cef6 --- /dev/null +++ b/k8s/base/files/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,9 @@ +apiVersion: 1 +providers: + - name: otel-mvp + orgId: 1 + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards \ No newline at end of file diff --git a/k8s/base/files/grafana/provisioning/datasources/datasources.yml b/k8s/base/files/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 0000000..650bacc --- /dev/null +++ b/k8s/base/files/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,37 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + uid: prometheus + + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + uid: loki + jsonData: + derivedFields: + - name: traceID + matcherRegex: "trace_id=(\\w+)" + datasourceUid: tempo + urlDisplayLabel: "View trace" + + - name: Tempo + type: tempo + access: proxy + url: http://tempo:3200 + uid: tempo + jsonData: + nodeGraph: + enabled: true + tracesToLogsV2: + datasourceUid: loki + spanStartTimeShift: "-10m" + spanEndTimeShift: "10m" + filterByTraceID: true + useGRPC: false # 👈 force HTTP + # (optional) httpMethod: GET diff --git a/k8s/base/files/loki/loki-config.yml b/k8s/base/files/loki/loki-config.yml new file mode 100644 index 0000000..5993ec9 --- /dev/null +++ b/k8s/base/files/loki/loki-config.yml @@ -0,0 +1,49 @@ +# Minimal single-binary Loki 3.x for local filesystem storage + TSDB index + +auth_enabled: false + +common: + path_prefix: /loki # satisfies TSDB index requirements + +server: + http_listen_port: 3100 + grpc_listen_port: 0 + +ingester: + lifecycler: + ring: + kvstore: + store: inmemory + replication_factor: 1 + 
chunk_idle_period: 1h + chunk_target_size: 1536000 + # NOTE: removed: max_transfer_retries (deprecated/invalid) + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + filesystem: + directory: /loki/chunks + tsdb_shipper: + active_index_directory: /loki/tsdb-shipper-active + cache_location: /loki/tsdb-shipper-cache + # NOTE: do not set shared_store here; not needed for filesystem + +limits_config: + allow_structured_metadata: true + volume_enabled: true # 👈 required for Log Volume UI to work + +compactor: + working_directory: /loki/compactor + # NOTE: removed: shared_store (deprecated/invalid) + compactor_ring: + kvstore: + store: inmemory diff --git a/k8s/base/files/otel-collector/otelcol-config.yml b/k8s/base/files/otel-collector/otelcol-config.yml new file mode 100644 index 0000000..4976d00 --- /dev/null +++ b/k8s/base/files/otel-collector/otelcol-config.yml @@ -0,0 +1,51 @@ +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + grpc: + endpoint: 0.0.0.0:4317 + +processors: + memory_limiter: + check_interval: 5s + limit_percentage: 75 + spike_limit_percentage: 15 + batch: + timeout: 2s + send_batch_size: 8192 + resource: + attributes: + - key: service.namespace + action: upsert + value: demo + +exporters: + prometheus: + endpoint: 0.0.0.0:8889 + const_labels: + pipeline: otel-to-prom + + otlphttp/tempo: + endpoint: http://tempo:4318 + + otlphttp/loki: + endpoint: http://loki:3100/otlp + + debug: + verbosity: basic + +service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch, resource] + exporters: [otlphttp/tempo] + metrics: + receivers: [otlp] + processors: [memory_limiter, batch, resource] + exporters: [prometheus] + logs: + receivers: [otlp] + processors: [memory_limiter, batch, resource] + exporters: [otlphttp/loki] \ No newline at end of file diff --git a/k8s/base/files/prometheus/prometheus.yml 
b/k8s/base/files/prometheus/prometheus.yml
new file mode 100644
index 0000000..4544f6c
--- /dev/null
+++ b/k8s/base/files/prometheus/prometheus.yml
@@ -0,0 +1,8 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+scrape_configs:
+  - job_name: otelcol
+    static_configs:
+      - targets: ["otelcol:8889"]
\ No newline at end of file
diff --git a/k8s/base/files/tempo/tempo-config.yml b/k8s/base/files/tempo/tempo-config.yml
new file mode 100644
index 0000000..69d3bf6
--- /dev/null
+++ b/k8s/base/files/tempo/tempo-config.yml
@@ -0,0 +1,63 @@
+stream_over_http_enabled: true # Enables streaming traces over HTTP for better performance in high-throughput scenarios (in production evaluate grpc vs http and enable TLS/auth)
+
+server:
+  http_listen_port: 3200 # Tempo HTTP port for queries and ingestion (change if exposing externally; bind/port should follow infra policies)
+  log_level: info
+
+distributor:
+  receivers:
+    otlp: # Receives OTLP traces from collectors/agents (prefer grpc for high volume; secure endpoints in prod)
+      protocols:
+        grpc:
+          endpoint: "0.0.0.0:4317" # listen on all interfaces: in Kubernetes the name "tempo" is a Service IP the pod cannot bind; use TLS/auth in production
+        http:
+          endpoint: "0.0.0.0:4318" # HTTP OTLP endpoint on all interfaces; disable if not used to reduce attack surface
+
+storage:
+  trace:
+    backend: local # Uses local filesystem for trace storage (demo). For production prefer object storage (s3/gcs/azure) for scalability and durability.
+    wal:
+      path: /var/tempo/wal # Write-ahead log for durability (ensure proper disk provisioning and rotation in prod)
+    local:
+      path: /var/tempo/blocks # Local storage path for trace blocks (demo). In production use cloud/remote block storage.
+
+
+ingester:
+  max_block_duration: 5m # Cuts blocks every 5m for demo (production typically uses much longer, e.g., 1h+ to reduce compaction overhead)
+
+compactor:
+  compaction:
+    block_retention: 24h # Retains traces for 24h in demo. Set retention in production according to compliance/SLAs (days->months) and storage costs.
+ +query_frontend: + search: # SLOs for query performance – tune per workload to meet latency/throughput targets + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 + metadata_slo: + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 + trace_by_id: + duration_slo: 5s # Query-by-ID latency SLO (adjust lower in production for faster lookups) + +metrics_generator: + registry: + external_labels: # Labels attached to generated metrics (use meaningful cluster/service labels in prod) + source: tempo + cluster: demo + storage: + path: /var/tempo/generator/wal + remote_write: # Sends generated metrics to Prometheus via remote_write (ensure correct Prometheus URL and auth in prod) + - url: http://prometheus:9090/api/v1/write + send_exemplars: true + traces_storage: + path: /var/tempo/generator/traces + +memberlist: + join_members: # Clustering for distributed setup; update with real peers/advertise_address and secure configs in production + - tempo:7946 + +overrides: + defaults: + metrics_generator: + processors: [service-graphs, span-metrics, local-blocks] # Enables service-graphs, span-metrics, local-blocks (these increase CPU/memory; enable selectively in prod) + generate_native_histograms: both diff --git a/k8s/base/grafana.yaml b/k8s/base/grafana.yaml new file mode 100644 index 0000000..bdbcac1 --- /dev/null +++ b/k8s/base/grafana.yaml @@ -0,0 +1,99 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: grafana-storage + labels: + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + volumeMode: Filesystem +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana + labels: + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: otel-lgtm-mvp + template: + metadata: + labels: + app.kubernetes.io/name: 
grafana + app.kubernetes.io/part-of: otel-lgtm-mvp + spec: + containers: + - name: grafana + image: grafana/grafana:12.1.1 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 3000 + name: http + envFrom: + - secretRef: + name: grafana-admin + env: + - name: GF_USERS_ALLOW_SIGN_UP + value: "false" + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "true" + - name: GF_AUTH_ANONYMOUS_ORG_ROLE + value: Viewer + volumeMounts: + - name: grafana-provisioning + mountPath: /etc/grafana/provisioning + readOnly: true + - name: grafana-dashboards + mountPath: /var/lib/grafana/dashboards + readOnly: true + - name: grafana-storage + mountPath: /var/lib/grafana + livenessProbe: + httpGet: + path: /api/health + port: http + initialDelaySeconds: 30 + periodSeconds: 30 + readinessProbe: + httpGet: + path: /api/health + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + volumes: + - name: grafana-provisioning + configMap: + name: grafana-provisioning + - name: grafana-dashboards + configMap: + name: grafana-dashboards + - name: grafana-storage + persistentVolumeClaim: + claimName: grafana-storage +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana + labels: + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: otel-lgtm-mvp + ports: + - name: http + port: 3000 + targetPort: http diff --git a/k8s/base/kustomization.yaml b/k8s/base/kustomization.yaml new file mode 100644 index 0000000..7489474 --- /dev/null +++ b/k8s/base/kustomization.yaml @@ -0,0 +1,45 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: observability + +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: + - name: grafana-provisioning + files: + - files/grafana/provisioning/datasources/datasources.yml + - files/grafana/provisioning/alerting/alerts.yml + - files/grafana/provisioning/dashboards/dashboards.yml + - name: 
grafana-dashboards + files: + - files/grafana/dashboards/otel_mvp.json + - name: loki-config + files: + - files/loki/loki-config.yml + - name: tempo-config + files: + - files/tempo/tempo-config.yml + - name: prometheus-config + files: + - files/prometheus/prometheus.yml + - name: otelcol-config + files: + - files/otel-collector/otelcol-config.yml + +secretGenerator: + - name: grafana-admin + literals: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + +resources: + - namespace.yaml + - grafana.yaml + - loki.yaml + - tempo.yaml + - prometheus.yaml + - otelcol.yaml + - space-app.yaml + - loadgen.yaml diff --git a/k8s/base/loadgen.yaml b/k8s/base/loadgen.yaml new file mode 100644 index 0000000..3ef8e45 --- /dev/null +++ b/k8s/base/loadgen.yaml @@ -0,0 +1,33 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: loadgen + labels: + app.kubernetes.io/name: loadgen + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: loadgen + app.kubernetes.io/part-of: otel-lgtm-mvp + template: + metadata: + labels: + app.kubernetes.io/name: loadgen + app.kubernetes.io/part-of: otel-lgtm-mvp + spec: + containers: + - name: loadgen + image: ghcr.io/hyzhak/otel-lgtm-mvp/loadgen:latest + imagePullPolicy: IfNotPresent + env: + - name: TARGET_BASE_URL + value: http://space-app:8000 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi diff --git a/k8s/base/loki.yaml b/k8s/base/loki.yaml new file mode 100644 index 0000000..d92de26 --- /dev/null +++ b/k8s/base/loki.yaml @@ -0,0 +1,73 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: loki-data + labels: + app.kubernetes.io/name: loki + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + volumeMode: Filesystem +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: loki + labels: + app.kubernetes.io/name: loki + 
app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: loki + app.kubernetes.io/part-of: otel-lgtm-mvp + template: + metadata: + labels: + app.kubernetes.io/name: loki + app.kubernetes.io/part-of: otel-lgtm-mvp + spec: + containers: + - name: loki + image: grafana/loki:3.5.0 + imagePullPolicy: IfNotPresent + args: + - "-config.file=/etc/loki/config/loki-config.yml" + ports: + - containerPort: 3100 + name: http + volumeMounts: + - name: loki-config + mountPath: /etc/loki/config + readOnly: true + - name: loki-data + mountPath: /loki + volumes: + - name: loki-config + configMap: + name: loki-config + - name: loki-data + persistentVolumeClaim: + claimName: loki-data +--- +apiVersion: v1 +kind: Service +metadata: + name: loki + labels: + app.kubernetes.io/name: loki + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: loki + app.kubernetes.io/part-of: otel-lgtm-mvp + ports: + - name: http + port: 3100 + targetPort: http diff --git a/k8s/base/namespace.yaml b/k8s/base/namespace.yaml new file mode 100644 index 0000000..a86729f --- /dev/null +++ b/k8s/base/namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: observability + labels: + app.kubernetes.io/part-of: otel-lgtm-mvp diff --git a/k8s/base/otelcol.yaml b/k8s/base/otelcol.yaml new file mode 100644 index 0000000..e5ee199 --- /dev/null +++ b/k8s/base/otelcol.yaml @@ -0,0 +1,62 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otelcol + labels: + app.kubernetes.io/name: otelcol + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: otelcol + app.kubernetes.io/part-of: otel-lgtm-mvp + template: + metadata: + labels: + app.kubernetes.io/name: otelcol + app.kubernetes.io/part-of: otel-lgtm-mvp + spec: + containers: + - name: otelcol + image: otel/opentelemetry-collector-contrib:0.133.0 + imagePullPolicy: 
IfNotPresent + args: + - "--config=/etc/otelcol/otelcol-config.yml" + ports: + - containerPort: 4317 + name: otlp-grpc + - containerPort: 4318 + name: otlp-http + - containerPort: 8889 + name: metrics + volumeMounts: + - name: otelcol-config + mountPath: /etc/otelcol + readOnly: true + volumes: + - name: otelcol-config + configMap: + name: otelcol-config +--- +apiVersion: v1 +kind: Service +metadata: + name: otelcol + labels: + app.kubernetes.io/name: otelcol + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + selector: + app.kubernetes.io/name: otelcol + app.kubernetes.io/part-of: otel-lgtm-mvp + ports: + - name: otlp-grpc + port: 4317 + targetPort: otlp-grpc + - name: otlp-http + port: 4318 + targetPort: otlp-http + - name: metrics + port: 8889 + targetPort: metrics diff --git a/k8s/base/prometheus.yaml b/k8s/base/prometheus.yaml new file mode 100644 index 0000000..7ab9252 --- /dev/null +++ b/k8s/base/prometheus.yaml @@ -0,0 +1,75 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: prom-data + labels: + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + volumeMode: Filesystem +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + labels: + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: otel-lgtm-mvp + template: + metadata: + labels: + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: otel-lgtm-mvp + spec: + containers: + - name: prometheus + image: prom/prometheus:v2.53.5 + imagePullPolicy: IfNotPresent + args: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.retention.time=1d" + - "--web.enable-remote-write-receiver" # required: Tempo's metrics_generator remote_writes to /api/v1/write, which Prometheus rejects without this flag + ports: + - containerPort: 9090 + name: http + volumeMounts: + - name: prometheus-config + mountPath: /etc/prometheus + readOnly: true + - name: prom-data + 
mountPath: /prometheus + volumes: + - name: prometheus-config + configMap: + name: prometheus-config + - name: prom-data + persistentVolumeClaim: + claimName: prom-data +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + labels: + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: otel-lgtm-mvp + ports: + - name: http + port: 9090 + targetPort: http diff --git a/k8s/base/space-app.yaml b/k8s/base/space-app.yaml new file mode 100644 index 0000000..2df6ca5 --- /dev/null +++ b/k8s/base/space-app.yaml @@ -0,0 +1,74 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: space-app + labels: + app.kubernetes.io/name: space-app + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: space-app + app.kubernetes.io/part-of: otel-lgtm-mvp + template: + metadata: + labels: + app.kubernetes.io/name: space-app + app.kubernetes.io/part-of: otel-lgtm-mvp + spec: + containers: + - name: space-app + image: ghcr.io/hyzhak/otel-lgtm-mvp/space-app:latest + imagePullPolicy: IfNotPresent + ports: + - containerPort: 8000 + name: http + env: + - name: OTEL_SERVICE_NAME + value: space-app + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: OTEL_RESOURCE_ATTRIBUTES + value: service.version=1.0.0,service.namespace=demo,service.instance.id=$(POD_NAME) + - name: OTEL_EXPORTER_OTLP_PROTOCOL + value: http/protobuf + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: http://otelcol:4318 + - name: OTEL_METRICS_EXPORTER + value: otlp + - name: OTEL_TRACES_EXPORTER + value: otlp + - name: OTEL_LOGS_EXPORTER + value: otlp + readinessProbe: + httpGet: + path: / + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + livenessProbe: + httpGet: + path: / + port: http + initialDelaySeconds: 30 + periodSeconds: 30 +--- +apiVersion: v1 +kind: Service +metadata: + name: 
space-app + labels: + app.kubernetes.io/name: space-app + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: space-app + app.kubernetes.io/part-of: otel-lgtm-mvp + ports: + - name: http + port: 8000 + targetPort: http diff --git a/k8s/base/tempo.yaml b/k8s/base/tempo.yaml new file mode 100644 index 0000000..4fc8951 --- /dev/null +++ b/k8s/base/tempo.yaml @@ -0,0 +1,73 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: tempo-data + labels: + app.kubernetes.io/name: tempo + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + volumeMode: Filesystem +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tempo + labels: + app.kubernetes.io/name: tempo + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: tempo + app.kubernetes.io/part-of: otel-lgtm-mvp + template: + metadata: + labels: + app.kubernetes.io/name: tempo + app.kubernetes.io/part-of: otel-lgtm-mvp + spec: + containers: + - name: tempo + image: grafana/tempo:2.8.2 + imagePullPolicy: IfNotPresent + args: + - "-config.file=/etc/tempo/config/tempo-config.yml" + ports: + - containerPort: 3200 + name: http + volumeMounts: + - name: tempo-config + mountPath: /etc/tempo/config + readOnly: true + - name: tempo-data + mountPath: /var/tempo + volumes: + - name: tempo-config + configMap: + name: tempo-config + - name: tempo-data + persistentVolumeClaim: + claimName: tempo-data +--- +apiVersion: v1 +kind: Service +metadata: + name: tempo + labels: + app.kubernetes.io/name: tempo + app.kubernetes.io/part-of: otel-lgtm-mvp +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: tempo + app.kubernetes.io/part-of: otel-lgtm-mvp + ports: + - name: http + port: 3200 + targetPort: http diff --git a/k8s/overlays/local/kustomization.yaml b/k8s/overlays/local/kustomization.yaml new file mode 100644 index 
0000000..ecac7d5 --- /dev/null +++ b/k8s/overlays/local/kustomization.yaml @@ -0,0 +1,18 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: observability + +resources: + - ../../base + +images: + - name: ghcr.io/hyzhak/otel-lgtm-mvp/space-app + newName: space-app + newTag: latest + - name: ghcr.io/hyzhak/otel-lgtm-mvp/loadgen + newName: loadgen + newTag: latest + +patches: + - path: patches/image-pull-policy.yaml diff --git a/k8s/overlays/local/patches/image-pull-policy.yaml b/k8s/overlays/local/patches/image-pull-policy.yaml new file mode 100644 index 0000000..695a5a2 --- /dev/null +++ b/k8s/overlays/local/patches/image-pull-policy.yaml @@ -0,0 +1,21 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: space-app +spec: + template: + spec: + containers: + - name: space-app + imagePullPolicy: Never +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: loadgen +spec: + template: + spec: + containers: + - name: loadgen + imagePullPolicy: Never diff --git a/k8s/overlays/production/ingress.yaml b/k8s/overlays/production/ingress.yaml new file mode 100644 index 0000000..8f9514d --- /dev/null +++ b/k8s/overlays/production/ingress.yaml @@ -0,0 +1,48 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: grafana + annotations: + cert-manager.io/cluster-issuer: letsencrypt-production +# Replace example.com with real hostnames before deploying to a cluster. 
+spec: + ingressClassName: nginx + tls: + - hosts: + - grafana.example.com # placeholder hostname + secretName: grafana-tls + rules: + - host: grafana.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: grafana + port: + name: http +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: space-app + annotations: + cert-manager.io/cluster-issuer: letsencrypt-production +spec: + ingressClassName: nginx + tls: + - hosts: + - app.example.com # placeholder hostname + secretName: space-app-tls + rules: + - host: app.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: space-app + port: + name: http diff --git a/k8s/overlays/production/kustomization.yaml b/k8s/overlays/production/kustomization.yaml new file mode 100644 index 0000000..131cfea --- /dev/null +++ b/k8s/overlays/production/kustomization.yaml @@ -0,0 +1,13 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: observability + +resources: + - ../../base + - ingress.yaml + +patches: + - path: patches/storage-class.yaml + - path: patches/grafana-service.yaml + - path: patches/resource-limits.yaml diff --git a/k8s/overlays/production/patches/grafana-service.yaml b/k8s/overlays/production/patches/grafana-service.yaml new file mode 100644 index 0000000..32b97ac --- /dev/null +++ b/k8s/overlays/production/patches/grafana-service.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing +spec: + type: LoadBalancer diff --git a/k8s/overlays/production/patches/resource-limits.yaml b/k8s/overlays/production/patches/resource-limits.yaml new file mode 100644 index 0000000..d2fb3b0 --- /dev/null +++ b/k8s/overlays/production/patches/resource-limits.yaml @@ -0,0 +1,118 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana +spec: 
+ template: + spec: + containers: + - name: grafana + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 1 + memory: 1Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: loki +spec: + template: + spec: + containers: + - name: loki + resources: + requests: + cpu: 200m + memory: 1Gi + limits: + cpu: 1 + memory: 2Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tempo +spec: + template: + spec: + containers: + - name: tempo + resources: + requests: + cpu: 200m + memory: 1Gi + limits: + cpu: 1 + memory: 2Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus +spec: + template: + spec: + containers: + - name: prometheus + resources: + requests: + cpu: 200m + memory: 1Gi + limits: + cpu: 1 + memory: 2Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otelcol +spec: + template: + spec: + containers: + - name: otelcol + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: space-app +spec: + template: + spec: + containers: + - name: space-app + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: loadgen +spec: + template: + spec: + containers: + - name: loadgen + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi diff --git a/k8s/overlays/production/patches/storage-class.yaml b/k8s/overlays/production/patches/storage-class.yaml new file mode 100644 index 0000000..18eb897 --- /dev/null +++ b/k8s/overlays/production/patches/storage-class.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: grafana-storage +spec: + storageClassName: gp3 +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: loki-data +spec: + storageClassName: gp3 +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: tempo-data +spec: 
+ storageClassName: gp3 +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: prom-data +spec: + storageClassName: gp3