Skip to content

Commit c799c30

Browse files
committed
feat: add complete infrastructure and monitoring stack
✅ Infrastructure Services: - Enable Redis deployment for GenAI Celery broker - Enable Weaviate deployment for vector database - Add proper lifecycle management and resource limits 📊 Monitoring Stack: - Add Prometheus for metrics collection - Add Grafana for dashboards with datasource provisioning - Add Loki for log aggregation - Add Promtail DaemonSet for log collection - Include proper RBAC and ConfigMaps for configuration 🔄 CI/CD Enhancements: - Add import strategy for all new services - Update retry logic for infrastructure and monitoring services - Add rollout status validation for all deployments and DaemonSets - Include targeted apply fallback for new services This completes the full docker-compose to Kubernetes migration with comprehensive monitoring.
1 parent 7f66ce7 commit c799c30

File tree

3 files changed

+696
-131
lines changed

3 files changed

+696
-131
lines changed

.github/workflows/server-ci-cd.yml

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,11 @@ jobs:
172172
import_if_exists "service" "gateway-service" "kubernetes_service.gateway"
173173
import_if_exists "service" "files-service" "kubernetes_service.files"
174174
import_if_exists "service" "genai-app-service" "kubernetes_service.genai_app"
175+
import_if_exists "service" "redis" "kubernetes_service.redis"
176+
import_if_exists "service" "weaviate-service" "kubernetes_service.weaviate"
177+
import_if_exists "service" "prometheus" "kubernetes_service.prometheus"
178+
import_if_exists "service" "grafana" "kubernetes_service.grafana"
179+
import_if_exists "service" "loki" "kubernetes_service.loki"
175180
176181
# Import deployments
177182
echo "=== Importing Deployments ==="
@@ -181,6 +186,15 @@ jobs:
181186
import_if_exists "deployment" "files-service" "kubernetes_deployment.files"
182187
import_if_exists "deployment" "genai-app" "kubernetes_deployment.genai_app"
183188
import_if_exists "deployment" "genai-celery-worker" "kubernetes_deployment.genai_celery_worker"
189+
import_if_exists "deployment" "redis" "kubernetes_deployment.genai_redis"
190+
import_if_exists "deployment" "weaviate" "kubernetes_deployment.weaviate"
191+
import_if_exists "deployment" "prometheus" "kubernetes_deployment.prometheus"
192+
import_if_exists "deployment" "grafana" "kubernetes_deployment.grafana"
193+
import_if_exists "deployment" "loki" "kubernetes_deployment.loki"
194+
195+
# Import daemonsets
196+
echo "=== Importing DaemonSets ==="
197+
import_if_exists "daemonset" "promtail" "kubernetes_daemon_set.promtail"
184198
185199
# Import secrets
186200
echo "=== Importing Secrets ==="
@@ -250,6 +264,11 @@ jobs:
250264
terraform import kubernetes_service.gateway developmentv1/gateway-service 2>/dev/null || true
251265
terraform import kubernetes_service.files developmentv1/files-service 2>/dev/null || true
252266
terraform import kubernetes_service.genai_app developmentv1/genai-app-service 2>/dev/null || true
267+
terraform import kubernetes_service.redis developmentv1/redis 2>/dev/null || true
268+
terraform import kubernetes_service.weaviate developmentv1/weaviate-service 2>/dev/null || true
269+
terraform import kubernetes_service.prometheus developmentv1/prometheus 2>/dev/null || true
270+
terraform import kubernetes_service.grafana developmentv1/grafana 2>/dev/null || true
271+
terraform import kubernetes_service.loki developmentv1/loki 2>/dev/null || true
253272
254273
# Try apply again after re-import
255274
echo "Retrying apply after import recovery..."
@@ -259,7 +278,7 @@ jobs:
259278
echo "❌ Apply still failing, using targeted approach..."
260279
261280
# Try applying specific resources that are known to work
262-
terraform apply -auto-approve -target=kubernetes_deployment.user -target=kubernetes_deployment.group -target=kubernetes_deployment.gateway -target=kubernetes_deployment.files -target=kubernetes_deployment.genai_app
281+
terraform apply -auto-approve -target=kubernetes_deployment.user -target=kubernetes_deployment.group -target=kubernetes_deployment.gateway -target=kubernetes_deployment.files -target=kubernetes_deployment.genai_app -target=kubernetes_deployment.genai_celery_worker -target=kubernetes_deployment.genai_redis -target=kubernetes_deployment.weaviate -target=kubernetes_deployment.prometheus -target=kubernetes_deployment.grafana -target=kubernetes_deployment.loki
263282
echo "✅ Targeted apply completed - some resources may need manual reconciliation"
264283
fi
265284
else
@@ -285,6 +304,14 @@ jobs:
285304
kubectl rollout status deployment/files-service -n developmentv1 --timeout=300s
286305
kubectl rollout status deployment/genai-app -n developmentv1 --timeout=300s
287306
kubectl rollout status deployment/genai-celery-worker -n developmentv1 --timeout=300s
307+
kubectl rollout status deployment/redis -n developmentv1 --timeout=300s || echo "Redis rollout check failed"
308+
kubectl rollout status deployment/weaviate -n developmentv1 --timeout=300s || echo "Weaviate rollout check failed"
309+
kubectl rollout status deployment/prometheus -n developmentv1 --timeout=300s || echo "Prometheus rollout check failed"
310+
kubectl rollout status deployment/grafana -n developmentv1 --timeout=300s || echo "Grafana rollout check failed"
311+
kubectl rollout status deployment/loki -n developmentv1 --timeout=300s || echo "Loki rollout check failed"
312+
313+
echo "Checking DaemonSet status..."
314+
kubectl rollout status daemonset/promtail -n developmentv1 --timeout=300s || echo "Promtail rollout check failed"
288315
289316
# Check pod health
290317
echo "Checking pod health..."

infra/genai.tf

Lines changed: 169 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -1,136 +1,175 @@
1-
# Commenting out Redis deployment - using existing Redis pod
2-
# resource "kubernetes_deployment" "genai_redis" {
3-
# metadata {
4-
# name = "genai-redis"
5-
# namespace = var.namespace
6-
# labels = { app = "genai-redis" }
7-
# }
8-
# spec {
9-
# replicas = 1
10-
# selector { match_labels = { app = "genai-redis" } }
11-
# template {
12-
# metadata { labels = { app = "genai-redis" } }
13-
# spec {
14-
# container {
15-
# name = "redis"
16-
# image = "redis:7-alpine"
17-
# port {
18-
# container_port = 6379
19-
# }
20-
# }
21-
# }
22-
# }
23-
# }
24-
# }
1+
# Redis deployment for GenAI Celery broker
2+
resource "kubernetes_deployment" "genai_redis" {
3+
metadata {
4+
name = "redis"
5+
namespace = var.namespace
6+
labels = { app = "redis" }
7+
}
8+
9+
lifecycle {
10+
ignore_changes = [
11+
metadata[0].generation,
12+
metadata[0].resource_version,
13+
metadata[0].uid,
14+
spec[0].template[0].metadata[0].annotations
15+
]
16+
}
17+
18+
spec {
19+
replicas = 1
20+
selector { match_labels = { app = "redis" } }
21+
template {
22+
metadata { labels = { app = "redis" } }
23+
spec {
24+
container {
25+
name = "redis"
26+
image = "redis:7-alpine"
27+
port {
28+
container_port = 6379
29+
}
30+
resources {
31+
requests = {
32+
memory = "256Mi"
33+
cpu = "100m"
34+
}
35+
limits = {
36+
memory = "512Mi"
37+
cpu = "500m"
38+
}
39+
}
40+
}
41+
}
42+
}
43+
}
44+
}
2545

26-
# Using existing Redis service instead of creating new one
27-
# resource "kubernetes_service" "genai_redis" {
28-
# metadata {
29-
# name = "genai-redis"
30-
# namespace = var.namespace
31-
# labels = { app = "genai-redis" }
32-
# }
33-
# spec {
34-
# selector = { app = "genai-redis" }
35-
# port {
36-
# port = 6379
37-
# target_port = 6379
38-
# }
39-
# type = "ClusterIP"
40-
# }
41-
# }
46+
resource "kubernetes_service" "redis" {
47+
metadata {
48+
name = "redis"
49+
namespace = var.namespace
50+
labels = { app = "redis" }
51+
}
52+
spec {
53+
selector = { app = "redis" }
54+
port {
55+
port = 6379
56+
target_port = 6379
57+
}
58+
type = "ClusterIP"
59+
}
60+
}
4261

43-
# Commenting out Weaviate deployment - using existing Weaviate pod
44-
# resource "kubernetes_deployment" "weaviate" {
45-
# metadata {
46-
# name = "genai-weaviate"
47-
# namespace = var.namespace
48-
# labels = { app = "genai-weaviate" }
49-
# }
50-
# spec {
51-
# replicas = 1
52-
# selector { match_labels = { app = "genai-weaviate" } }
53-
# template {
54-
# metadata { labels = { app = "genai-weaviate" } }
55-
# spec {
56-
# volume {
57-
# name = "weaviate-data"
58-
# empty_dir {}
59-
# }
60-
#
61-
# container {
62-
# name = "weaviate"
63-
# image = "semitechnologies/weaviate:1.23.7"
64-
# port {
65-
# container_port = 8080
66-
# }
67-
# port {
68-
# container_port = 50051
69-
# }
70-
#
71-
# volume_mount {
72-
# name = "weaviate-data"
73-
# mount_path = "/var/lib/weaviate"
74-
# }
75-
#
76-
# env_from {
77-
# secret_ref {
78-
# name = kubernetes_secret.openai_credentials.metadata[0].name
79-
# }
80-
# }
81-
#
82-
# env {
83-
# name = "QUERY_DEFAULTS_LIMIT"
84-
# value = "25"
85-
# }
86-
# env {
87-
# name = "AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED"
88-
# value = "true"
89-
# }
90-
# env {
91-
# name = "PERSISTENCE_DATA_PATH"
92-
# value = "/var/lib/weaviate"
93-
# }
94-
# env {
95-
# name = "DEFAULT_VECTORIZER_MODULE"
96-
# value = "text2vec-openai"
97-
# }
98-
# env {
99-
# name = "ENABLE_MODULES"
100-
# value = "text2vec-openai,generative-openai"
101-
# }
102-
# env {
103-
# name = "CLUSTER_HOSTNAME"
104-
# value = "node1"
105-
# }
106-
# }
107-
# }
108-
# }
109-
# }
110-
# }
62+
# Weaviate deployment for GenAI vector database
63+
resource "kubernetes_deployment" "weaviate" {
64+
metadata {
65+
name = "weaviate"
66+
namespace = var.namespace
67+
labels = { app = "weaviate" }
68+
}
69+
70+
lifecycle {
71+
ignore_changes = [
72+
metadata[0].generation,
73+
metadata[0].resource_version,
74+
metadata[0].uid,
75+
spec[0].template[0].metadata[0].annotations
76+
]
77+
}
78+
79+
spec {
80+
replicas = 1
81+
selector { match_labels = { app = "weaviate" } }
82+
template {
83+
metadata { labels = { app = "weaviate" } }
84+
spec {
85+
volume {
86+
name = "weaviate-data"
87+
empty_dir {}
88+
}
89+
90+
container {
91+
name = "weaviate"
92+
image = "semitechnologies/weaviate:1.23.7"
93+
port {
94+
container_port = 8080
95+
}
96+
port {
97+
container_port = 50051
98+
}
99+
100+
volume_mount {
101+
name = "weaviate-data"
102+
mount_path = "/var/lib/weaviate"
103+
}
104+
105+
env_from {
106+
secret_ref {
107+
name = kubernetes_secret.openai_credentials.metadata[0].name
108+
}
109+
}
110+
111+
env {
112+
name = "QUERY_DEFAULTS_LIMIT"
113+
value = "25"
114+
}
115+
env {
116+
name = "AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED"
117+
value = "true"
118+
}
119+
env {
120+
name = "PERSISTENCE_DATA_PATH"
121+
value = "/var/lib/weaviate"
122+
}
123+
env {
124+
name = "DEFAULT_VECTORIZER_MODULE"
125+
value = "text2vec-openai"
126+
}
127+
env {
128+
name = "ENABLE_MODULES"
129+
value = "text2vec-openai,generative-openai"
130+
}
131+
env {
132+
name = "CLUSTER_HOSTNAME"
133+
value = "node1"
134+
}
135+
136+
resources {
137+
requests = {
138+
memory = "1Gi"
139+
cpu = "500m"
140+
}
141+
limits = {
142+
memory = "2Gi"
143+
cpu = "1000m"
144+
}
145+
}
146+
}
147+
}
148+
}
149+
}
150+
}
111151

112-
# Using existing Weaviate service instead of creating new one
113-
# resource "kubernetes_service" "weaviate" {
114-
# metadata {
115-
# name = "genai-weaviate"
116-
# namespace = var.namespace
117-
# labels = { app = "genai-weaviate" }
118-
# }
119-
# spec {
120-
# selector = { app = "genai-weaviate" }
121-
# port {
122-
# name = "http"
123-
# port = 8080
124-
# target_port = 8080
125-
# }
126-
# port {
127-
# name = "grpc"
128-
# port = 50051
129-
# target_port = 50051
130-
# }
131-
# type = "ClusterIP"
132-
# }
133-
# }
152+
resource "kubernetes_service" "weaviate" {
153+
metadata {
154+
name = "weaviate-service"
155+
namespace = var.namespace
156+
labels = { app = "weaviate" }
157+
}
158+
spec {
159+
selector = { app = "weaviate" }
160+
port {
161+
name = "http"
162+
port = 8080
163+
target_port = 8080
164+
}
165+
port {
166+
name = "grpc"
167+
port = 50051
168+
target_port = 50051
169+
}
170+
type = "ClusterIP"
171+
}
172+
}
134173

135174
# GenAI App deployment
136175
resource "kubernetes_deployment" "genai_app" {

0 commit comments

Comments
(0)