Skip to content

Commit 2cd2740

Browse files
committed
Add opentelemetry
Signed-off-by: David Kornel <kornys@outlook.com>
1 parent 88a6863 commit 2cd2740

9 files changed

Lines changed: 276 additions & 174 deletions

File tree

AGENTS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,10 @@ public class DiagnosticTools {
244244
- **Cancellation**: Checks `DiagnosticHelper.checkCancellation()` between steps
245245
- **No duplication**: Calls existing domain services (KafkaService, StrimziOperatorService, etc.)
246246
- **Configurable token limits**: `mcp.sampling.triage-max-tokens` and `mcp.sampling.analysis-max-tokens`
247+
- **OpenTelemetry tracing**: Gather/triage/analysis methods are annotated with `@WithSpan` for
248+
distributed tracing. Methods are package-private (not private) so Quarkus ArC subclass-based
249+
interception can intercept self-invocations from `diagnose()`. Span names follow the pattern
250+
`diagnose.{domain}.{step}` (e.g., `diagnose.cluster.status`, `diagnose.connectivity.triage`).
247251

248252
### DiagnosticHelper (common module)
249253

docs/strimzi-mcp/configuration.md

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,51 @@ MCP_SAMPLING_ANALYSIS_MAX_TOKENS=2000
228228

229229
See [Diagnostic tools](tools/diagnostics.md) for more information.
230230

231+
### OpenTelemetry tracing
232+
233+
Enable distributed tracing to observe diagnostic tool performance and debug slow responses.
234+
When enabled, the server exports traces via OTLP to a collector (e.g., Jaeger, Grafana Tempo).
235+
236+
| Property | Default | Description |
237+
|----------|---------|-------------|
238+
| `quarkus.otel.enabled` | `false` | Enable OpenTelemetry tracing |
239+
| `quarkus.otel.exporter.otlp.endpoint` | `http://localhost:4317` | OTLP collector endpoint (gRPC) |
240+
| `quarkus.otel.service.name` | `strimzi-mcp` | Service name in traces |
241+
| `quarkus.otel.propagators` | `tracecontext,baggage` | Context propagation formats |
242+
243+
**Auto-instrumented spans** (no configuration needed):
244+
245+
- HTTP server requests (incoming MCP calls)
246+
- REST client calls (Prometheus, Loki)
247+
- Kubernetes client HTTP calls (via OkHttp instrumentation)
248+
249+
**Custom diagnostic spans:**
250+
251+
Each diagnostic tool invocation produces child spans for every step:
252+
`diagnose.cluster.status`, `diagnose.cluster.node_pools`, `diagnose.cluster.pods`,
253+
`diagnose.cluster.triage`, `diagnose.cluster.analysis`, etc.
254+
255+
To enable with a Jaeger collector:
256+
257+
```bash
258+
QUARKUS_OTEL_ENABLED=true
259+
QUARKUS_OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger-collector:4317
260+
```
261+
262+
In Kubernetes:
263+
264+
```yaml
265+
apiVersion: v1
266+
kind: ConfigMap
267+
metadata:
268+
name: strimzi-mcp-config
269+
data:
270+
QUARKUS_OTEL_ENABLED: "true"
271+
QUARKUS_OTEL_EXPORTER_OTLP_ENDPOINT: "http://jaeger-collector.observability:4317"
272+
```
273+
274+
Tracing is disabled by default and enabled automatically in the `prod` profile.
275+
231276
### Resource watch configuration
232277

233278
Control Kubernetes resource watches that send MCP notifications when resources change.

strimzi-mcp/pom.xml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@
3030
<groupId>io.streamshub</groupId>
3131
<artifactId>streamshub-loki-log-provider</artifactId>
3232
</dependency>
33+
<dependency>
34+
<groupId>io.quarkus</groupId>
35+
<artifactId>quarkus-opentelemetry</artifactId>
36+
</dependency>
3337
<dependency>
3438
<groupId>io.quarkus</groupId>
3539
<artifactId>quarkus-container-image-jib</artifactId>

strimzi-mcp/src/main/java/io/streamshub/mcp/strimzi/service/KafkaClusterDiagnosticService.java

Lines changed: 67 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
package io.streamshub.mcp.strimzi.service;
66

77
import com.fasterxml.jackson.databind.ObjectMapper;
8+
import io.opentelemetry.instrumentation.annotations.WithSpan;
89
import io.quarkiverse.mcp.server.Cancellation;
910
import io.quarkiverse.mcp.server.Elicitation;
1011
import io.quarkiverse.mcp.server.McpLog;
@@ -210,11 +211,12 @@ public KafkaClusterDiagnosticReport diagnose(final String namespace,
210211

211212
// ---- Phase 1: Initial data gathering ----
212213

213-
private KafkaClusterResponse gatherClusterStatus(final String namespace,
214-
final String clusterName,
215-
final Elicitation elicitation,
216-
final List<String> completed,
217-
final McpLog mcpLog) {
214+
@WithSpan("diagnose.cluster.status")
215+
KafkaClusterResponse gatherClusterStatus(final String namespace,
216+
final String clusterName,
217+
final Elicitation elicitation,
218+
final List<String> completed,
219+
final McpLog mcpLog) {
218220
try {
219221
KafkaClusterResponse result = kafkaService.getCluster(namespace, clusterName);
220222
completed.add(STEP_CLUSTER_STATUS);
@@ -231,11 +233,12 @@ private KafkaClusterResponse gatherClusterStatus(final String namespace,
231233
}
232234
}
233235

234-
private List<KafkaNodePoolResponse> gatherNodePools(final String namespace,
235-
final String clusterName,
236-
final List<String> completed,
237-
final List<String> failed,
238-
final McpLog mcpLog) {
236+
@WithSpan("diagnose.cluster.node_pools")
237+
List<KafkaNodePoolResponse> gatherNodePools(final String namespace,
238+
final String clusterName,
239+
final List<String> completed,
240+
final List<String> failed,
241+
final McpLog mcpLog) {
239242
try {
240243
List<KafkaNodePoolResponse> result = nodePoolService.listNodePools(namespace, clusterName);
241244
completed.add(STEP_NODE_POOLS);
@@ -248,11 +251,12 @@ private List<KafkaNodePoolResponse> gatherNodePools(final String namespace,
248251
}
249252
}
250253

251-
private KafkaClusterPodsResponse gatherClusterPods(final String namespace,
252-
final String clusterName,
253-
final List<String> completed,
254-
final List<String> failed,
255-
final McpLog mcpLog) {
254+
@WithSpan("diagnose.cluster.pods")
255+
KafkaClusterPodsResponse gatherClusterPods(final String namespace,
256+
final String clusterName,
257+
final List<String> completed,
258+
final List<String> failed,
259+
final McpLog mcpLog) {
256260
try {
257261
KafkaClusterPodsResponse result = kafkaService.getClusterPods(namespace, clusterName);
258262
completed.add(STEP_POD_HEALTH);
@@ -267,10 +271,11 @@ private KafkaClusterPodsResponse gatherClusterPods(final String namespace,
267271

268272
// ---- Phase 2: Deep investigation ----
269273

270-
private StrimziOperatorResponse gatherOperatorStatus(final String namespace,
271-
final List<String> completed,
272-
final List<String> failed,
273-
final McpLog mcpLog) {
274+
@WithSpan("diagnose.cluster.operator_status")
275+
StrimziOperatorResponse gatherOperatorStatus(final String namespace,
276+
final List<String> completed,
277+
final List<String> failed,
278+
final McpLog mcpLog) {
274279
try {
275280
List<StrimziOperatorResponse> operators = operatorService.listOperators(namespace);
276281
if (!operators.isEmpty()) {
@@ -288,11 +293,12 @@ private StrimziOperatorResponse gatherOperatorStatus(final String namespace,
288293
}
289294
}
290295

291-
private StrimziOperatorLogsResponse gatherOperatorLogs(final String namespace,
292-
final Integer sinceMinutes,
293-
final List<String> completed,
294-
final List<String> failed,
295-
final McpLog mcpLog) {
296+
@WithSpan("diagnose.cluster.operator_logs")
297+
StrimziOperatorLogsResponse gatherOperatorLogs(final String namespace,
298+
final Integer sinceMinutes,
299+
final List<String> completed,
300+
final List<String> failed,
301+
final McpLog mcpLog) {
296302
try {
297303
StrimziOperatorLogsResponse result = operatorService.getOperatorLogs(
298304
namespace, null, buildErrorLogParams(sinceMinutes));
@@ -306,12 +312,13 @@ private StrimziOperatorLogsResponse gatherOperatorLogs(final String namespace,
306312
}
307313
}
308314

309-
private KafkaClusterLogsResponse gatherClusterLogs(final String namespace,
310-
final String clusterName,
311-
final Integer sinceMinutes,
312-
final List<String> completed,
313-
final List<String> failed,
314-
final McpLog mcpLog) {
315+
@WithSpan("diagnose.cluster.logs")
316+
KafkaClusterLogsResponse gatherClusterLogs(final String namespace,
317+
final String clusterName,
318+
final Integer sinceMinutes,
319+
final List<String> completed,
320+
final List<String> failed,
321+
final McpLog mcpLog) {
315322
try {
316323
KafkaClusterLogsResponse result = kafkaService.getClusterLogs(
317324
namespace, clusterName, buildErrorLogParams(sinceMinutes));
@@ -325,12 +332,13 @@ private KafkaClusterLogsResponse gatherClusterLogs(final String namespace,
325332
}
326333
}
327334

328-
private StrimziEventsResponse gatherEvents(final String namespace,
329-
final String clusterName,
330-
final Integer sinceMinutes,
331-
final List<String> completed,
332-
final List<String> failed,
333-
final McpLog mcpLog) {
335+
@WithSpan("diagnose.cluster.events")
336+
StrimziEventsResponse gatherEvents(final String namespace,
337+
final String clusterName,
338+
final Integer sinceMinutes,
339+
final List<String> completed,
340+
final List<String> failed,
341+
final McpLog mcpLog) {
334342
try {
335343
StrimziEventsResponse result = eventsService.getClusterEvents(
336344
namespace, clusterName, sinceMinutes);
@@ -344,11 +352,12 @@ private StrimziEventsResponse gatherEvents(final String namespace,
344352
}
345353
}
346354

347-
private KafkaMetricsResponse gatherMetrics(final String namespace,
348-
final String clusterName,
349-
final List<String> completed,
350-
final List<String> failed,
351-
final McpLog mcpLog) {
355+
@WithSpan("diagnose.cluster.metrics")
356+
KafkaMetricsResponse gatherMetrics(final String namespace,
357+
final String clusterName,
358+
final List<String> completed,
359+
final List<String> failed,
360+
final McpLog mcpLog) {
352361
try {
353362
KafkaMetricsResponse result = kafkaMetricsService.getKafkaMetrics(
354363
namespace, clusterName, "replication", null, null, null, null, null);
@@ -364,11 +373,12 @@ private KafkaMetricsResponse gatherMetrics(final String namespace,
364373

365374
// ---- Sampling: triage and analysis ----
366375

367-
private InvestigationAreas decideInvestigationAreas(final Sampling sampling,
368-
final KafkaClusterResponse cluster,
369-
final List<KafkaNodePoolResponse> nodePools,
370-
final KafkaClusterPodsResponse pods,
371-
final String symptom) {
376+
@WithSpan("diagnose.cluster.triage")
377+
InvestigationAreas decideInvestigationAreas(final Sampling sampling,
378+
final KafkaClusterResponse cluster,
379+
final List<KafkaNodePoolResponse> nodePools,
380+
final KafkaClusterPodsResponse pods,
381+
final String symptom) {
372382
if (sampling == null || !sampling.isSupported()) {
373383
return InvestigationAreas.all();
374384
}
@@ -392,17 +402,18 @@ private InvestigationAreas decideInvestigationAreas(final Sampling sampling,
392402
}
393403
}
394404

405+
@WithSpan("diagnose.cluster.analysis")
395406
@SuppressWarnings("checkstyle:ParameterNumber")
396-
private String produceAnalysis(final Sampling sampling,
397-
final KafkaClusterResponse cluster,
398-
final List<KafkaNodePoolResponse> nodePools,
399-
final KafkaClusterPodsResponse pods,
400-
final StrimziOperatorResponse operator,
401-
final StrimziOperatorLogsResponse operatorLogs,
402-
final KafkaClusterLogsResponse clusterLogs,
403-
final StrimziEventsResponse events,
404-
final KafkaMetricsResponse metrics,
405-
final String symptom) {
407+
String produceAnalysis(final Sampling sampling,
408+
final KafkaClusterResponse cluster,
409+
final List<KafkaNodePoolResponse> nodePools,
410+
final KafkaClusterPodsResponse pods,
411+
final StrimziOperatorResponse operator,
412+
final StrimziOperatorLogsResponse operatorLogs,
413+
final KafkaClusterLogsResponse clusterLogs,
414+
final StrimziEventsResponse events,
415+
final KafkaMetricsResponse metrics,
416+
final String symptom) {
406417
if (sampling == null || !sampling.isSupported()) {
407418
return null;
408419
}

0 commit comments

Comments
 (0)