diff --git a/src/main/java/com/uber/cadence/internal/metrics/HistogramBuckets.java b/src/main/java/com/uber/cadence/internal/metrics/HistogramBuckets.java new file mode 100644 index 000000000..8bf9390b4 --- /dev/null +++ b/src/main/java/com/uber/cadence/internal/metrics/HistogramBuckets.java @@ -0,0 +1,207 @@ +/* + * Copyright 2012-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Modifications copyright (C) 2017 Uber Technologies, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License is + * located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +package com.uber.cadence.internal.metrics; + +import com.uber.m3.tally.DurationBuckets; +import com.uber.m3.util.Duration; +import java.util.concurrent.TimeUnit; + +/** + * Histogram bucket configurations for timer metrics migration. + * + *

This class defines standard histogram bucket configurations used during the migration from + * timers to histograms. These buckets provide consistent granularity for measuring latencies across + * different time ranges. + * + *

Note: Unlike the Go client which uses subsettable exponential histograms with algorithmic + * bucket generation, the Java client uses explicit bucket definitions. We provide multiple + * configurations to balance between granularity and cardinality: + * + *

+ */ +public final class HistogramBuckets { + + /** + * Default bucket configuration for most client-side latency metrics. + * + *

Range: 1ms to 100s + * + *

Provides: - Fine-grained buckets (1ms steps) from 1ms to 10ms - Medium-grained buckets (10ms + * steps) from 10ms to 100ms - Coarser buckets (100ms steps) from 100ms to 1s - Second-level + * buckets from 1s to 100s + * + *

Use for: - Decision poll latency - Activity poll latency - Decision execution latency - + * Activity execution latency - Workflow replay latency - Most RPC call latencies + */ + public static final DurationBuckets DEFAULT_1MS_100S = + DurationBuckets.custom( + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)), // 1ms + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(2)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(3)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(4)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(5)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(6)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(7)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(8)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(9)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(20)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(30)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(40)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(50)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(60)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(70)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(80)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(90)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(200)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(300)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(400)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(500)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(600)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(700)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(800)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(900)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(2)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(3)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(4)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(5)), + 
Duration.ofNanos(TimeUnit.SECONDS.toNanos(6)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(7)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(8)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(9)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(20)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(30)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(40)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(50)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(60)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(70)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(80)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(90)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(100))); + + /** + * Low-resolution bucket configuration for high-cardinality metrics. + * + *

Range: 1ms to 100s (same as DEFAULT_1MS_100S but with fewer buckets) + * + *

Provides: - Coarser buckets with ~2x steps instead of fine-grained steps - Approximately + * half the cardinality of DEFAULT_1MS_100S + * + *

Use for: - Per-activity-type metrics where cardinality is high - Per-workflow-type metrics + * where cardinality is high - Metrics with many tag combinations + */ + public static final DurationBuckets LOW_1MS_100S = + DurationBuckets.custom( + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(2)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(5)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(20)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(50)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(200)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(500)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(2)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(5)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(20)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(50)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(100))); + + /** + * High-resolution bucket configuration for long-running operations. + * + *

Range: 1ms to 24 hours + * + *

Provides: - Fine-grained buckets from 1ms to 10ms - Medium-grained from 10ms to 1s - + * Second-level buckets from 1s to 10 minutes - Minute-level buckets from 10 minutes to 24 hours + * + *

Use for: - Workflow end-to-end latency - Long-running activity execution latency - Multi-day + * operation metrics + */ + public static final DurationBuckets HIGH_1MS_24H = + DurationBuckets.custom( + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(2)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(5)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(20)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(50)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(200)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(500)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(2)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(5)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(20)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(30)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(60)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(120)), // 2 min + Duration.ofNanos(TimeUnit.SECONDS.toNanos(300)), // 5 min + Duration.ofNanos(TimeUnit.SECONDS.toNanos(600)), // 10 min + Duration.ofNanos(TimeUnit.MINUTES.toNanos(20)), + Duration.ofNanos(TimeUnit.MINUTES.toNanos(30)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(1)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(2)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(4)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(8)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(12)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(24))); + + /** + * Medium-resolution bucket configuration for long-running operations. + * + *

Range: 1ms to 24 hours (same as HIGH_1MS_24H but with fewer buckets) + * + *

Provides: - Coarser buckets than HIGH_1MS_24H - Better for high-cardinality long-duration + * metrics + * + *

Use for: - When HIGH_1MS_24H's cardinality is too high - Per-workflow-type E2E latency with + * many workflow types + */ + public static final DurationBuckets MID_1MS_24H = + DurationBuckets.custom( + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(30)), + Duration.ofNanos(TimeUnit.MINUTES.toNanos(1)), + Duration.ofNanos(TimeUnit.MINUTES.toNanos(5)), + Duration.ofNanos(TimeUnit.MINUTES.toNanos(10)), + Duration.ofNanos(TimeUnit.MINUTES.toNanos(30)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(1)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(4)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(12)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(24))); + + private HistogramBuckets() { + // Utility class - prevent instantiation + } +} diff --git a/src/main/java/com/uber/cadence/internal/metrics/MIGRATION.md b/src/main/java/com/uber/cadence/internal/metrics/MIGRATION.md new file mode 100644 index 000000000..547e619bb --- /dev/null +++ b/src/main/java/com/uber/cadence/internal/metrics/MIGRATION.md @@ -0,0 +1,173 @@ +# Timer to Histogram Migration + +## Overview + +This document describes the migration from timer metrics to histogram metrics in the Cadence Java client. The migration uses a dual-emit pattern where **both timer and histogram metrics are always emitted**, allowing for gradual migration of dashboards and alerts without requiring a coordinated flag day. + +## Why Migrate? 
+ +Timers and histograms serve similar purposes (measuring latencies and durations) but have different characteristics: + +- **Timers**: Legacy approach, currently used throughout the codebase +- **Histograms**: More flexible, better support for custom buckets and percentile calculations + +## Migration Strategy + +### Phase 1: Dual Emission (Current) + +Both timer and histogram metrics are emitted simultaneously: + +```java +// Old code: +Stopwatch sw = scope.timer(MetricsType.DECISION_POLL_LATENCY).start(); +// ... do work ... +sw.stop(); + +// New code (dual emit): +DualStopwatch sw = MetricsEmit.startLatency( + scope, + MetricsType.DECISION_POLL_LATENCY, + HistogramBuckets.DEFAULT_1MS_100S +); +// ... do work ... +sw.stop(); // Records to BOTH timer and histogram +``` + +### Phase 2: Dashboard/Alert Migration (Next) + +Update all dashboards and alerts to use histogram metrics instead of timer metrics. This can be done gradually since both are being emitted. + +### Phase 3: Remove Timer Emission (Future) + +Once all dashboards/alerts are migrated, remove timer emission: + +```java +// Future code (histogram only): +Stopwatch sw = scope.histogram( + MetricsType.DECISION_POLL_LATENCY, + HistogramBuckets.DEFAULT_1MS_100S +).start(); +// ... do work ... 
+sw.stop(); +``` + +## Helper Classes + +### HistogramBuckets + +Defines standard bucket configurations: + +- `DEFAULT_1MS_100S`: For most latency measurements (1ms to 100s range) + - Fine-grained: 1ms steps from 1-10ms + - Medium-grained: 10ms steps from 10-100ms + - Coarse: 100ms steps from 100ms-1s + - Second-level: 1s steps from 1-100s + - Use for: Most RPC calls, decision/activity poll, execution latencies + +- `LOW_1MS_100S`: Low-resolution version for high-cardinality metrics (1ms to 100s) + - Approximately half the buckets of DEFAULT_1MS_100S + - Use for: Per-activity-type, per-workflow-type metrics with high cardinality + +- `HIGH_1MS_24H`: For long-running operations (1ms to 24 hours) + - Extended range for multi-hour workflows + - Use for: Workflow end-to-end latency, long-running activities + +- `MID_1MS_24H`: Lower-resolution version of HIGH_1MS_24H + - Fewer buckets than HIGH_1MS_24H + - Use for: When HIGH_1MS_24H's cardinality is too high + +### MetricsEmit + +Provides dual-emit helper methods: + +- `emitLatency(scope, name, duration, buckets)`: Directly record a duration +- `startLatency(scope, name, buckets)`: Create a dual stopwatch + +### DualStopwatch + +A stopwatch wrapper that records to both timer and histogram when `.stop()` is called. + +## Migration Checklist + +For each timer metric: + +1. ✅ Identify the timer usage (e.g., `scope.timer(name).start()`) +2. ✅ Replace with `MetricsEmit.startLatency(scope, name, buckets)` +3. ✅ Choose appropriate bucket configuration (typically `HistogramBuckets.DEFAULT_1MS_100S`) +4. ✅ Verify both metrics are being emitted +5. ⏳ Update dashboards to use histogram metric +6. ⏳ Update alerts to use histogram metric +7. 
⏳ (Future) Remove timer emission + +## Example Conversions + +### Example 1: Poll Latency + +```java +// Before: +Stopwatch sw = scope.timer(MetricsType.DECISION_POLL_LATENCY).start(); +PollForDecisionTaskResponse result = service.PollForDecisionTask(request); +sw.stop(); + +// After: +DualStopwatch sw = MetricsEmit.startLatency( + scope, + MetricsType.DECISION_POLL_LATENCY, + HistogramBuckets.DEFAULT_1MS_100S +); +PollForDecisionTaskResponse result = service.PollForDecisionTask(request); +sw.stop(); +``` + +### Example 2: Execution Latency + +```java +// Before: +Stopwatch sw = metricsScope.timer(MetricsType.ACTIVITY_EXEC_LATENCY).start(); +Result response = handler.handle(task, metricsScope, false); +sw.stop(); + +// After: +DualStopwatch sw = MetricsEmit.startLatency( + metricsScope, + MetricsType.ACTIVITY_EXEC_LATENCY, + HistogramBuckets.DEFAULT_1MS_100S +); +Result response = handler.handle(task, metricsScope, false); +sw.stop(); +``` + +### Example 3: Direct Duration Recording + +```java +// Before: +Duration scheduledToStartLatency = Duration.between(scheduledTime, startedTime); +scope.timer(MetricsType.DECISION_SCHEDULED_TO_START_LATENCY).record(scheduledToStartLatency); + +// After: +Duration scheduledToStartLatency = Duration.between(scheduledTime, startedTime); +MetricsEmit.emitLatency( + scope, + MetricsType.DECISION_SCHEDULED_TO_START_LATENCY, + scheduledToStartLatency, + HistogramBuckets.DEFAULT_1MS_100S +); +``` + +## Testing + +The migration preserves existing timer behavior while adding histogram emission, so: + +- Existing timer-based tests continue to work +- Existing timer-based dashboards/alerts continue to work +- New histogram metrics are available for gradual migration + +## Timeline + +1. **Now**: Dual emission in place, both metrics available +2. **Next Quarter**: Migrate dashboards and alerts to histograms +3. **Future Release**: Remove timer emission, histogram-only + +## Questions? 
+ +Contact the Cadence team for guidance on specific metrics or migration questions. diff --git a/src/main/java/com/uber/cadence/internal/metrics/MetricsEmit.java b/src/main/java/com/uber/cadence/internal/metrics/MetricsEmit.java new file mode 100644 index 000000000..184409a4e --- /dev/null +++ b/src/main/java/com/uber/cadence/internal/metrics/MetricsEmit.java @@ -0,0 +1,214 @@ +/* + * Copyright 2012-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Modifications copyright (C) 2017 Uber Technologies, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License is + * located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +package com.uber.cadence.internal.metrics; + +import com.uber.m3.tally.DurationBuckets; +import com.uber.m3.tally.Scope; +import com.uber.m3.tally.Stopwatch; +import com.uber.m3.util.Duration; + +/** + * Helper utilities for dual-emitting metrics during timer to histogram migration. + * + *

This class provides utilities to support gradual migration from timer metrics to histogram + * metrics. By default, both timer and histogram metrics are emitted to support gradual + * dashboard/alert migration without requiring a flag day. + * + *

Migration path: 1. Use these helpers to emit both timers and histograms (default behavior) 2. + * Update dashboards/alerts to use histogram metrics 3. In a future release, remove timer emission + * and use histograms exclusively + * + *

Example usage: + * + *

{@code
+ * // Direct latency recording
+ * Duration latency = Duration.ofMillis(150);
+ * MetricsEmit.emitLatency(scope, MetricsType.DECISION_POLL_LATENCY, latency, HistogramBuckets.DEFAULT_1MS_100S);
+ *
+ * // Using stopwatch
 + * MetricsEmit.DualStopwatch sw = MetricsEmit.startLatency(scope, MetricsType.ACTIVITY_EXEC_LATENCY, HistogramBuckets.DEFAULT_1MS_100S);
+ * // ... do work ...
+ * sw.stop();
+ *
+ * // Configure emission mode (optional, typically done at application startup)
 + * MetricsEmit.setEmitMode(MetricsEmit.MetricEmitMode.EMIT_HISTOGRAMS_ONLY);
+ * }
+ */ +public final class MetricsEmit { + + /** Metric emission mode controls which metrics are emitted for latency measurements. */ + public enum MetricEmitMode { + /** Emit only timer metrics (legacy OSS behavior) */ + EMIT_TIMERS_ONLY, + /** Emit both timer and histogram metrics (default for migration) */ + EMIT_BOTH, + /** Emit only histogram metrics (post-migration) */ + EMIT_HISTOGRAMS_ONLY + } + + /** + * Current emission mode. Default is EMIT_BOTH for migration. This should be set during + * application initialization (e.g., in static initializer or before starting workers). It should + * NOT be changed dynamically after workers have started. + */ + private static volatile MetricEmitMode currentEmitMode = MetricEmitMode.EMIT_BOTH; + + /** + * Configures the metric emission strategy. This should be called during application + * initialization, before any metrics are emitted. + * + * @param mode The emission mode to use + *

Example usage: + *

{@code
+   * // To use only timers (legacy behavior)
 +   * MetricsEmit.setEmitMode(MetricsEmit.MetricEmitMode.EMIT_TIMERS_ONLY);
+   *
+   * // To use both (default, for migration)
 +   * MetricsEmit.setEmitMode(MetricsEmit.MetricEmitMode.EMIT_BOTH);
+   *
+   * // To use only histograms (post-migration)
 +   * MetricsEmit.setEmitMode(MetricsEmit.MetricEmitMode.EMIT_HISTOGRAMS_ONLY);
+   * }
+ */ + public static void setEmitMode(MetricEmitMode mode) { + if (mode == null) { + throw new IllegalArgumentException("MetricEmitMode cannot be null"); + } + currentEmitMode = mode; + } + + /** + * Returns the current emission mode. + * + * @return The current emission mode + */ + public static MetricEmitMode getEmitMode() { + return currentEmitMode; + } + + /** + * Records latency based on the current emit mode setting. + * + *

This helper function supports flexible metric emission during timer→histogram migration. The + * behavior depends on the current emit mode: + * + *

+ * + * @param scope The tally scope to emit metrics to + * @param name The metric name (without suffix) + * @param latency The duration to record + * @param buckets The histogram bucket configuration to use + */ + public static void emitLatency( + Scope scope, String name, Duration latency, DurationBuckets buckets) { + switch (currentEmitMode) { + case EMIT_TIMERS_ONLY: + scope.timer(name).record(latency); + break; + case EMIT_BOTH: + scope.timer(name).record(latency); + scope.histogram(name, buckets).recordDuration(latency); + break; + case EMIT_HISTOGRAMS_ONLY: + scope.histogram(name, buckets).recordDuration(latency); + break; + } + } + + /** + * Creates a stopwatch that emits based on current emit mode setting. + * + *

Call .stop() on the returned stopwatch to record the duration. The behavior depends on the + * current emit mode. + * + * @param scope The tally scope to emit metrics to + * @param name The metric name (without suffix) + * @param buckets The histogram bucket configuration to use + * @return A dual stopwatch that records based on emit mode + */ + public static DualStopwatch startLatency(Scope scope, String name, DurationBuckets buckets) { + MetricEmitMode mode = currentEmitMode; + Stopwatch timerSW = null; + Stopwatch histogramSW = null; + + switch (mode) { + case EMIT_TIMERS_ONLY: + timerSW = scope.timer(name).start(); + break; + case EMIT_BOTH: + timerSW = scope.timer(name).start(); + histogramSW = scope.histogram(name, buckets).start(); + break; + case EMIT_HISTOGRAMS_ONLY: + histogramSW = scope.histogram(name, buckets).start(); + break; + } + + return new DualStopwatch(timerSW, histogramSW, mode); + } + + /** + * A stopwatch that emits metrics based on the emit mode setting. + * + *

This supports flexible metric emission during timer→histogram migration. The metrics emitted + * depend on the mode captured when the stopwatch was started. + */ + public static class DualStopwatch { + private final Stopwatch timerSW; + private final Stopwatch histogramSW; + private final MetricEmitMode mode; + + DualStopwatch(Stopwatch timerSW, Stopwatch histogramSW, MetricEmitMode mode) { + this.timerSW = timerSW; + this.histogramSW = histogramSW; + this.mode = mode; + } + + /** Stops and records the elapsed time based on emit mode setting. */ + public void stop() { + switch (mode) { + case EMIT_TIMERS_ONLY: + if (timerSW != null) { + timerSW.stop(); + } + break; + case EMIT_BOTH: + if (timerSW != null) { + timerSW.stop(); + } + if (histogramSW != null) { + histogramSW.stop(); + } + break; + case EMIT_HISTOGRAMS_ONLY: + if (histogramSW != null) { + histogramSW.stop(); + } + break; + } + } + } + + private MetricsEmit() { + // Utility class - prevent instantiation + } +} diff --git a/src/main/java/com/uber/cadence/internal/replay/ReplayDecider.java b/src/main/java/com/uber/cadence/internal/replay/ReplayDecider.java index debb1c780..08c4a44db 100644 --- a/src/main/java/com/uber/cadence/internal/replay/ReplayDecider.java +++ b/src/main/java/com/uber/cadence/internal/replay/ReplayDecider.java @@ -24,6 +24,8 @@ import com.uber.cadence.common.RetryOptions; import com.uber.cadence.internal.common.OptionsUtils; import com.uber.cadence.internal.common.RpcRetryer; +import com.uber.cadence.internal.metrics.HistogramBuckets; +import com.uber.cadence.internal.metrics.MetricsEmit; import com.uber.cadence.internal.metrics.MetricsTag; import com.uber.cadence.internal.metrics.MetricsType; import com.uber.cadence.internal.replay.HistoryHelper.DecisionEvents; @@ -35,7 +37,6 @@ import com.uber.cadence.serviceclient.IWorkflowService; import com.uber.cadence.workflow.Functions; import com.uber.m3.tally.Scope; -import com.uber.m3.tally.Stopwatch; import com.uber.m3.util.ImmutableMap; 
import java.time.Duration; import java.util.ArrayList; @@ -299,7 +300,8 @@ private void completeWorkflow() { long nanoTime = TimeUnit.NANOSECONDS.convert(System.currentTimeMillis(), TimeUnit.MILLISECONDS); com.uber.m3.util.Duration d = com.uber.m3.util.Duration.ofNanos(nanoTime - wfStartTimeNanos); - metricsScope.timer(MetricsType.WORKFLOW_E2E_LATENCY).record(d); + MetricsEmit.emitLatency( + metricsScope, MetricsType.WORKFLOW_E2E_LATENCY, d, HistogramBuckets.HIGH_1MS_24H); } private void updateTimers() { @@ -667,7 +669,11 @@ public HistoryEvent next() { } metricsScope.counter(MetricsType.WORKFLOW_GET_HISTORY_COUNTER).inc(1); - Stopwatch sw = metricsScope.timer(MetricsType.WORKFLOW_GET_HISTORY_LATENCY).start(); + MetricsEmit.DualStopwatch sw = + MetricsEmit.startLatency( + metricsScope, + MetricsType.WORKFLOW_GET_HISTORY_LATENCY, + HistogramBuckets.DEFAULT_1MS_100S); RetryOptions retryOptions = new RetryOptions.Builder() .setExpiration(decisionTaskRemainingTime) diff --git a/src/main/java/com/uber/cadence/internal/shadowing/ReplayWorkflowActivityImpl.java b/src/main/java/com/uber/cadence/internal/shadowing/ReplayWorkflowActivityImpl.java index 8b0ea6f95..55ce2dc8a 100644 --- a/src/main/java/com/uber/cadence/internal/shadowing/ReplayWorkflowActivityImpl.java +++ b/src/main/java/com/uber/cadence/internal/shadowing/ReplayWorkflowActivityImpl.java @@ -24,6 +24,8 @@ import com.uber.cadence.common.WorkflowExecutionHistory; import com.uber.cadence.internal.common.RpcRetryer; import com.uber.cadence.internal.common.WorkflowExecutionUtils; +import com.uber.cadence.internal.metrics.HistogramBuckets; +import com.uber.cadence.internal.metrics.MetricsEmit; import com.uber.cadence.internal.metrics.MetricsType; import com.uber.cadence.serviceclient.IWorkflowService; import com.uber.cadence.testing.TestEnvironmentOptions; @@ -32,7 +34,6 @@ import com.uber.cadence.worker.WorkflowImplementationOptions; import com.uber.cadence.workflow.Functions; import com.uber.m3.tally.Scope; 
-import com.uber.m3.tally.Stopwatch; import java.util.List; import java.util.Objects; import java.util.Optional; @@ -197,7 +198,9 @@ protected WorkflowExecutionHistory getFullHistory(String domain, WorkflowExecuti protected boolean replayWorkflowHistory( String domain, WorkflowExecution execution, WorkflowExecutionHistory workflowHistory) throws Exception { - Stopwatch sw = this.metricsScope.timer(MetricsType.REPLAY_LATENCY).start(); + MetricsEmit.DualStopwatch sw = + MetricsEmit.startLatency( + this.metricsScope, MetricsType.REPLAY_LATENCY, HistogramBuckets.DEFAULT_1MS_100S); try { worker.replayWorkflowExecution(workflowHistory); } catch (Exception e) { diff --git a/src/main/java/com/uber/cadence/internal/worker/ActivityPollTask.java b/src/main/java/com/uber/cadence/internal/worker/ActivityPollTask.java index 594f62138..f5a608417 100644 --- a/src/main/java/com/uber/cadence/internal/worker/ActivityPollTask.java +++ b/src/main/java/com/uber/cadence/internal/worker/ActivityPollTask.java @@ -22,10 +22,11 @@ import com.google.common.collect.ImmutableMap; import com.uber.cadence.*; +import com.uber.cadence.internal.metrics.HistogramBuckets; +import com.uber.cadence.internal.metrics.MetricsEmit; import com.uber.cadence.internal.metrics.MetricsTag; import com.uber.cadence.internal.metrics.MetricsType; import com.uber.cadence.serviceclient.IWorkflowService; -import com.uber.m3.tally.Stopwatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,7 +48,11 @@ public ActivityPollTask( @Override protected PollForActivityTaskResponse pollTask() throws CadenceError { options.getMetricsScope().counter(MetricsType.ACTIVITY_POLL_COUNTER).inc(1); - Stopwatch sw = options.getMetricsScope().timer(MetricsType.ACTIVITY_POLL_LATENCY).start(); + MetricsEmit.DualStopwatch sw = + MetricsEmit.startLatency( + options.getMetricsScope(), + MetricsType.ACTIVITY_POLL_LATENCY, + HistogramBuckets.DEFAULT_1MS_100S); PollForActivityTaskRequest pollRequest = new 
PollForActivityTaskRequest(); pollRequest.setDomain(domain); pollRequest.setIdentity(options.getIdentity()); diff --git a/src/main/java/com/uber/cadence/internal/worker/ActivityPollTaskBase.java b/src/main/java/com/uber/cadence/internal/worker/ActivityPollTaskBase.java index bded97746..32d4fcba7 100644 --- a/src/main/java/com/uber/cadence/internal/worker/ActivityPollTaskBase.java +++ b/src/main/java/com/uber/cadence/internal/worker/ActivityPollTaskBase.java @@ -19,6 +19,8 @@ import com.uber.cadence.CadenceError; import com.uber.cadence.PollForActivityTaskResponse; +import com.uber.cadence.internal.metrics.HistogramBuckets; +import com.uber.cadence.internal.metrics.MetricsEmit; import com.uber.cadence.internal.metrics.MetricsTag; import com.uber.cadence.internal.metrics.MetricsType; import com.uber.m3.tally.Scope; @@ -50,11 +52,12 @@ public PollForActivityTaskResponse poll() throws CadenceError { MetricsTag.WORKFLOW_TYPE, result.getWorkflowType().getName())); metricsScope.counter(MetricsType.ACTIVITY_POLL_SUCCEED_COUNTER).inc(1); - metricsScope - .timer(MetricsType.ACTIVITY_SCHEDULED_TO_START_LATENCY) - .record( - Duration.ofNanos( - result.getStartedTimestamp() - result.getScheduledTimestampOfThisAttempt())); + MetricsEmit.emitLatency( + metricsScope, + MetricsType.ACTIVITY_SCHEDULED_TO_START_LATENCY, + Duration.ofNanos( + result.getStartedTimestamp() - result.getScheduledTimestampOfThisAttempt()), + HistogramBuckets.HIGH_1MS_24H); return result; } diff --git a/src/main/java/com/uber/cadence/internal/worker/ActivityWorker.java b/src/main/java/com/uber/cadence/internal/worker/ActivityWorker.java index ecd6287c8..f05651e3a 100644 --- a/src/main/java/com/uber/cadence/internal/worker/ActivityWorker.java +++ b/src/main/java/com/uber/cadence/internal/worker/ActivityWorker.java @@ -21,6 +21,8 @@ import com.uber.cadence.context.ContextPropagator; import com.uber.cadence.internal.common.RpcRetryer; import com.uber.cadence.internal.logging.LoggerTag; +import 
com.uber.cadence.internal.metrics.HistogramBuckets; +import com.uber.cadence.internal.metrics.MetricsEmit; import com.uber.cadence.internal.metrics.MetricsTag; import com.uber.cadence.internal.metrics.MetricsType; import com.uber.cadence.internal.tracing.TracingPropagator; @@ -28,7 +30,6 @@ import com.uber.cadence.internal.worker.Poller.PollTask; import com.uber.cadence.serviceclient.IWorkflowService; import com.uber.m3.tally.Scope; -import com.uber.m3.tally.Stopwatch; import com.uber.m3.util.Duration; import com.uber.m3.util.ImmutableMap; import io.opentracing.Span; @@ -127,11 +128,11 @@ public void handle(PollForActivityTaskResponse task) throws Exception { MetricsTag.WORKFLOW_TYPE, task.getWorkflowType().getName())); - metricsScope - .timer(MetricsType.ACTIVITY_SCHEDULED_TO_START_LATENCY) - .record( - Duration.ofNanos( - task.getStartedTimestamp() - task.getScheduledTimestampOfThisAttempt())); + MetricsEmit.emitLatency( + metricsScope, + MetricsType.ACTIVITY_SCHEDULED_TO_START_LATENCY, + Duration.ofNanos(task.getStartedTimestamp() - task.getScheduledTimestampOfThisAttempt()), + HistogramBuckets.HIGH_1MS_24H); // The following tags are for logging. 
MDC.put(LoggerTag.ACTIVITY_ID, task.getActivityId()); @@ -143,25 +144,35 @@ public void handle(PollForActivityTaskResponse task) throws Exception { propagateContext(task); Span span = spanFactory.spanForExecuteActivity(task); try (io.opentracing.Scope scope = tracer.activateSpan(span)) { - Stopwatch sw = metricsScope.timer(MetricsType.ACTIVITY_EXEC_LATENCY).start(); + MetricsEmit.DualStopwatch sw = + MetricsEmit.startLatency( + metricsScope, MetricsType.ACTIVITY_EXEC_LATENCY, HistogramBuckets.HIGH_1MS_24H); ActivityTaskHandler.Result response = handler.handle(task, metricsScope, false); sw.stop(); - sw = metricsScope.timer(MetricsType.ACTIVITY_RESP_LATENCY).start(); + sw = + MetricsEmit.startLatency( + metricsScope, MetricsType.ACTIVITY_RESP_LATENCY, HistogramBuckets.DEFAULT_1MS_100S); sendReply(task, response, metricsScope); sw.stop(); long nanoTime = TimeUnit.NANOSECONDS.convert(System.currentTimeMillis(), TimeUnit.MILLISECONDS); Duration duration = Duration.ofNanos(nanoTime - task.getScheduledTimestampOfThisAttempt()); - metricsScope.timer(MetricsType.ACTIVITY_E2E_LATENCY).record(duration); + MetricsEmit.emitLatency( + metricsScope, + MetricsType.ACTIVITY_E2E_LATENCY, + duration, + HistogramBuckets.HIGH_1MS_24H); } catch (CancellationException e) { RespondActivityTaskCanceledRequest cancelledRequest = new RespondActivityTaskCanceledRequest(); cancelledRequest.setDetails( String.valueOf(e.getMessage()).getBytes(StandardCharsets.UTF_8)); - Stopwatch sw = metricsScope.timer(MetricsType.ACTIVITY_RESP_LATENCY).start(); + MetricsEmit.DualStopwatch sw = + MetricsEmit.startLatency( + metricsScope, MetricsType.ACTIVITY_RESP_LATENCY, HistogramBuckets.DEFAULT_1MS_100S); sendReply(task, new Result(null, null, cancelledRequest), metricsScope); sw.stop(); } finally { diff --git a/src/main/java/com/uber/cadence/internal/worker/LocalActivityWorker.java b/src/main/java/com/uber/cadence/internal/worker/LocalActivityWorker.java index 01ffde191..3483e9e68 100644 --- 
a/src/main/java/com/uber/cadence/internal/worker/LocalActivityWorker.java +++ b/src/main/java/com/uber/cadence/internal/worker/LocalActivityWorker.java @@ -24,13 +24,14 @@ import com.uber.cadence.common.RetryOptions; import com.uber.cadence.context.ContextPropagator; import com.uber.cadence.internal.common.LocalActivityMarkerData; +import com.uber.cadence.internal.metrics.HistogramBuckets; +import com.uber.cadence.internal.metrics.MetricsEmit; import com.uber.cadence.internal.metrics.MetricsTag; import com.uber.cadence.internal.metrics.MetricsType; import com.uber.cadence.internal.replay.ClockDecisionContext; import com.uber.cadence.internal.replay.ExecuteLocalActivityParameters; import com.uber.cadence.internal.tracing.TracingPropagator; import com.uber.m3.tally.Scope; -import com.uber.m3.tally.Stopwatch; import com.uber.m3.util.ImmutableMap; import io.opentracing.Span; import io.opentracing.Tracer; @@ -198,7 +199,11 @@ private ActivityTaskHandler.Result handleLocalActivity(Task task) throws Interru pollTask.setInput(task.params.getInput()); pollTask.setAttempt(task.params.getAttempt()); - Stopwatch sw = metricsScope.timer(MetricsType.LOCAL_ACTIVITY_EXECUTION_LATENCY).start(); + MetricsEmit.DualStopwatch sw = + MetricsEmit.startLatency( + metricsScope, + MetricsType.LOCAL_ACTIVITY_EXECUTION_LATENCY, + HistogramBuckets.DEFAULT_1MS_100S); ActivityTaskHandler.Result result = handler.handle(pollTask, metricsScope, true); sw.stop(); result.setAttempt(task.params.getAttempt()); diff --git a/src/main/java/com/uber/cadence/internal/worker/WorkflowPollTask.java b/src/main/java/com/uber/cadence/internal/worker/WorkflowPollTask.java index 40cec1bed..b90921b20 100644 --- a/src/main/java/com/uber/cadence/internal/worker/WorkflowPollTask.java +++ b/src/main/java/com/uber/cadence/internal/worker/WorkflowPollTask.java @@ -22,11 +22,12 @@ import com.uber.cadence.*; import com.uber.cadence.common.BinaryChecksum; +import com.uber.cadence.internal.metrics.HistogramBuckets; +import 
com.uber.cadence.internal.metrics.MetricsEmit; import com.uber.cadence.internal.metrics.MetricsTag; import com.uber.cadence.internal.metrics.MetricsType; import com.uber.cadence.serviceclient.IWorkflowService; import com.uber.m3.tally.Scope; -import com.uber.m3.tally.Stopwatch; import com.uber.m3.util.Duration; import com.uber.m3.util.ImmutableMap; import java.util.Objects; @@ -61,7 +62,9 @@ final class WorkflowPollTask implements Poller.PollTask