diff --git a/src/main/java/com/uber/cadence/internal/metrics/HistogramBuckets.java b/src/main/java/com/uber/cadence/internal/metrics/HistogramBuckets.java new file mode 100644 index 000000000..8bf9390b4 --- /dev/null +++ b/src/main/java/com/uber/cadence/internal/metrics/HistogramBuckets.java @@ -0,0 +1,207 @@ +/* + * Copyright 2012-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Modifications copyright (C) 2017 Uber Technologies, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License is + * located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +package com.uber.cadence.internal.metrics; + +import com.uber.m3.tally.DurationBuckets; +import com.uber.m3.util.Duration; +import java.util.concurrent.TimeUnit; + +/** + * Histogram bucket configurations for timer metrics migration. + * + *
This class defines standard histogram bucket configurations used during the migration from + * timers to histograms. These buckets provide consistent granularity for measuring latencies across + * different time ranges. + * + *
Note: Unlike the Go client which uses subsettable exponential histograms with algorithmic + * bucket generation, the Java client uses explicit bucket definitions. We provide multiple + * configurations to balance between granularity and cardinality: + * + *
Range: 1ms to 100s + * + *
Provides: - Fine-grained buckets (1ms steps) from 1ms to 10ms - Medium-grained buckets (10ms + * steps) from 10ms to 100ms - Coarser buckets (100ms steps) from 100ms to 1s - Second-level + * buckets from 1s to 100s + * + *
Use for: - Decision poll latency - Activity poll latency - Decision execution latency - + * Activity execution latency - Workflow replay latency - Most RPC call latencies + */ + public static final DurationBuckets DEFAULT_1MS_100S = + DurationBuckets.custom( + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)), // 1ms + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(2)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(3)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(4)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(5)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(6)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(7)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(8)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(9)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(20)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(30)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(40)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(50)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(60)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(70)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(80)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(90)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(200)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(300)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(400)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(500)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(600)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(700)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(800)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(900)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(2)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(3)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(4)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(5)), + 
Duration.ofNanos(TimeUnit.SECONDS.toNanos(6)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(7)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(8)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(9)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(20)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(30)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(40)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(50)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(60)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(70)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(80)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(90)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(100))); + + /** + * Low-resolution bucket configuration for high-cardinality metrics. + * + *
Range: 1ms to 100s (same as DEFAULT_1MS_100S but with fewer buckets) + * + *
Provides: - Coarser buckets with ~2x steps instead of fine-grained steps - Approximately + * half the cardinality of DEFAULT_1MS_100S + * + *
Use for: - Per-activity-type metrics where cardinality is high - Per-workflow-type metrics + * where cardinality is high - Metrics with many tag combinations + */ + public static final DurationBuckets LOW_1MS_100S = + DurationBuckets.custom( + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(2)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(5)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(20)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(50)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(200)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(500)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(2)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(5)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(20)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(50)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(100))); + + /** + * High-resolution bucket configuration for long-running operations. + * + *
Range: 1ms to 24 hours + * + *
Provides: - Fine-grained buckets from 1ms to 10ms - Medium-grained from 10ms to 1s - + * Second-level buckets from 1s to 10 minutes - Minute-level buckets from 10 minutes to 24 hours + * + *
Use for: - Workflow end-to-end latency - Long-running activity execution latency - Multi-day + * operation metrics + */ + public static final DurationBuckets HIGH_1MS_24H = + DurationBuckets.custom( + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(2)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(5)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(20)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(50)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(200)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(500)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(2)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(5)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(20)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(30)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(60)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(120)), // 2 min + Duration.ofNanos(TimeUnit.SECONDS.toNanos(300)), // 5 min + Duration.ofNanos(TimeUnit.SECONDS.toNanos(600)), // 10 min + Duration.ofNanos(TimeUnit.MINUTES.toNanos(20)), + Duration.ofNanos(TimeUnit.MINUTES.toNanos(30)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(1)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(2)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(4)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(8)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(12)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(24))); + + /** + * Medium-resolution bucket configuration for long-running operations. + * + *
Range: 1ms to 24 hours (same as HIGH_1MS_24H but with fewer buckets) + * + *
Provides: - Coarser buckets than HIGH_1MS_24H - Better for high-cardinality long-duration + * metrics + * + *
Use for: - When HIGH_1MS_24H's cardinality is too high - Per-workflow-type E2E latency with + * many workflow types + */ + public static final DurationBuckets MID_1MS_24H = + DurationBuckets.custom( + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)), + Duration.ofNanos(TimeUnit.SECONDS.toNanos(30)), + Duration.ofNanos(TimeUnit.MINUTES.toNanos(1)), + Duration.ofNanos(TimeUnit.MINUTES.toNanos(5)), + Duration.ofNanos(TimeUnit.MINUTES.toNanos(10)), + Duration.ofNanos(TimeUnit.MINUTES.toNanos(30)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(1)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(4)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(12)), + Duration.ofNanos(TimeUnit.HOURS.toNanos(24))); + + private HistogramBuckets() { + // Utility class - prevent instantiation + } +} diff --git a/src/main/java/com/uber/cadence/internal/metrics/MIGRATION.md b/src/main/java/com/uber/cadence/internal/metrics/MIGRATION.md new file mode 100644 index 000000000..547e619bb --- /dev/null +++ b/src/main/java/com/uber/cadence/internal/metrics/MIGRATION.md @@ -0,0 +1,173 @@ +# Timer to Histogram Migration + +## Overview + +This document describes the migration from timer metrics to histogram metrics in the Cadence Java client. The migration uses a dual-emit pattern where **both timer and histogram metrics are always emitted**, allowing for gradual migration of dashboards and alerts without requiring a coordinated flag day. + +## Why Migrate? 
+ +Timers and histograms serve similar purposes (measuring latencies and durations) but have different characteristics: + +- **Timers**: Legacy approach, currently used throughout the codebase +- **Histograms**: More flexible, better support for custom buckets and percentile calculations + +## Migration Strategy + +### Phase 1: Dual Emission (Current) + +Both timer and histogram metrics are emitted simultaneously: + +```java +// Old code: +Stopwatch sw = scope.timer(MetricsType.DECISION_POLL_LATENCY).start(); +// ... do work ... +sw.stop(); + +// New code (dual emit): +DualStopwatch sw = MetricsEmit.startLatency( + scope, + MetricsType.DECISION_POLL_LATENCY, + HistogramBuckets.DEFAULT_1MS_100S +); +// ... do work ... +sw.stop(); // Records to BOTH timer and histogram +``` + +### Phase 2: Dashboard/Alert Migration (Next) + +Update all dashboards and alerts to use histogram metrics instead of timer metrics. This can be done gradually since both are being emitted. + +### Phase 3: Remove Timer Emission (Future) + +Once all dashboards/alerts are migrated, remove timer emission: + +```java +// Future code (histogram only): +Stopwatch sw = scope.histogram( + MetricsType.DECISION_POLL_LATENCY, + HistogramBuckets.DEFAULT_1MS_100S +).start(); +// ... do work ... 
+sw.stop(); +``` + +## Helper Classes + +### HistogramBuckets + +Defines standard bucket configurations: + +- `DEFAULT_1MS_100S`: For most latency measurements (1ms to 100s range) + - Fine-grained: 1ms steps from 1-10ms + - Medium-grained: 10ms steps from 10-100ms + - Coarse: 100ms steps from 100ms-1s + - Second-level: 1s steps from 1-100s + - Use for: Most RPC calls, decision/activity poll, execution latencies + +- `LOW_1MS_100S`: Low-resolution version for high-cardinality metrics (1ms to 100s) + - Approximately half the buckets of DEFAULT_1MS_100S + - Use for: Per-activity-type, per-workflow-type metrics with high cardinality + +- `HIGH_1MS_24H`: For long-running operations (1ms to 24 hours) + - Extended range for multi-hour workflows + - Use for: Workflow end-to-end latency, long-running activities + +- `MID_1MS_24H`: Lower-resolution version of HIGH_1MS_24H + - Fewer buckets than HIGH_1MS_24H + - Use for: When HIGH_1MS_24H's cardinality is too high + +### MetricsEmit + +Provides dual-emit helper methods: + +- `emitLatency(scope, name, duration, buckets)`: Directly record a duration +- `startLatency(scope, name, buckets)`: Create a dual stopwatch + +### DualStopwatch + +A stopwatch wrapper that records to both timer and histogram when `.stop()` is called. + +## Migration Checklist + +For each timer metric: + +1. ✅ Identify the timer usage (e.g., `scope.timer(name).start()`) +2. ✅ Replace with `MetricsEmit.startLatency(scope, name, buckets)` +3. ✅ Choose appropriate bucket configuration (typically `HistogramBuckets.DEFAULT_1MS_100S`) +4. ✅ Verify both metrics are being emitted +5. ⏳ Update dashboards to use histogram metric +6. ⏳ Update alerts to use histogram metric +7. 
⏳ (Future) Remove timer emission + +## Example Conversions + +### Example 1: Poll Latency + +```java +// Before: +Stopwatch sw = scope.timer(MetricsType.DECISION_POLL_LATENCY).start(); +PollForDecisionTaskResponse result = service.PollForDecisionTask(request); +sw.stop(); + +// After: +DualStopwatch sw = MetricsEmit.startLatency( + scope, + MetricsType.DECISION_POLL_LATENCY, + HistogramBuckets.DEFAULT_1MS_100S +); +PollForDecisionTaskResponse result = service.PollForDecisionTask(request); +sw.stop(); +``` + +### Example 2: Execution Latency + +```java +// Before: +Stopwatch sw = metricsScope.timer(MetricsType.ACTIVITY_EXEC_LATENCY).start(); +Result response = handler.handle(task, metricsScope, false); +sw.stop(); + +// After: +DualStopwatch sw = MetricsEmit.startLatency( + metricsScope, + MetricsType.ACTIVITY_EXEC_LATENCY, + HistogramBuckets.DEFAULT_1MS_100S +); +Result response = handler.handle(task, metricsScope, false); +sw.stop(); +``` + +### Example 3: Direct Duration Recording + +```java +// Before: +Duration scheduledToStartLatency = Duration.between(scheduledTime, startedTime); +scope.timer(MetricsType.DECISION_SCHEDULED_TO_START_LATENCY).record(scheduledToStartLatency); + +// After: +Duration scheduledToStartLatency = Duration.between(scheduledTime, startedTime); +MetricsEmit.emitLatency( + scope, + MetricsType.DECISION_SCHEDULED_TO_START_LATENCY, + scheduledToStartLatency, + HistogramBuckets.DEFAULT_1MS_100S +); +``` + +## Testing + +The migration preserves existing timer behavior while adding histogram emission, so: + +- Existing timer-based tests continue to work +- Existing timer-based dashboards/alerts continue to work +- New histogram metrics are available for gradual migration + +## Timeline + +1. **Now**: Dual emission in place, both metrics available +2. **Next Quarter**: Migrate dashboards and alerts to histograms +3. **Future Release**: Remove timer emission, histogram-only + +## Questions? 
+ +Contact the Cadence team for guidance on specific metrics or migration questions. diff --git a/src/main/java/com/uber/cadence/internal/metrics/MetricsEmit.java b/src/main/java/com/uber/cadence/internal/metrics/MetricsEmit.java new file mode 100644 index 000000000..184409a4e --- /dev/null +++ b/src/main/java/com/uber/cadence/internal/metrics/MetricsEmit.java @@ -0,0 +1,214 @@ +/* + * Copyright 2012-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Modifications copyright (C) 2017 Uber Technologies, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License is + * located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +package com.uber.cadence.internal.metrics; + +import com.uber.m3.tally.DurationBuckets; +import com.uber.m3.tally.Scope; +import com.uber.m3.tally.Stopwatch; +import com.uber.m3.util.Duration; + +/** + * Helper utilities for dual-emitting metrics during timer to histogram migration. + * + *
This class provides utilities to support gradual migration from timer metrics to histogram + * metrics. By default, both timer and histogram metrics are emitted to support gradual + * dashboard/alert migration without requiring a flag day. + * + *
Migration path: 1. Use these helpers to emit both timers and histograms (default behavior) 2. + * Update dashboards/alerts to use histogram metrics 3. In a future release, remove timer emission + * and use histograms exclusively + * + *
Example usage: + * + *
{@code
+ * // Direct latency recording
+ * Duration latency = Duration.ofMillis(150);
+ * MetricsEmit.emitLatency(scope, MetricsType.DECISION_POLL_LATENCY, latency, HistogramBuckets.DEFAULT_1MS_100S);
+ *
+ * // Using stopwatch
+ * DualStopwatch sw = MetricsEmit.startLatency(scope, MetricsType.ACTIVITY_EXEC_LATENCY, HistogramBuckets.DEFAULT_1MS_100S);
+ * // ... do work ...
+ * sw.stop();
+ *
+ * // Configure emission mode (optional, typically done at application startup)
+ * MetricsEmit.setEmitMode(MetricEmitMode.EMIT_HISTOGRAMS_ONLY);
+ * }
+ */
+public final class MetricsEmit {
+
  /** Metric emission mode: controls which metric kinds are emitted for latency measurements. */
  public enum MetricEmitMode {
    /** Emit only timer metrics (legacy OSS behavior, pre-migration). */
    EMIT_TIMERS_ONLY,
    /** Emit both timer and histogram metrics (the default during migration). */
    EMIT_BOTH,
    /** Emit only histogram metrics (post-migration, once dashboards/alerts are switched). */
    EMIT_HISTOGRAMS_ONLY
  }
+
  /**
   * Current emission mode; defaults to EMIT_BOTH to support migration. Set this during
   * application initialization (e.g., in a static initializer or before starting workers); it
   * should NOT be changed dynamically after workers have started. Declared volatile so a mode
   * written by the initializing thread is visible to worker threads that read it later.
   */
  private static volatile MetricEmitMode currentEmitMode = MetricEmitMode.EMIT_BOTH;
+
  /**
   * Configures the metric emission strategy. This should be called during application
   * initialization, before any metrics are emitted.
   *
   * <p>Example usage:
   *
   * <pre>{@code
   * // To use only timers (legacy behavior)
   * MetricsEmit.setEmitMode(MetricEmitMode.EMIT_TIMERS_ONLY);
   *
   * // To use both (default, for migration)
   * MetricsEmit.setEmitMode(MetricEmitMode.EMIT_BOTH);
   *
   * // To use only histograms (post-migration)
   * MetricsEmit.setEmitMode(MetricEmitMode.EMIT_HISTOGRAMS_ONLY);
   * }</pre>
   *
   * @param mode The emission mode to use; must not be null
   * @throws IllegalArgumentException if {@code mode} is null
   */
  public static void setEmitMode(MetricEmitMode mode) {
    if (mode == null) {
      throw new IllegalArgumentException("MetricEmitMode cannot be null");
    }
    currentEmitMode = mode;
  }
+
  /**
   * Returns the current emission mode.
   *
   * @return The emission mode currently in effect
   */
  public static MetricEmitMode getEmitMode() {
    return currentEmitMode;
  }
+
+ /**
+ * Records latency based on the current emit mode setting.
+ *
+ * This helper function supports flexible metric emission during timer→histogram migration. The + * behavior depends on the current emit mode: + * + *
Call .stop() on the returned stopwatch to record the duration. The behavior depends on the + * current emit mode. + * + * @param scope The tally scope to emit metrics to + * @param name The metric name (without suffix) + * @param buckets The histogram bucket configuration to use + * @return A dual stopwatch that records based on emit mode + */ + public static DualStopwatch startLatency(Scope scope, String name, DurationBuckets buckets) { + MetricEmitMode mode = currentEmitMode; + Stopwatch timerSW = null; + Stopwatch histogramSW = null; + + switch (mode) { + case EMIT_TIMERS_ONLY: + timerSW = scope.timer(name).start(); + break; + case EMIT_BOTH: + timerSW = scope.timer(name).start(); + histogramSW = scope.histogram(name, buckets).start(); + break; + case EMIT_HISTOGRAMS_ONLY: + histogramSW = scope.histogram(name, buckets).start(); + break; + } + + return new DualStopwatch(timerSW, histogramSW, mode); + } + + /** + * A stopwatch that emits metrics based on the emit mode setting. + * + *
This supports flexible metric emission during timer→histogram migration. The metrics emitted
+ * depend on the mode captured when the stopwatch was started.
+ */
+ public static class DualStopwatch {
+ private final Stopwatch timerSW;
+ private final Stopwatch histogramSW;
+ private final MetricEmitMode mode;
+
+ DualStopwatch(Stopwatch timerSW, Stopwatch histogramSW, MetricEmitMode mode) {
+ this.timerSW = timerSW;
+ this.histogramSW = histogramSW;
+ this.mode = mode;
+ }
+
+ /** Stops and records the elapsed time based on emit mode setting. */
+ public void stop() {
+ switch (mode) {
+ case EMIT_TIMERS_ONLY:
+ if (timerSW != null) {
+ timerSW.stop();
+ }
+ break;
+ case EMIT_BOTH:
+ if (timerSW != null) {
+ timerSW.stop();
+ }
+ if (histogramSW != null) {
+ histogramSW.stop();
+ }
+ break;
+ case EMIT_HISTOGRAMS_ONLY:
+ if (histogramSW != null) {
+ histogramSW.stop();
+ }
+ break;
+ }
+ }
+ }
+
  private MetricsEmit() {
    // Utility class - prevent instantiation
  }
+}
diff --git a/src/main/java/com/uber/cadence/internal/replay/ReplayDecider.java b/src/main/java/com/uber/cadence/internal/replay/ReplayDecider.java
index debb1c780..08c4a44db 100644
--- a/src/main/java/com/uber/cadence/internal/replay/ReplayDecider.java
+++ b/src/main/java/com/uber/cadence/internal/replay/ReplayDecider.java
@@ -24,6 +24,8 @@
import com.uber.cadence.common.RetryOptions;
import com.uber.cadence.internal.common.OptionsUtils;
import com.uber.cadence.internal.common.RpcRetryer;
+import com.uber.cadence.internal.metrics.HistogramBuckets;
+import com.uber.cadence.internal.metrics.MetricsEmit;
import com.uber.cadence.internal.metrics.MetricsTag;
import com.uber.cadence.internal.metrics.MetricsType;
import com.uber.cadence.internal.replay.HistoryHelper.DecisionEvents;
@@ -35,7 +37,6 @@
import com.uber.cadence.serviceclient.IWorkflowService;
import com.uber.cadence.workflow.Functions;
import com.uber.m3.tally.Scope;
-import com.uber.m3.tally.Stopwatch;
import com.uber.m3.util.ImmutableMap;
import java.time.Duration;
import java.util.ArrayList;
@@ -299,7 +300,8 @@ private void completeWorkflow() {
long nanoTime = TimeUnit.NANOSECONDS.convert(System.currentTimeMillis(), TimeUnit.MILLISECONDS);
com.uber.m3.util.Duration d = com.uber.m3.util.Duration.ofNanos(nanoTime - wfStartTimeNanos);
- metricsScope.timer(MetricsType.WORKFLOW_E2E_LATENCY).record(d);
+ MetricsEmit.emitLatency(
+ metricsScope, MetricsType.WORKFLOW_E2E_LATENCY, d, HistogramBuckets.HIGH_1MS_24H);
}
private void updateTimers() {
@@ -667,7 +669,11 @@ public HistoryEvent next() {
}
metricsScope.counter(MetricsType.WORKFLOW_GET_HISTORY_COUNTER).inc(1);
- Stopwatch sw = metricsScope.timer(MetricsType.WORKFLOW_GET_HISTORY_LATENCY).start();
+ MetricsEmit.DualStopwatch sw =
+ MetricsEmit.startLatency(
+ metricsScope,
+ MetricsType.WORKFLOW_GET_HISTORY_LATENCY,
+ HistogramBuckets.DEFAULT_1MS_100S);
RetryOptions retryOptions =
new RetryOptions.Builder()
.setExpiration(decisionTaskRemainingTime)
diff --git a/src/main/java/com/uber/cadence/internal/shadowing/ReplayWorkflowActivityImpl.java b/src/main/java/com/uber/cadence/internal/shadowing/ReplayWorkflowActivityImpl.java
index 8b0ea6f95..55ce2dc8a 100644
--- a/src/main/java/com/uber/cadence/internal/shadowing/ReplayWorkflowActivityImpl.java
+++ b/src/main/java/com/uber/cadence/internal/shadowing/ReplayWorkflowActivityImpl.java
@@ -24,6 +24,8 @@
import com.uber.cadence.common.WorkflowExecutionHistory;
import com.uber.cadence.internal.common.RpcRetryer;
import com.uber.cadence.internal.common.WorkflowExecutionUtils;
+import com.uber.cadence.internal.metrics.HistogramBuckets;
+import com.uber.cadence.internal.metrics.MetricsEmit;
import com.uber.cadence.internal.metrics.MetricsType;
import com.uber.cadence.serviceclient.IWorkflowService;
import com.uber.cadence.testing.TestEnvironmentOptions;
@@ -32,7 +34,6 @@
import com.uber.cadence.worker.WorkflowImplementationOptions;
import com.uber.cadence.workflow.Functions;
import com.uber.m3.tally.Scope;
-import com.uber.m3.tally.Stopwatch;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
@@ -197,7 +198,9 @@ protected WorkflowExecutionHistory getFullHistory(String domain, WorkflowExecuti
protected boolean replayWorkflowHistory(
String domain, WorkflowExecution execution, WorkflowExecutionHistory workflowHistory)
throws Exception {
- Stopwatch sw = this.metricsScope.timer(MetricsType.REPLAY_LATENCY).start();
+ MetricsEmit.DualStopwatch sw =
+ MetricsEmit.startLatency(
+ this.metricsScope, MetricsType.REPLAY_LATENCY, HistogramBuckets.DEFAULT_1MS_100S);
try {
worker.replayWorkflowExecution(workflowHistory);
} catch (Exception e) {
diff --git a/src/main/java/com/uber/cadence/internal/worker/ActivityPollTask.java b/src/main/java/com/uber/cadence/internal/worker/ActivityPollTask.java
index 594f62138..f5a608417 100644
--- a/src/main/java/com/uber/cadence/internal/worker/ActivityPollTask.java
+++ b/src/main/java/com/uber/cadence/internal/worker/ActivityPollTask.java
@@ -22,10 +22,11 @@
import com.google.common.collect.ImmutableMap;
import com.uber.cadence.*;
+import com.uber.cadence.internal.metrics.HistogramBuckets;
+import com.uber.cadence.internal.metrics.MetricsEmit;
import com.uber.cadence.internal.metrics.MetricsTag;
import com.uber.cadence.internal.metrics.MetricsType;
import com.uber.cadence.serviceclient.IWorkflowService;
-import com.uber.m3.tally.Stopwatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -47,7 +48,11 @@ public ActivityPollTask(
@Override
protected PollForActivityTaskResponse pollTask() throws CadenceError {
options.getMetricsScope().counter(MetricsType.ACTIVITY_POLL_COUNTER).inc(1);
- Stopwatch sw = options.getMetricsScope().timer(MetricsType.ACTIVITY_POLL_LATENCY).start();
+ MetricsEmit.DualStopwatch sw =
+ MetricsEmit.startLatency(
+ options.getMetricsScope(),
+ MetricsType.ACTIVITY_POLL_LATENCY,
+ HistogramBuckets.DEFAULT_1MS_100S);
PollForActivityTaskRequest pollRequest = new PollForActivityTaskRequest();
pollRequest.setDomain(domain);
pollRequest.setIdentity(options.getIdentity());
diff --git a/src/main/java/com/uber/cadence/internal/worker/ActivityPollTaskBase.java b/src/main/java/com/uber/cadence/internal/worker/ActivityPollTaskBase.java
index bded97746..32d4fcba7 100644
--- a/src/main/java/com/uber/cadence/internal/worker/ActivityPollTaskBase.java
+++ b/src/main/java/com/uber/cadence/internal/worker/ActivityPollTaskBase.java
@@ -19,6 +19,8 @@
import com.uber.cadence.CadenceError;
import com.uber.cadence.PollForActivityTaskResponse;
+import com.uber.cadence.internal.metrics.HistogramBuckets;
+import com.uber.cadence.internal.metrics.MetricsEmit;
import com.uber.cadence.internal.metrics.MetricsTag;
import com.uber.cadence.internal.metrics.MetricsType;
import com.uber.m3.tally.Scope;
@@ -50,11 +52,12 @@ public PollForActivityTaskResponse poll() throws CadenceError {
MetricsTag.WORKFLOW_TYPE,
result.getWorkflowType().getName()));
metricsScope.counter(MetricsType.ACTIVITY_POLL_SUCCEED_COUNTER).inc(1);
- metricsScope
- .timer(MetricsType.ACTIVITY_SCHEDULED_TO_START_LATENCY)
- .record(
- Duration.ofNanos(
- result.getStartedTimestamp() - result.getScheduledTimestampOfThisAttempt()));
+ MetricsEmit.emitLatency(
+ metricsScope,
+ MetricsType.ACTIVITY_SCHEDULED_TO_START_LATENCY,
+ Duration.ofNanos(
+ result.getStartedTimestamp() - result.getScheduledTimestampOfThisAttempt()),
+ HistogramBuckets.HIGH_1MS_24H);
return result;
}
diff --git a/src/main/java/com/uber/cadence/internal/worker/ActivityWorker.java b/src/main/java/com/uber/cadence/internal/worker/ActivityWorker.java
index ecd6287c8..f05651e3a 100644
--- a/src/main/java/com/uber/cadence/internal/worker/ActivityWorker.java
+++ b/src/main/java/com/uber/cadence/internal/worker/ActivityWorker.java
@@ -21,6 +21,8 @@
import com.uber.cadence.context.ContextPropagator;
import com.uber.cadence.internal.common.RpcRetryer;
import com.uber.cadence.internal.logging.LoggerTag;
+import com.uber.cadence.internal.metrics.HistogramBuckets;
+import com.uber.cadence.internal.metrics.MetricsEmit;
import com.uber.cadence.internal.metrics.MetricsTag;
import com.uber.cadence.internal.metrics.MetricsType;
import com.uber.cadence.internal.tracing.TracingPropagator;
@@ -28,7 +30,6 @@
import com.uber.cadence.internal.worker.Poller.PollTask;
import com.uber.cadence.serviceclient.IWorkflowService;
import com.uber.m3.tally.Scope;
-import com.uber.m3.tally.Stopwatch;
import com.uber.m3.util.Duration;
import com.uber.m3.util.ImmutableMap;
import io.opentracing.Span;
@@ -127,11 +128,11 @@ public void handle(PollForActivityTaskResponse task) throws Exception {
MetricsTag.WORKFLOW_TYPE,
task.getWorkflowType().getName()));
- metricsScope
- .timer(MetricsType.ACTIVITY_SCHEDULED_TO_START_LATENCY)
- .record(
- Duration.ofNanos(
- task.getStartedTimestamp() - task.getScheduledTimestampOfThisAttempt()));
+ MetricsEmit.emitLatency(
+ metricsScope,
+ MetricsType.ACTIVITY_SCHEDULED_TO_START_LATENCY,
+ Duration.ofNanos(task.getStartedTimestamp() - task.getScheduledTimestampOfThisAttempt()),
+ HistogramBuckets.HIGH_1MS_24H);
// The following tags are for logging.
MDC.put(LoggerTag.ACTIVITY_ID, task.getActivityId());
@@ -143,25 +144,35 @@ public void handle(PollForActivityTaskResponse task) throws Exception {
propagateContext(task);
Span span = spanFactory.spanForExecuteActivity(task);
try (io.opentracing.Scope scope = tracer.activateSpan(span)) {
- Stopwatch sw = metricsScope.timer(MetricsType.ACTIVITY_EXEC_LATENCY).start();
+ MetricsEmit.DualStopwatch sw =
+ MetricsEmit.startLatency(
+ metricsScope, MetricsType.ACTIVITY_EXEC_LATENCY, HistogramBuckets.HIGH_1MS_24H);
ActivityTaskHandler.Result response = handler.handle(task, metricsScope, false);
sw.stop();
- sw = metricsScope.timer(MetricsType.ACTIVITY_RESP_LATENCY).start();
+ sw =
+ MetricsEmit.startLatency(
+ metricsScope, MetricsType.ACTIVITY_RESP_LATENCY, HistogramBuckets.DEFAULT_1MS_100S);
sendReply(task, response, metricsScope);
sw.stop();
long nanoTime =
TimeUnit.NANOSECONDS.convert(System.currentTimeMillis(), TimeUnit.MILLISECONDS);
Duration duration = Duration.ofNanos(nanoTime - task.getScheduledTimestampOfThisAttempt());
- metricsScope.timer(MetricsType.ACTIVITY_E2E_LATENCY).record(duration);
+ MetricsEmit.emitLatency(
+ metricsScope,
+ MetricsType.ACTIVITY_E2E_LATENCY,
+ duration,
+ HistogramBuckets.HIGH_1MS_24H);
} catch (CancellationException e) {
RespondActivityTaskCanceledRequest cancelledRequest =
new RespondActivityTaskCanceledRequest();
cancelledRequest.setDetails(
String.valueOf(e.getMessage()).getBytes(StandardCharsets.UTF_8));
- Stopwatch sw = metricsScope.timer(MetricsType.ACTIVITY_RESP_LATENCY).start();
+ MetricsEmit.DualStopwatch sw =
+ MetricsEmit.startLatency(
+ metricsScope, MetricsType.ACTIVITY_RESP_LATENCY, HistogramBuckets.DEFAULT_1MS_100S);
sendReply(task, new Result(null, null, cancelledRequest), metricsScope);
sw.stop();
} finally {
diff --git a/src/main/java/com/uber/cadence/internal/worker/LocalActivityWorker.java b/src/main/java/com/uber/cadence/internal/worker/LocalActivityWorker.java
index 01ffde191..3483e9e68 100644
--- a/src/main/java/com/uber/cadence/internal/worker/LocalActivityWorker.java
+++ b/src/main/java/com/uber/cadence/internal/worker/LocalActivityWorker.java
@@ -24,13 +24,14 @@
import com.uber.cadence.common.RetryOptions;
import com.uber.cadence.context.ContextPropagator;
import com.uber.cadence.internal.common.LocalActivityMarkerData;
+import com.uber.cadence.internal.metrics.HistogramBuckets;
+import com.uber.cadence.internal.metrics.MetricsEmit;
import com.uber.cadence.internal.metrics.MetricsTag;
import com.uber.cadence.internal.metrics.MetricsType;
import com.uber.cadence.internal.replay.ClockDecisionContext;
import com.uber.cadence.internal.replay.ExecuteLocalActivityParameters;
import com.uber.cadence.internal.tracing.TracingPropagator;
import com.uber.m3.tally.Scope;
-import com.uber.m3.tally.Stopwatch;
import com.uber.m3.util.ImmutableMap;
import io.opentracing.Span;
import io.opentracing.Tracer;
@@ -198,7 +199,11 @@ private ActivityTaskHandler.Result handleLocalActivity(Task task) throws Interru
pollTask.setInput(task.params.getInput());
pollTask.setAttempt(task.params.getAttempt());
- Stopwatch sw = metricsScope.timer(MetricsType.LOCAL_ACTIVITY_EXECUTION_LATENCY).start();
+ MetricsEmit.DualStopwatch sw =
+ MetricsEmit.startLatency(
+ metricsScope,
+ MetricsType.LOCAL_ACTIVITY_EXECUTION_LATENCY,
+ HistogramBuckets.DEFAULT_1MS_100S);
ActivityTaskHandler.Result result = handler.handle(pollTask, metricsScope, true);
sw.stop();
result.setAttempt(task.params.getAttempt());
diff --git a/src/main/java/com/uber/cadence/internal/worker/WorkflowPollTask.java b/src/main/java/com/uber/cadence/internal/worker/WorkflowPollTask.java
index 40cec1bed..b90921b20 100644
--- a/src/main/java/com/uber/cadence/internal/worker/WorkflowPollTask.java
+++ b/src/main/java/com/uber/cadence/internal/worker/WorkflowPollTask.java
@@ -22,11 +22,12 @@
import com.uber.cadence.*;
import com.uber.cadence.common.BinaryChecksum;
+import com.uber.cadence.internal.metrics.HistogramBuckets;
+import com.uber.cadence.internal.metrics.MetricsEmit;
import com.uber.cadence.internal.metrics.MetricsTag;
import com.uber.cadence.internal.metrics.MetricsType;
import com.uber.cadence.serviceclient.IWorkflowService;
import com.uber.m3.tally.Scope;
-import com.uber.m3.tally.Stopwatch;
import com.uber.m3.util.Duration;
import com.uber.m3.util.ImmutableMap;
import java.util.Objects;
@@ -61,7 +62,9 @@ final class WorkflowPollTask implements Poller.PollTask