diff --git a/client/templates/metered.tmpl b/client/templates/metered.tmpl
index 9b57880b077..8cd386292bd 100644
--- a/client/templates/metered.tmpl
+++ b/client/templates/metered.tmpl
@@ -1,6 +1,7 @@
 import (
 	"context"
 	"strings"
+	"time"

 	"go.uber.org/yarpc"
 	"github.com/uber/cadence/common/constants"
@@ -44,9 +45,11 @@ func (c *{{$decorator}}) {{$method.Declaration}} {
 		c.emitForwardedFromStats(scope, {{(index $method.Params 1).Name}})
 	{{ end }}

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	{{$method.ResultsNames}} = c.client.{{$method.Call}}
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
diff --git a/client/wrappers/metered/admin_generated.go b/client/wrappers/metered/admin_generated.go
index 10e9d28e873..b80eefebc5c 100644
--- a/client/wrappers/metered/admin_generated.go
+++ b/client/wrappers/metered/admin_generated.go
@@ -6,6 +6,7 @@ package metered

 import (
 	"context"
+	"time"

 	"go.uber.org/yarpc"

@@ -40,9 +41,11 @@ func (c *adminClient) AddSearchAttribute(ctx context.Context, ap1 *types.AddSear

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.AddSearchAttribute(ctx, ap1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -62,9 +65,11 @@ func (c *adminClient) CloseShard(ctx context.Context, cp1 *types.CloseShardReque

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.CloseShard(ctx, cp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -84,9 +89,11 @@ func (c *adminClient) CountDLQMessages(ctx context.Context, cp1 *types.CountDLQM

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	cp2, err = c.client.CountDLQMessages(ctx, cp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -106,9 +113,11 @@ func (c *adminClient) DeleteWorkflow(ctx context.Context, ap1 *types.AdminDelete

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	ap2, err = c.client.DeleteWorkflow(ctx, ap1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -128,9 +137,11 @@ func (c *adminClient) DescribeCluster(ctx context.Context, p1 ...yarpc.CallOptio

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp1, err = c.client.DescribeCluster(ctx, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -150,9 +161,11 @@ func (c *adminClient) DescribeHistoryHost(ctx context.Context, dp1 *types.Descri

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp2, err = c.client.DescribeHistoryHost(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -172,9 +185,11 @@ func (c *adminClient) DescribeQueue(ctx context.Context, dp1 *types.DescribeQueu

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp2, err = c.client.DescribeQueue(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -194,9 +209,11 @@ func (c *adminClient) DescribeShardDistribution(ctx context.Context, dp1 *types.

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp2, err = c.client.DescribeShardDistribution(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -216,9 +233,11 @@ func (c *adminClient) DescribeWorkflowExecution(ctx context.Context, ap1 *types.

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	ap2, err = c.client.DescribeWorkflowExecution(ctx, ap1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -238,9 +257,11 @@ func (c *adminClient) GetDLQReplicationMessages(ctx context.Context, gp1 *types.

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp2, err = c.client.GetDLQReplicationMessages(ctx, gp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -260,9 +281,11 @@ func (c *adminClient) GetDomainAsyncWorkflowConfiguraton(ctx context.Context, re

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp1, err = c.client.GetDomainAsyncWorkflowConfiguraton(ctx, request, opts...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -282,9 +305,11 @@ func (c *adminClient) GetDomainIsolationGroups(ctx context.Context, request *typ

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp1, err = c.client.GetDomainIsolationGroups(ctx, request, opts...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -304,9 +329,11 @@ func (c *adminClient) GetDomainReplicationMessages(ctx context.Context, gp1 *typ

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp2, err = c.client.GetDomainReplicationMessages(ctx, gp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -326,9 +353,11 @@ func (c *adminClient) GetDynamicConfig(ctx context.Context, gp1 *types.GetDynami

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp2, err = c.client.GetDynamicConfig(ctx, gp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -348,9 +377,11 @@ func (c *adminClient) GetGlobalIsolationGroups(ctx context.Context, request *typ

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp1, err = c.client.GetGlobalIsolationGroups(ctx, request, opts...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -370,9 +401,11 @@ func (c *adminClient) GetReplicationMessages(ctx context.Context, gp1 *types.Get

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp2, err = c.client.GetReplicationMessages(ctx, gp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -392,9 +425,11 @@ func (c *adminClient) GetWorkflowExecutionRawHistoryV2(ctx context.Context, gp1

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp2, err = c.client.GetWorkflowExecutionRawHistoryV2(ctx, gp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -414,9 +449,11 @@ func (c *adminClient) ListDynamicConfig(ctx context.Context, lp1 *types.ListDyna

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	lp2, err = c.client.ListDynamicConfig(ctx, lp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -436,9 +473,11 @@ func (c *adminClient) MaintainCorruptWorkflow(ctx context.Context, ap1 *types.Ad

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	ap2, err = c.client.MaintainCorruptWorkflow(ctx, ap1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -458,9 +497,11 @@ func (c *adminClient) MergeDLQMessages(ctx context.Context, mp1 *types.MergeDLQM

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	mp2, err = c.client.MergeDLQMessages(ctx, mp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -480,9 +521,11 @@ func (c *adminClient) PurgeDLQMessages(ctx context.Context, pp1 *types.PurgeDLQM

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.PurgeDLQMessages(ctx, pp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -502,9 +545,11 @@ func (c *adminClient) ReadDLQMessages(ctx context.Context, rp1 *types.ReadDLQMes

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp2, err = c.client.ReadDLQMessages(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -524,9 +569,11 @@ func (c *adminClient) ReapplyEvents(ctx context.Context, rp1 *types.ReapplyEvent

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.ReapplyEvents(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -546,9 +593,11 @@ func (c *adminClient) RefreshWorkflowTasks(ctx context.Context, rp1 *types.Refre

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RefreshWorkflowTasks(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -568,9 +617,11 @@ func (c *adminClient) RemoveTask(ctx context.Context, rp1 *types.RemoveTaskReque

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RemoveTask(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -590,9 +641,11 @@ func (c *adminClient) ResendReplicationTasks(ctx context.Context, rp1 *types.Res

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.ResendReplicationTasks(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -612,9 +665,11 @@ func (c *adminClient) ResetQueue(ctx context.Context, rp1 *types.ResetQueueReque

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.ResetQueue(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -634,9 +689,11 @@ func (c *adminClient) RestoreDynamicConfig(ctx context.Context, rp1 *types.Resto

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RestoreDynamicConfig(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -656,9 +713,11 @@ func (c *adminClient) UpdateDomainAsyncWorkflowConfiguraton(ctx context.Context,

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	up1, err = c.client.UpdateDomainAsyncWorkflowConfiguraton(ctx, request, opts...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -678,9 +737,11 @@ func (c *adminClient) UpdateDomainIsolationGroups(ctx context.Context, request *

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	up1, err = c.client.UpdateDomainIsolationGroups(ctx, request, opts...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -700,9 +761,11 @@ func (c *adminClient) UpdateDynamicConfig(ctx context.Context, up1 *types.Update

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.UpdateDynamicConfig(ctx, up1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -722,9 +785,11 @@ func (c *adminClient) UpdateGlobalIsolationGroups(ctx context.Context, request *

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	up1, err = c.client.UpdateGlobalIsolationGroups(ctx, request, opts...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -744,9 +809,11 @@ func (c *adminClient) UpdateTaskListPartitionConfig(ctx context.Context, request

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	up1, err = c.client.UpdateTaskListPartitionConfig(ctx, request, opts...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
diff --git a/client/wrappers/metered/frontend_generated.go b/client/wrappers/metered/frontend_generated.go
index f5062763d6f..ad39b122fb2 100644
--- a/client/wrappers/metered/frontend_generated.go
+++ b/client/wrappers/metered/frontend_generated.go
@@ -6,6 +6,7 @@ package metered

 import (
 	"context"
+	"time"

 	"go.uber.org/yarpc"

@@ -40,9 +41,11 @@ func (c *frontendClient) BackfillSchedule(ctx context.Context, bp1 *types.Backfi

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	bp2, err = c.client.BackfillSchedule(ctx, bp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -62,9 +65,11 @@ func (c *frontendClient) CountWorkflowExecutions(ctx context.Context, cp1 *types

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	cp2, err = c.client.CountWorkflowExecutions(ctx, cp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -84,9 +89,11 @@ func (c *frontendClient) CreateSchedule(ctx context.Context, cp1 *types.CreateSc

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	cp2, err = c.client.CreateSchedule(ctx, cp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -106,9 +113,11 @@ func (c *frontendClient) DeleteDomain(ctx context.Context, dp1 *types.DeleteDoma

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.DeleteDomain(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -128,9 +137,11 @@ func (c *frontendClient) DeleteSchedule(ctx context.Context, dp1 *types.DeleteSc

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp2, err = c.client.DeleteSchedule(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -150,9 +161,11 @@ func (c *frontendClient) DeprecateDomain(ctx context.Context, dp1 *types.Depreca

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.DeprecateDomain(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -172,9 +185,11 @@ func (c *frontendClient) DescribeDomain(ctx context.Context, dp1 *types.Describe

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp2, err = c.client.DescribeDomain(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -194,9 +209,11 @@ func (c *frontendClient) DescribeSchedule(ctx context.Context, dp1 *types.Descri

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp2, err = c.client.DescribeSchedule(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -216,9 +233,11 @@ func (c *frontendClient) DescribeTaskList(ctx context.Context, dp1 *types.Descri

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp2, err = c.client.DescribeTaskList(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -238,9 +257,11 @@ func (c *frontendClient) DescribeWorkflowExecution(ctx context.Context, dp1 *typ

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp2, err = c.client.DescribeWorkflowExecution(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -260,9 +281,11 @@ func (c *frontendClient) DiagnoseWorkflowExecution(ctx context.Context, dp1 *typ

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp2, err = c.client.DiagnoseWorkflowExecution(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -282,9 +305,11 @@ func (c *frontendClient) FailoverDomain(ctx context.Context, fp1 *types.Failover

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	fp2, err = c.client.FailoverDomain(ctx, fp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -304,9 +329,11 @@ func (c *frontendClient) GetClusterInfo(ctx context.Context, p1 ...yarpc.CallOpt

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	cp1, err = c.client.GetClusterInfo(ctx, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -326,9 +353,11 @@ func (c *frontendClient) GetSearchAttributes(ctx context.Context, p1 ...yarpc.Ca

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp1, err = c.client.GetSearchAttributes(ctx, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -348,9 +377,11 @@ func (c *frontendClient) GetTaskListsByDomain(ctx context.Context, gp1 *types.Ge

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp2, err = c.client.GetTaskListsByDomain(ctx, gp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -370,9 +401,11 @@ func (c *frontendClient) GetWorkflowExecutionHistory(ctx context.Context, gp1 *t

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp2, err = c.client.GetWorkflowExecutionHistory(ctx, gp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -392,9 +425,11 @@ func (c *frontendClient) ListArchivedWorkflowExecutions(ctx context.Context, lp1

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	lp2, err = c.client.ListArchivedWorkflowExecutions(ctx, lp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -414,9 +449,11 @@ func (c *frontendClient) ListClosedWorkflowExecutions(ctx context.Context, lp1 *

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	lp2, err = c.client.ListClosedWorkflowExecutions(ctx, lp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -436,9 +473,11 @@ func (c *frontendClient) ListDomains(ctx context.Context, lp1 *types.ListDomains

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	lp2, err = c.client.ListDomains(ctx, lp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -458,9 +497,11 @@ func (c *frontendClient) ListFailoverHistory(ctx context.Context, lp1 *types.Lis

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	lp2, err = c.client.ListFailoverHistory(ctx, lp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -480,9 +521,11 @@ func (c *frontendClient) ListOpenWorkflowExecutions(ctx context.Context, lp1 *ty

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	lp2, err = c.client.ListOpenWorkflowExecutions(ctx, lp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -502,9 +545,11 @@ func (c *frontendClient) ListSchedules(ctx context.Context, lp1 *types.ListSched

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	lp2, err = c.client.ListSchedules(ctx, lp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -524,9 +569,11 @@ func (c *frontendClient) ListTaskListPartitions(ctx context.Context, lp1 *types.

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	lp2, err = c.client.ListTaskListPartitions(ctx, lp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -546,9 +593,11 @@ func (c *frontendClient) ListWorkflowExecutions(ctx context.Context, lp1 *types.

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	lp2, err = c.client.ListWorkflowExecutions(ctx, lp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -568,9 +617,11 @@ func (c *frontendClient) PauseSchedule(ctx context.Context, pp1 *types.PauseSche

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	pp2, err = c.client.PauseSchedule(ctx, pp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -590,9 +641,11 @@ func (c *frontendClient) PollForActivityTask(ctx context.Context, pp1 *types.Pol

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	pp2, err = c.client.PollForActivityTask(ctx, pp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -612,9 +665,11 @@ func (c *frontendClient) PollForDecisionTask(ctx context.Context, pp1 *types.Pol

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	pp2, err = c.client.PollForDecisionTask(ctx, pp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -634,9 +689,11 @@ func (c *frontendClient) QueryWorkflow(ctx context.Context, qp1 *types.QueryWork

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	qp2, err = c.client.QueryWorkflow(ctx, qp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -656,9 +713,11 @@ func (c *frontendClient) RecordActivityTaskHeartbeat(ctx context.Context, rp1 *t

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp2, err = c.client.RecordActivityTaskHeartbeat(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -678,9 +737,11 @@ func (c *frontendClient) RecordActivityTaskHeartbeatByID(ctx context.Context, rp

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp2, err = c.client.RecordActivityTaskHeartbeatByID(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -700,9 +761,11 @@ func (c *frontendClient) RefreshWorkflowTasks(ctx context.Context, rp1 *types.Re

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RefreshWorkflowTasks(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -722,9 +785,11 @@ func (c *frontendClient) RegisterDomain(ctx context.Context, rp1 *types.Register

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RegisterDomain(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -744,9 +809,11 @@ func (c *frontendClient) RequestCancelWorkflowExecution(ctx context.Context, rp1

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RequestCancelWorkflowExecution(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -766,9 +833,11 @@ func (c *frontendClient) ResetStickyTaskList(ctx context.Context, rp1 *types.Res

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp2, err = c.client.ResetStickyTaskList(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -788,9 +857,11 @@ func (c *frontendClient) ResetWorkflowExecution(ctx context.Context, rp1 *types.

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp2, err = c.client.ResetWorkflowExecution(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -810,9 +881,11 @@ func (c *frontendClient) RespondActivityTaskCanceled(ctx context.Context, rp1 *t

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RespondActivityTaskCanceled(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -832,9 +905,11 @@ func (c *frontendClient) RespondActivityTaskCanceledByID(ctx context.Context, rp

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RespondActivityTaskCanceledByID(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -854,9 +929,11 @@ func (c *frontendClient) RespondActivityTaskCompleted(ctx context.Context, rp1 *

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RespondActivityTaskCompleted(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -876,9 +953,11 @@ func (c *frontendClient) RespondActivityTaskCompletedByID(ctx context.Context, r

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RespondActivityTaskCompletedByID(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -898,9 +977,11 @@ func (c *frontendClient) RespondActivityTaskFailed(ctx context.Context, rp1 *typ

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RespondActivityTaskFailed(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -920,9 +1001,11 @@ func (c *frontendClient) RespondActivityTaskFailedByID(ctx context.Context, rp1

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RespondActivityTaskFailedByID(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -942,9 +1025,11 @@ func (c *frontendClient) RespondDecisionTaskCompleted(ctx context.Context, rp1 *

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp2, err = c.client.RespondDecisionTaskCompleted(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -964,9 +1049,11 @@ func (c *frontendClient) RespondDecisionTaskFailed(ctx context.Context, rp1 *typ

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RespondDecisionTaskFailed(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -986,9 +1073,11 @@ func (c *frontendClient) RespondQueryTaskCompleted(ctx context.Context, rp1 *typ

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RespondQueryTaskCompleted(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -1008,9 +1097,11 @@ func (c *frontendClient) RestartWorkflowExecution(ctx context.Context, rp1 *type

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp2, err = c.client.RestartWorkflowExecution(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -1030,9 +1121,11 @@ func (c *frontendClient) ScanWorkflowExecutions(ctx context.Context, lp1 *types.

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	lp2, err = c.client.ScanWorkflowExecutions(ctx, lp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -1052,9 +1145,11 @@ func (c *frontendClient) SignalWithStartWorkflowExecution(ctx context.Context, s

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	sp2, err = c.client.SignalWithStartWorkflowExecution(ctx, sp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -1074,9 +1169,11 @@ func (c *frontendClient) SignalWithStartWorkflowExecutionAsync(ctx context.Conte

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	sp2, err = c.client.SignalWithStartWorkflowExecutionAsync(ctx, sp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -1096,9 +1193,11 @@ func (c *frontendClient) SignalWorkflowExecution(ctx context.Context, sp1 *types

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.SignalWorkflowExecution(ctx, sp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -1118,9 +1217,11 @@ func (c *frontendClient) StartWorkflowExecution(ctx context.Context, sp1 *types.

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	sp2, err = c.client.StartWorkflowExecution(ctx, sp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -1140,9 +1241,11 @@ func (c *frontendClient) StartWorkflowExecutionAsync(ctx context.Context, sp1 *t

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	sp2, err = c.client.StartWorkflowExecutionAsync(ctx, sp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -1162,9 +1265,11 @@ func (c *frontendClient) TerminateWorkflowExecution(ctx context.Context, tp1 *ty

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.TerminateWorkflowExecution(ctx, tp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -1184,9 +1289,11 @@ func (c *frontendClient) UnpauseSchedule(ctx context.Context, up1 *types.Unpause

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	up2, err = c.client.UnpauseSchedule(ctx, up1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -1206,9 +1313,11 @@ func (c *frontendClient) UpdateDomain(ctx context.Context, up1 *types.UpdateDoma

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	up2, err = c.client.UpdateDomain(ctx, up1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -1228,9 +1337,11 @@ func (c *frontendClient) UpdateSchedule(ctx context.Context, up1 *types.UpdateSc

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	up2, err = c.client.UpdateSchedule(ctx, up1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
diff --git a/client/wrappers/metered/history_generated.go b/client/wrappers/metered/history_generated.go
index ee392ce92e8..a1ca3060a8e 100644
--- a/client/wrappers/metered/history_generated.go
+++ b/client/wrappers/metered/history_generated.go
@@ -6,6 +6,7 @@ package metered

 import (
 	"context"
+	"time"

 	"go.uber.org/yarpc"

@@ -40,9 +41,11 @@ func (c *historyClient) CloseShard(ctx context.Context, cp1 *types.CloseShardReq

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.CloseShard(ctx, cp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -62,9 +65,11 @@ func (c *historyClient) CountDLQMessages(ctx context.Context, cp1 *types.CountDL

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	hp1, err = c.client.CountDLQMessages(ctx, cp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -84,9 +89,11 @@ func (c *historyClient) DescribeHistoryHost(ctx context.Context, dp1 *types.Desc

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp2, err = c.client.DescribeHistoryHost(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -106,9 +113,11 @@ func (c *historyClient) DescribeMutableState(ctx context.Context, dp1 *types.Des

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp2, err = c.client.DescribeMutableState(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -128,9 +137,11 @@ func (c *historyClient) DescribeQueue(ctx context.Context, dp1 *types.DescribeQu

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp2, err = c.client.DescribeQueue(ctx, dp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -150,9 +161,11 @@ func (c *historyClient) DescribeWorkflowExecution(ctx context.Context, hp1 *type

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	dp1, err = c.client.DescribeWorkflowExecution(ctx, hp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -172,9 +185,11 @@ func (c *historyClient) GetCrossClusterTasks(ctx context.Context, gp1 *types.Get

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp2, err = c.client.GetCrossClusterTasks(ctx, gp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -194,9 +209,11 @@ func (c *historyClient) GetDLQReplicationMessages(ctx context.Context, gp1 *type

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp2, err = c.client.GetDLQReplicationMessages(ctx, gp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -216,9 +233,11 @@ func (c *historyClient) GetFailoverInfo(ctx context.Context, gp1 *types.GetFailo

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp2, err = c.client.GetFailoverInfo(ctx, gp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -238,9 +257,11 @@ func (c *historyClient) GetMutableState(ctx context.Context, gp1 *types.GetMutab

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp2, err = c.client.GetMutableState(ctx, gp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -260,9 +281,11 @@ func (c *historyClient) GetReplicationMessages(ctx context.Context, gp1 *types.G

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	gp2, err = c.client.GetReplicationMessages(ctx, gp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -282,9 +305,11 @@ func (c *historyClient) MergeDLQMessages(ctx context.Context, mp1 *types.MergeDL

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	mp2, err = c.client.MergeDLQMessages(ctx, mp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -304,9 +329,11 @@ func (c *historyClient) NotifyFailoverMarkers(ctx context.Context, np1 *types.No

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.NotifyFailoverMarkers(ctx, np1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -326,9 +353,11 @@ func (c *historyClient) PollMutableState(ctx context.Context, pp1 *types.PollMut

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	pp2, err = c.client.PollMutableState(ctx, pp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -348,9 +377,11 @@ func (c *historyClient) PurgeDLQMessages(ctx context.Context, pp1 *types.PurgeDL

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.PurgeDLQMessages(ctx, pp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -370,9 +401,11 @@ func (c *historyClient) QueryWorkflow(ctx context.Context, hp1 *types.HistoryQue

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	hp2, err = c.client.QueryWorkflow(ctx, hp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -392,9 +425,11 @@ func (c *historyClient) RatelimitUpdate(ctx context.Context, request *types.Rate

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp1, err = c.client.RatelimitUpdate(ctx, request, opts...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -414,9 +449,11 @@ func (c *historyClient) ReadDLQMessages(ctx context.Context, rp1 *types.ReadDLQM

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp2, err = c.client.ReadDLQMessages(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -436,9 +473,11 @@ func (c *historyClient) ReapplyEvents(ctx context.Context, hp1 *types.HistoryRea

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.ReapplyEvents(ctx, hp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -458,9 +497,11 @@ func (c *historyClient) RecordActivityTaskHeartbeat(ctx context.Context, hp1 *ty

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp1, err = c.client.RecordActivityTaskHeartbeat(ctx, hp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -480,9 +521,11 @@ func (c *historyClient) RecordActivityTaskStarted(ctx context.Context, rp1 *type

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp2, err = c.client.RecordActivityTaskStarted(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -502,9 +545,11 @@ func (c *historyClient) RecordChildExecutionCompleted(ctx context.Context, rp1 *

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RecordChildExecutionCompleted(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -524,9 +569,11 @@ func (c *historyClient) RecordDecisionTaskStarted(ctx context.Context, rp1 *type

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp2, err = c.client.RecordDecisionTaskStarted(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -546,9 +593,11 @@ func (c *historyClient) RefreshWorkflowTasks(ctx context.Context, hp1 *types.His

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RefreshWorkflowTasks(ctx, hp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -568,9 +617,11 @@ func (c *historyClient) RemoveSignalMutableState(ctx context.Context, rp1 *types

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RemoveSignalMutableState(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -590,9 +641,11 @@ func (c *historyClient) RemoveTask(ctx context.Context, rp1 *types.RemoveTaskReq

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RemoveTask(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -612,9 +665,11 @@ func (c *historyClient) ReplicateEventsV2(ctx context.Context, rp1 *types.Replic

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.ReplicateEventsV2(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -634,9 +689,11 @@ func (c *historyClient) RequestCancelWorkflowExecution(ctx context.Context, hp1

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RequestCancelWorkflowExecution(ctx, hp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -656,9 +713,11 @@ func (c *historyClient) ResetQueue(ctx context.Context, rp1 *types.ResetQueueReq

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.ResetQueue(ctx, rp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -678,9 +737,11 @@ func (c *historyClient) ResetStickyTaskList(ctx context.Context, hp1 *types.Hist

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	hp2, err = c.client.ResetStickyTaskList(ctx, hp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -700,9 +761,11 @@ func (c *historyClient) ResetWorkflowExecution(ctx context.Context, hp1 *types.H

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	rp1, err = c.client.ResetWorkflowExecution(ctx, hp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -722,9 +785,11 @@ func (c *historyClient) RespondActivityTaskCanceled(ctx context.Context, hp1 *ty

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RespondActivityTaskCanceled(ctx, hp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -744,9 +809,11 @@ func (c *historyClient) RespondActivityTaskCompleted(ctx context.Context, hp1 *t

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RespondActivityTaskCompleted(ctx, hp1, p1...)
 	sw.Stop()
+	scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart))

 	if err != nil {
 		scope.IncCounter(metrics.CadenceClientFailures)
@@ -766,9 +833,11 @@ func (c *historyClient) RespondActivityTaskFailed(ctx context.Context, hp1 *type

 	scope.IncCounter(metrics.CadenceClientRequests)

+	clientLatencyStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceClientLatency)
 	err = c.client.RespondActivityTaskFailed(ctx, hp1, p1...)
sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -788,9 +857,11 @@ func (c *historyClient) RespondCrossClusterTasksCompleted(ctx context.Context, r scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) rp2, err = c.client.RespondCrossClusterTasksCompleted(ctx, rp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -810,9 +881,11 @@ func (c *historyClient) RespondDecisionTaskCompleted(ctx context.Context, hp1 *t scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) hp2, err = c.client.RespondDecisionTaskCompleted(ctx, hp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -832,9 +905,11 @@ func (c *historyClient) RespondDecisionTaskFailed(ctx context.Context, hp1 *type scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) err = c.client.RespondDecisionTaskFailed(ctx, hp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -854,9 +929,11 @@ func (c *historyClient) ScheduleDecisionTask(ctx context.Context, sp1 *types.Sch scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) err = c.client.ScheduleDecisionTask(ctx, sp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -876,9 +953,11 @@ func (c *historyClient) SignalWithStartWorkflowExecution(ctx context.Context, hp scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) sp1, err = c.client.SignalWithStartWorkflowExecution(ctx, hp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -898,9 +977,11 @@ func (c *historyClient) SignalWorkflowExecution(ctx context.Context, hp1 *types. scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) err = c.client.SignalWorkflowExecution(ctx, hp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -920,9 +1001,11 @@ func (c *historyClient) StartWorkflowExecution(ctx context.Context, hp1 *types.H scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) sp1, err = c.client.StartWorkflowExecution(ctx, hp1, p1...) 
sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -942,9 +1025,11 @@ func (c *historyClient) SyncActivity(ctx context.Context, sp1 *types.SyncActivit scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) err = c.client.SyncActivity(ctx, sp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -964,9 +1049,11 @@ func (c *historyClient) SyncShardStatus(ctx context.Context, sp1 *types.SyncShar scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) err = c.client.SyncShardStatus(ctx, sp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -986,9 +1073,11 @@ func (c *historyClient) TerminateWorkflowExecution(ctx context.Context, hp1 *typ scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) err = c.client.TerminateWorkflowExecution(ctx, hp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) diff --git a/client/wrappers/metered/matching_generated.go b/client/wrappers/metered/matching_generated.go index 76d0efeb6cc..a26519ccb85 100644 --- a/client/wrappers/metered/matching_generated.go +++ b/client/wrappers/metered/matching_generated.go @@ -7,6 +7,7 @@ package metered import ( "context" "strings" + "time" "go.uber.org/yarpc" @@ -43,9 +44,11 @@ func (c *matchingClient) AddActivityTask(ctx context.Context, ap1 *types.AddActi scope.IncCounter(metrics.CadenceClientRequests) c.emitForwardedFromStats(scope, ap1) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) ap2, err = c.client.AddActivityTask(ctx, ap1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -66,9 +69,11 @@ func (c *matchingClient) AddDecisionTask(ctx context.Context, ap1 *types.AddDeci scope.IncCounter(metrics.CadenceClientRequests) c.emitForwardedFromStats(scope, ap1) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) ap2, err = c.client.AddDecisionTask(ctx, ap1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -89,9 +94,11 @@ func (c *matchingClient) CancelOutstandingPoll(ctx context.Context, cp1 *types.C scope.IncCounter(metrics.CadenceClientRequests) c.emitForwardedFromStats(scope, cp1) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) err = c.client.CancelOutstandingPoll(ctx, cp1, p1...) 
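// Because the timer and the histogram are recorded through the same scope
// value, both series carry identical tags (operation, caller, tasklist, and
// so on), which should let dashboards be ported one-for-one. A sketch of that
// equivalence at the tally level (hypothetical tag names; baseScope, buckets,
// and elapsed are assumed to exist for illustration):
//
//	tagged := baseScope.Tagged(map[string]string{"operation": "AddActivityTask"})
//	tagged.Timer("cadence_client_latency").Record(elapsed)
//	tagged.Histogram("cadence_client_latency_ns", buckets).RecordDuration(elapsed)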
sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -112,9 +119,11 @@ func (c *matchingClient) DescribeTaskList(ctx context.Context, mp1 *types.Matchi scope.IncCounter(metrics.CadenceClientRequests) c.emitForwardedFromStats(scope, mp1) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) dp1, err = c.client.DescribeTaskList(ctx, mp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -135,9 +144,11 @@ func (c *matchingClient) GetTaskListsByDomain(ctx context.Context, gp1 *types.Ge scope.IncCounter(metrics.CadenceClientRequests) c.emitForwardedFromStats(scope, gp1) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) gp2, err = c.client.GetTaskListsByDomain(ctx, gp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -158,9 +169,11 @@ func (c *matchingClient) ListTaskListPartitions(ctx context.Context, mp1 *types. scope.IncCounter(metrics.CadenceClientRequests) c.emitForwardedFromStats(scope, mp1) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) lp1, err = c.client.ListTaskListPartitions(ctx, mp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -181,9 +194,11 @@ func (c *matchingClient) PollForActivityTask(ctx context.Context, mp1 *types.Mat scope.IncCounter(metrics.CadenceClientRequests) c.emitForwardedFromStats(scope, mp1) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) mp2, err = c.client.PollForActivityTask(ctx, mp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -204,9 +219,11 @@ func (c *matchingClient) PollForDecisionTask(ctx context.Context, mp1 *types.Mat scope.IncCounter(metrics.CadenceClientRequests) c.emitForwardedFromStats(scope, mp1) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) mp2, err = c.client.PollForDecisionTask(ctx, mp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -227,9 +244,11 @@ func (c *matchingClient) QueryWorkflow(ctx context.Context, mp1 *types.MatchingQ scope.IncCounter(metrics.CadenceClientRequests) c.emitForwardedFromStats(scope, mp1) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) mp2, err = c.client.QueryWorkflow(ctx, mp1, p1...) 
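// One way to verify a wrapped call emits both series during the migration is
// tally's in-memory TestScope. A sketch, assuming snapshot keys follow tally's
// "name+tags" format (empty tags here) and eliding how the metered client is
// constructed on top of ts:
//
//	ts := tally.NewTestScope("", nil)
//	// ... issue one call through a metered client whose metrics scope wraps ts ...
//	snap := ts.Snapshot()
//	_, timerEmitted := snap.Timers()["cadence_client_latency+"]
//	_, histEmitted := snap.Histograms()["cadence_client_latency_ns+"]
//	// both timerEmitted and histEmitted should be true while double-emitting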
sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -250,9 +269,11 @@ func (c *matchingClient) RefreshTaskListPartitionConfig(ctx context.Context, mp1 scope.IncCounter(metrics.CadenceClientRequests) c.emitForwardedFromStats(scope, mp1) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) mp2, err = c.client.RefreshTaskListPartitionConfig(ctx, mp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -273,9 +294,11 @@ func (c *matchingClient) RespondQueryTaskCompleted(ctx context.Context, mp1 *typ scope.IncCounter(metrics.CadenceClientRequests) c.emitForwardedFromStats(scope, mp1) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) err = c.client.RespondQueryTaskCompleted(ctx, mp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -296,9 +319,11 @@ func (c *matchingClient) UpdateTaskListPartitionConfig(ctx context.Context, mp1 scope.IncCounter(metrics.CadenceClientRequests) c.emitForwardedFromStats(scope, mp1) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) mp2, err = c.client.UpdateTaskListPartitionConfig(ctx, mp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) diff --git a/client/wrappers/metered/sharddistributor_generated.go b/client/wrappers/metered/sharddistributor_generated.go index be0475d9f4a..d8c8028392b 100644 --- a/client/wrappers/metered/sharddistributor_generated.go +++ b/client/wrappers/metered/sharddistributor_generated.go @@ -6,6 +6,7 @@ package metered import ( "context" + "time" "go.uber.org/yarpc" @@ -40,9 +41,11 @@ func (c *sharddistributorClient) GetShardOwner(ctx context.Context, gp1 *types.G scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) gp2, err = c.client.GetShardOwner(ctx, gp1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) @@ -62,9 +65,11 @@ func (c *sharddistributorClient) WatchNamespaceState(ctx context.Context, wp1 *t scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) w1, err = c.client.WatchNamespaceState(ctx, wp1, p1...) 
sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) diff --git a/client/wrappers/metered/sharddistributorexecutor_generated.go b/client/wrappers/metered/sharddistributorexecutor_generated.go index 316cd0bf821..f90394dfc7e 100644 --- a/client/wrappers/metered/sharddistributorexecutor_generated.go +++ b/client/wrappers/metered/sharddistributorexecutor_generated.go @@ -6,6 +6,7 @@ package metered import ( "context" + "time" "go.uber.org/yarpc" @@ -40,9 +41,11 @@ func (c *sharddistributorexecutorClient) Heartbeat(ctx context.Context, ep1 *typ scope.IncCounter(metrics.CadenceClientRequests) + clientLatencyStart := time.Now() sw := scope.StartTimer(metrics.CadenceClientLatency) ep2, err = c.client.Heartbeat(ctx, ep1, p1...) sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(clientLatencyStart)) if err != nil { scope.IncCounter(metrics.CadenceClientFailures) diff --git a/common/asyncworkflow/queue/consumer/default_consumer.go b/common/asyncworkflow/queue/consumer/default_consumer.go index 63a4b22c26d..58a615263b1 100644 --- a/common/asyncworkflow/queue/consumer/default_consumer.go +++ b/common/asyncworkflow/queue/consumer/default_consumer.go @@ -156,8 +156,12 @@ func (c *DefaultConsumer) processMessage(msg messaging.Message) { logger := c.logger.WithTags(tag.Dynamic("partition", msg.Partition()), tag.Dynamic("offset", msg.Offset())) logger.Debug("Received message") + asyncProcessStart := time.Now() sw := c.scope.StartTimer(metrics.AsyncWorkflowProcessMsgLatency) - defer sw.Stop() + defer func() { + sw.Stop() + c.scope.RecordHistogramDuration(metrics.AsyncWorkflowProcessMsgLatencyHistogram, time.Since(asyncProcessStart)) + }() var request sqlblobs.AsyncRequestMessage if err := c.msgDecoder.Decode(msg.Value(), &request); err != nil { diff --git a/common/metrics/config.go b/common/metrics/config.go index 70ce0cc27cd..267a7363c83 100644 --- a/common/metrics/config.go +++ b/common/metrics/config.go @@ -265,6 +265,89 @@ var HistogramMigrationMetrics = map[string]struct{}{ "direct_query_dispatch_non_sticky_latency_ns": {}, "direct_query_dispatch_clear_stickiness_latency": {}, "direct_query_dispatch_clear_stickiness_latency_ns": {}, + "cadence_authorization_latency": {}, + "cadence_authorization_latency_ns": {}, + + "pinot_latency": {}, + "pinot_latency_ns": {}, + "pinot_latency_per_domain": {}, + "pinot_latency_per_domain_ns": {}, + + "sequentialtask_submit_latency": {}, + "sequentialtask_submit_latency_ns": {}, + "sequentialtask_queue_size": {}, + "sequentialtask_queue_size_counts": {}, + "sequentialtask_queue_processing_latency": {}, + "sequentialtask_queue_processing_latency_ns": {}, + "sequentialtask_task_processing_latency": {}, + "sequentialtask_task_processing_latency_ns": {}, + + "prioritytask_submit_latency": {}, + "prioritytask_submit_latency_ns": {}, + + "graceful_failover_latency": {}, + "graceful_failover_latency_ns": {}, + + "async_request_payload_size_per_domain": {}, + "async_request_payload_size_per_domain_counts": {}, + + "task_redispatch_queue_pending_tasks": {}, + "task_redispatch_queue_pending_tasks_counts": {}, + + "workflow_context_lock_latency": {}, + "workflow_context_lock_latency_ns": {}, + + "get_replication_messages_for_shard": {}, + "get_replication_messages_for_shard_ns": {}, + "get_dlq_replication_messages": {}, + "get_dlq_replication_messages_ns": {}, + + "decision_task_query_latency": {}, + 
"decision_task_query_latency_ns": {}, + + "syncmatch_latency_per_tl": {}, + "syncmatch_latency_per_tl_ns": {}, + "asyncmatch_latency_per_tl": {}, + "asyncmatch_latency_per_tl_ns": {}, + + "asyncmatch_local_poll_attempt_per_tl": {}, + "asyncmatch_local_poll_attempt_per_tl_counts": {}, + "asyncmatch_forward_poll_attempt_per_tl": {}, + "asyncmatch_forward_poll_attempt_per_tl_counts": {}, + "asyncmatch_local_poll_after_forward_failed_attempt_per_tl": {}, + "asyncmatch_local_poll_after_forward_failed_attempt_per_tl_counts": {}, + + "poll_local_match_latency_per_tl": {}, + "poll_local_match_latency_per_tl_ns": {}, + "poll_forward_match_latency_per_tl": {}, + "poll_forward_match_latency_per_tl_ns": {}, + "poll_local_match_after_forward_failed_latency_per_tl": {}, + "poll_local_match_after_forward_failed_latency_per_tl_ns": {}, + + "es_processor_process_msg_latency": {}, + "es_processor_process_msg_latency_ns": {}, + "index_processor_process_msg_latency": {}, + "index_processor_process_msg_latency_ns": {}, + + "async_workflow_process_msg_latency": {}, + "async_workflow_process_msg_latency_ns": {}, + "diagnostics_workflow_execution_latency": {}, + "diagnostics_workflow_execution_latency_ns": {}, + + "shard_distributor_latency": {}, + "shard_distributor_latency_ns": {}, + + "global_ratelimiter_update_latency": {}, + "global_ratelimiter_update_latency_ns": {}, + + "cadence_latency": {}, + "cadence_latency_ns": {}, + "cadence_client_latency": {}, + "cadence_client_latency_ns": {}, + "cadence_client_latency_redirection": {}, + "cadence_client_latency_redirection_ns": {}, + "cadence_latency_per_tl": {}, + "cadence_latency_per_tl_ns": {}, } func (h HistogramMigration) EmitTimer(name string) bool { diff --git a/common/metrics/defs.go b/common/metrics/defs.go index 1269e7e6719..a5d48882a50 100644 --- a/common/metrics/defs.go +++ b/common/metrics/defs.go @@ -2282,6 +2282,7 @@ const ( CadenceRequests MetricIdx = iota CadenceFailures CadenceLatency + CadenceLatencyHistogram CadenceErrBadRequestCounter CadenceErrDomainNotActiveCounter CadenceErrServiceBusyCounter @@ -2366,14 +2367,17 @@ const ( CadenceClientRequests CadenceClientFailures CadenceClientLatency + CadenceClientLatencyHistogram CadenceTasklistRequests CadenceDcRedirectionClientRequests CadenceDcRedirectionClientFailures CadenceDcRedirectionClientLatency + CadenceDcRedirectionClientLatencyHistogram CadenceAuthorizationLatency + CadenceAuthorizationLatencyHistogram DomainCachePrepareCallbacksLatency DomainCachePrepareCallbacksLatencyHistogram @@ -2412,11 +2416,13 @@ const ( PinotRequests PinotFailures PinotLatency + PinotLatencyHistogram PinotErrBadRequestCounter PinotErrBusyCounter PinotRequestsPerDomain PinotFailuresPerDomain PinotLatencyPerDomain + PinotLatencyPerDomainHistogram PinotErrBadRequestCounterPerDomain PinotErrBusyCounterPerDomain @@ -2424,9 +2430,13 @@ const ( SequentialTaskSubmitRequestTaskQueueExist SequentialTaskSubmitRequestTaskQueueMissing SequentialTaskSubmitLatency + SequentialTaskSubmitLatencyHistogram SequentialTaskQueueSize + SequentialTaskQueueSizeHistogram SequentialTaskQueueProcessingLatency + SequentialTaskQueueProcessingLatencyHistogram SequentialTaskTaskProcessingLatency + SequentialTaskTaskProcessingLatencyHistogram ParallelTaskSubmitRequest ParallelTaskSubmitLatency @@ -2436,6 +2446,7 @@ const ( PriorityTaskSubmitRequest PriorityTaskSubmitLatency + PriorityTaskSubmitLatencyHistogram KafkaConsumerMessageIn KafkaConsumerMessageAck @@ -2447,6 +2458,7 @@ const ( DescribeWorkflowStatusError GracefulFailoverLatency + 
GracefulFailoverLatencyHistogram GracefulFailoverFailure HistoryArchiverArchiveNonRetryableErrorCount @@ -2481,6 +2493,7 @@ const ( CadenceRequestsPerTaskListWithoutRollup CadenceFailuresPerTaskList CadenceLatencyPerTaskList + CadenceLatencyPerTaskListHistogram CadenceErrBadRequestPerTaskListCounter CadenceErrDomainNotActivePerTaskListCounter CadenceErrServiceBusyPerTaskListCounter @@ -2518,15 +2531,17 @@ const ( HashringViewIdentifier AsyncRequestPayloadSize + AsyncRequestPayloadSizeHistogram // limiter-side metrics GlobalRatelimiterStartupUsageHistogram GlobalRatelimiterFailingUsageHistogram GlobalRatelimiterGlobalUsageHistogram - GlobalRatelimiterUpdateLatency // time spent performing all Update requests, per batch attempt. ideally well below update interval. - GlobalRatelimiterAllowedRequestsCount // per key/type usage - GlobalRatelimiterRejectedRequestsCount // per key/type usage - GlobalRatelimiterQuota // per-global-key quota information, emitted when a key is in us + GlobalRatelimiterUpdateLatency // time spent performing all Update requests, per batch attempt. ideally well below update interval. + GlobalRatelimiterUpdateLatencyHistogram // histogram version + GlobalRatelimiterAllowedRequestsCount // per key/type usage + GlobalRatelimiterRejectedRequestsCount // per key/type usage + GlobalRatelimiterQuota // per-global-key quota information, emitted when a key is in us // aggregator-side metrics GlobalRatelimiterInitialized @@ -2640,6 +2655,7 @@ const ( TaskSchedulerThrottledCounterPerDomain TaskRedispatchQueuePendingTasksTimer + TaskRedispatchQueuePendingTasksHistogram TransferTaskThrottledCounter TimerTaskThrottledCounter @@ -2786,6 +2802,7 @@ const ( AcquireLockFailedCounter WorkflowContextCleared WorkflowContextLockLatency + WorkflowContextLockLatencyHistogram MutableStateSize MutableStateSizeHistogram ExecutionInfoSize @@ -2902,7 +2919,9 @@ const ( ReplicationDLQValidationFailed ReplicationMessageTooLargePerShard GetReplicationMessagesForShardLatency + GetReplicationMessagesForShardLatencyHistogram GetDLQReplicationMessagesLatency + GetDLQReplicationMessagesLatencyHistogram EventReapplySkippedCount DirectQueryDispatchLatency DirectQueryDispatchLatencyHistogram @@ -2917,6 +2936,7 @@ const ( DirectQueryDispatchClearStickinessSuccessCount DirectQueryDispatchTimeoutBeforeNonStickyCount DecisionTaskQueryLatency + DecisionTaskQueryLatencyHistogram ConsistentQueryPerShard ConsistentQueryTimeoutCount QueryBeforeFirstDecisionCount @@ -2941,7 +2961,6 @@ const ( WorkflowRepairTimeout WorkflowRepairDuration FailoverMarkerCount - FailoverMarkerReplicationLatency FailoverMarkerInsertFailure FailoverMarkerNotificationFailure FailoverMarkerUpdateShardFailure @@ -2990,8 +3009,9 @@ const ( BufferIsolationGroupRedirectFailureCounter BufferIsolationGroupMisconfiguredCounter SyncMatchLatencyPerTaskList + SyncMatchLatencyPerTaskListHistogram AsyncMatchLatencyPerTaskList - AsyncMatchDispatchLatencyPerTaskList + AsyncMatchLatencyPerTaskListHistogram AsyncMatchDispatchTimeoutCounterPerTaskList ExpiredTasksPerTaskListCounter ForwardedPerTaskListCounter @@ -3024,16 +3044,22 @@ const ( SyncMatchForwardPollLatencyPerTaskList AsyncMatchLocalPollCounterPerTaskList AsyncMatchLocalPollAttemptPerTaskList + AsyncMatchLocalPollAttemptPerTaskListHistogram AsyncMatchLocalPollLatencyPerTaskList AsyncMatchForwardPollCounterPerTaskList AsyncMatchForwardPollAttemptPerTaskList + AsyncMatchForwardPollAttemptPerTaskListHistogram AsyncMatchForwardPollLatencyPerTaskList 
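// On the DefaultConsumer change earlier in this diff: the move from
// "defer sw.Stop()" to a deferred closure matters because defer evaluates its
// argument expressions immediately. A sketch of the pitfall and the fix, using
// the metric names from the change:
//
//	asyncProcessStart := time.Now()
//	sw := c.scope.StartTimer(metrics.AsyncWorkflowProcessMsgLatency)
//
//	// wrong: time.Since(asyncProcessStart) would be evaluated at the defer
//	// statement itself, recording ~0 instead of the processing time:
//	// defer c.scope.RecordHistogramDuration(metrics.AsyncWorkflowProcessMsgLatencyHistogram, time.Since(asyncProcessStart))
//
//	// right: the closure body runs when processMessage returns:
//	defer func() {
//		sw.Stop()
//		c.scope.RecordHistogramDuration(metrics.AsyncWorkflowProcessMsgLatencyHistogram, time.Since(asyncProcessStart))
//	}()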
AsyncMatchLocalPollAfterForwardFailedCounterPerTaskList AsyncMatchLocalPollAfterForwardFailedAttemptPerTaskList + AsyncMatchLocalPollAfterForwardFailedAttemptPerTaskListHistogram AsyncMatchLocalPollAfterForwardFailedLatencyPerTaskList PollLocalMatchLatencyPerTaskList + PollLocalMatchLatencyPerTaskListHistogram PollForwardMatchLatencyPerTaskList + PollForwardMatchLatencyPerTaskListHistogram PollLocalMatchAfterForwardFailedLatencyPerTaskList + PollLocalMatchAfterForwardFailedLatencyPerTaskListHistogram PollDecisionTaskAlreadyStartedCounterPerTaskList PollActivityTaskAlreadyStartedCounterPerTaskList TaskListReadWritePartitionMismatchGauge @@ -3070,8 +3096,10 @@ const ( ESProcessorFailures ESProcessorCorruptedData ESProcessorProcessMsgLatency + ESProcessorProcessMsgLatencyHistogram IndexProcessorCorruptedData IndexProcessorProcessMsgLatency + IndexProcessorProcessMsgLatencyHistogram ArchiverNonRetryableErrorCount ArchiverStartedCount ArchiverStoppedCount @@ -3138,12 +3166,14 @@ const ( ESAnalyzerNumLongRunningWorkflows AsyncWorkflowConsumerCount AsyncWorkflowProcessMsgLatency + AsyncWorkflowProcessMsgLatencyHistogram AsyncWorkflowFailureCorruptMsgCount AsyncWorkflowFailureByFrontendCount AsyncWorkflowSuccessCount DiagnosticsWorkflowStartedCount DiagnosticsWorkflowSuccess DiagnosticsWorkflowExecutionLatency + DiagnosticsWorkflowExecutionLatencyHistogram // Scheduler worker metrics // SchedulerWorkerActiveGauge is the number of per-domain workers running on this host (host-level) @@ -3187,6 +3217,7 @@ const ( ShardDistributorRequests = iota + NumWorkerMetrics ShardDistributorFailures ShardDistributorLatency + ShardDistributorLatencyHistogram ShardDistributorErrContextTimeoutCounter ShardDistributorErrNamespaceNotFound ShardDistributorErrShardNotFound @@ -3237,6 +3268,7 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ CadenceRequests: {metricName: "cadence_requests", metricType: Counter}, CadenceFailures: {metricName: "cadence_errors", metricType: Counter}, CadenceLatency: {metricName: "cadence_latency", metricType: Timer}, + CadenceLatencyHistogram: {metricName: "cadence_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, CadenceErrBadRequestCounter: {metricName: "cadence_errors_bad_request", metricType: Counter}, CadenceErrDomainNotActiveCounter: {metricName: "cadence_errors_domain_not_active", metricType: Counter}, CadenceErrServiceBusyCounter: {metricName: "cadence_errors_service_busy", metricType: Counter}, @@ -3317,11 +3349,14 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ CadenceClientRequests: {metricName: "cadence_client_requests", metricType: Counter}, CadenceClientFailures: {metricName: "cadence_client_errors", metricType: Counter}, CadenceClientLatency: {metricName: "cadence_client_latency", metricType: Timer}, + CadenceClientLatencyHistogram: {metricName: "cadence_client_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, CadenceTasklistRequests: {metricName: "cadence_tasklist_request", metricType: Counter}, CadenceDcRedirectionClientRequests: {metricName: "cadence_client_requests_redirection", metricType: Counter}, CadenceDcRedirectionClientFailures: {metricName: "cadence_client_errors_redirection", metricType: Counter}, CadenceDcRedirectionClientLatency: {metricName: "cadence_client_latency_redirection", metricType: Timer}, + CadenceDcRedirectionClientLatencyHistogram: {metricName: "cadence_client_latency_redirection_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, CadenceAuthorizationLatency: 
{metricName: "cadence_authorization_latency", metricType: Timer}, + CadenceAuthorizationLatencyHistogram: {metricName: "cadence_authorization_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, DomainCachePrepareCallbacksLatency: {metricName: "domain_cache_prepare_callbacks_latency", metricType: Timer}, DomainCachePrepareCallbacksLatencyHistogram: {metricName: "domain_cache_prepare_callbacks_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, DomainCacheCallbacksLatency: {metricName: "domain_cache_callbacks_latency", metricType: Timer}, @@ -3353,20 +3388,26 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ PinotRequests: {metricName: "pinot_requests", metricType: Counter}, PinotFailures: {metricName: "pinot_errors", metricType: Counter}, PinotLatency: {metricName: "pinot_latency", metricType: Timer}, + PinotLatencyHistogram: {metricName: "pinot_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, PinotErrBadRequestCounter: {metricName: "pinot_errors_bad_request", metricType: Counter}, PinotErrBusyCounter: {metricName: "pinot_errors_busy", metricType: Counter}, PinotRequestsPerDomain: {metricName: "pinot_requests_per_domain", metricRollupName: "pinot_requests", metricType: Counter}, PinotFailuresPerDomain: {metricName: "pinot_errors_per_domain", metricRollupName: "pinot_errors", metricType: Counter}, PinotLatencyPerDomain: {metricName: "pinot_latency_per_domain", metricRollupName: "pinot_latency", metricType: Timer}, + PinotLatencyPerDomainHistogram: {metricName: "pinot_latency_per_domain_ns", metricRollupName: "pinot_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, PinotErrBadRequestCounterPerDomain: {metricName: "pinot_errors_bad_request_per_domain", metricRollupName: "pinot_errors_bad_request", metricType: Counter}, PinotErrBusyCounterPerDomain: {metricName: "pinot_errors_busy_per_domain", metricRollupName: "pinot_errors_busy", metricType: Counter}, SequentialTaskSubmitRequest: {metricName: "sequentialtask_submit_request", metricType: Counter}, SequentialTaskSubmitRequestTaskQueueExist: {metricName: "sequentialtask_submit_request_taskqueue_exist", metricType: Counter}, SequentialTaskSubmitRequestTaskQueueMissing: {metricName: "sequentialtask_submit_request_taskqueue_missing", metricType: Counter}, SequentialTaskSubmitLatency: {metricName: "sequentialtask_submit_latency", metricType: Timer}, + SequentialTaskSubmitLatencyHistogram: {metricName: "sequentialtask_submit_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, SequentialTaskQueueSize: {metricName: "sequentialtask_queue_size", metricType: Timer}, + SequentialTaskQueueSizeHistogram: {metricName: "sequentialtask_queue_size_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k}, SequentialTaskQueueProcessingLatency: {metricName: "sequentialtask_queue_processing_latency", metricType: Timer}, + SequentialTaskQueueProcessingLatencyHistogram: {metricName: "sequentialtask_queue_processing_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, SequentialTaskTaskProcessingLatency: {metricName: "sequentialtask_task_processing_latency", metricType: Timer}, + SequentialTaskTaskProcessingLatencyHistogram: {metricName: "sequentialtask_task_processing_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, ParallelTaskSubmitRequest: {metricName: "paralleltask_submit_request", metricType: Counter}, ParallelTaskSubmitLatency: {metricName: "paralleltask_submit_latency", metricType: Timer}, 
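// The bucket families referenced throughout these definitions (Low1ms100s,
// Mid1ms24h, Mid1To16k, Mid8B16MB) are declared elsewhere in this package; the
// exact boundaries below are an assumption, shown only to illustrate the kind
// of exponential ladder such names suggest, built with tally's bucket helpers:
//
//	low1ms100s := tally.MustMakeExponentialDurationBuckets(time.Millisecond, 2, 18) // 1ms, 2ms, ..., ~131s
//	mid1To16k := tally.MustMakeExponentialValueBuckets(1, 2, 15)                    // 1, 2, ..., 16384
//	mid8B16MB := tally.MustMakeExponentialValueBuckets(8, 2, 22)                    // 8B, 16B, ..., 16MiB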
ParallelTaskSubmitLatencyHistogram: {metricName: "paralleltask_submit_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, @@ -3374,12 +3415,14 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ ParallelTaskTaskProcessingLatencyHistogram: {metricName: "paralleltask_task_processing_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, PriorityTaskSubmitRequest: {metricName: "prioritytask_submit_request", metricType: Counter}, PriorityTaskSubmitLatency: {metricName: "prioritytask_submit_latency", metricType: Timer}, + PriorityTaskSubmitLatencyHistogram: {metricName: "prioritytask_submit_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, KafkaConsumerMessageIn: {metricName: "kafka_consumer_message_in", metricType: Counter}, KafkaConsumerMessageAck: {metricName: "kafka_consumer_message_ack", metricType: Counter}, KafkaConsumerMessageNack: {metricName: "kafka_consumer_message_nack", metricType: Counter}, KafkaConsumerMessageNackDlqErr: {metricName: "kafka_consumer_message_nack_dlq_err", metricType: Counter}, KafkaConsumerSessionStart: {metricName: "kafka_consumer_session_start", metricType: Counter}, GracefulFailoverLatency: {metricName: "graceful_failover_latency", metricType: Timer}, + GracefulFailoverLatencyHistogram: {metricName: "graceful_failover_latency_ns", metricType: Histogram, exponentialBuckets: Mid1ms24h}, GracefulFailoverFailure: {metricName: "graceful_failover_failures", metricType: Counter}, HistoryArchiverArchiveNonRetryableErrorCount: {metricName: "history_archiver_archive_non_retryable_error", metricType: Counter}, @@ -3417,6 +3460,9 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ CadenceLatencyPerTaskList: { metricName: "cadence_latency_per_tl", metricRollupName: "cadence_latency", metricType: Timer, }, + CadenceLatencyPerTaskListHistogram: { + metricName: "cadence_latency_per_tl_ns", metricRollupName: "cadence_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s, + }, CadenceErrBadRequestPerTaskListCounter: { metricName: "cadence_errors_bad_request_per_tl", metricRollupName: "cadence_errors_bad_request", metricType: Counter, }, @@ -3493,15 +3539,17 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ DescribeWorkflowStatusError: {metricName: "describe_wf_error", metricType: Counter}, DescribeWorkflowStatusCount: {metricName: "describe_wf_status", metricType: Counter}, - AsyncRequestPayloadSize: {metricName: "async_request_payload_size_per_domain", metricRollupName: "async_request_payload_size", metricType: Timer}, + AsyncRequestPayloadSize: {metricName: "async_request_payload_size_per_domain", metricRollupName: "async_request_payload_size", metricType: Timer}, + AsyncRequestPayloadSizeHistogram: {metricName: "async_request_payload_size_per_domain_counts", metricRollupName: "async_request_payload_size_counts", metricType: Histogram, intExponentialBuckets: Mid8B16MB}, - GlobalRatelimiterStartupUsageHistogram: {metricName: "global_ratelimiter_startup_usage_histogram", metricType: Histogram, buckets: GlobalRatelimiterUsageHistogram}, - GlobalRatelimiterFailingUsageHistogram: {metricName: "global_ratelimiter_failing_usage_histogram", metricType: Histogram, buckets: GlobalRatelimiterUsageHistogram}, - GlobalRatelimiterGlobalUsageHistogram: {metricName: "global_ratelimiter_global_usage_histogram", metricType: Histogram, buckets: GlobalRatelimiterUsageHistogram}, - GlobalRatelimiterUpdateLatency: {metricName: "global_ratelimiter_update_latency", metricType: Timer}, - 
GlobalRatelimiterAllowedRequestsCount: {metricName: "global_ratelimiter_allowed_requests", metricType: Counter}, - GlobalRatelimiterRejectedRequestsCount: {metricName: "global_ratelimiter_rejected_requests", metricType: Counter}, - GlobalRatelimiterQuota: {metricName: "global_ratelimiter_quota", metricType: Gauge}, + GlobalRatelimiterStartupUsageHistogram: {metricName: "global_ratelimiter_startup_usage_histogram", metricType: Histogram, buckets: GlobalRatelimiterUsageHistogram}, + GlobalRatelimiterFailingUsageHistogram: {metricName: "global_ratelimiter_failing_usage_histogram", metricType: Histogram, buckets: GlobalRatelimiterUsageHistogram}, + GlobalRatelimiterGlobalUsageHistogram: {metricName: "global_ratelimiter_global_usage_histogram", metricType: Histogram, buckets: GlobalRatelimiterUsageHistogram}, + GlobalRatelimiterUpdateLatency: {metricName: "global_ratelimiter_update_latency", metricType: Timer}, + GlobalRatelimiterUpdateLatencyHistogram: {metricName: "global_ratelimiter_update_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, + GlobalRatelimiterAllowedRequestsCount: {metricName: "global_ratelimiter_allowed_requests", metricType: Counter}, + GlobalRatelimiterRejectedRequestsCount: {metricName: "global_ratelimiter_rejected_requests", metricType: Counter}, + GlobalRatelimiterQuota: {metricName: "global_ratelimiter_quota", metricType: Gauge}, GlobalRatelimiterInitialized: {metricName: "global_ratelimiter_initialized", metricType: Histogram, buckets: GlobalRatelimiterUsageHistogram}, GlobalRatelimiterReinitialized: {metricName: "global_ratelimiter_reinitialized", metricType: Histogram, buckets: GlobalRatelimiterUsageHistogram}, @@ -3614,6 +3662,7 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ TaskBatchCompleteCounter: {metricName: "task_batch_complete_counter", metricType: Counter}, TaskBatchCompleteFailure: {metricName: "task_batch_complete_error", metricType: Counter}, TaskRedispatchQueuePendingTasksTimer: {metricName: "task_redispatch_queue_pending_tasks", metricType: Timer}, + TaskRedispatchQueuePendingTasksHistogram: {metricName: "task_redispatch_queue_pending_tasks_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k}, TransferTaskThrottledCounter: {metricName: "transfer_task_throttled_counter", metricType: Counter}, TimerTaskThrottledCounter: {metricName: "timer_task_throttled_counter", metricType: Counter}, CrossClusterTaskThrottledCounter: {metricName: "cross_cluster_task_throttled_counter", metricType: Counter}, @@ -3754,6 +3803,7 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ AcquireLockFailedCounter: {metricName: "acquire_lock_failed", metricType: Counter}, WorkflowContextCleared: {metricName: "workflow_context_cleared", metricType: Counter}, WorkflowContextLockLatency: {metricName: "workflow_context_lock_latency", metricType: Timer}, + WorkflowContextLockLatencyHistogram: {metricName: "workflow_context_lock_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, MutableStateSize: {metricName: "mutable_state_size", metricType: Timer}, MutableStateSizeHistogram: {metricName: "mutable_state_size_counts", metricType: Histogram, intExponentialBuckets: Mid8B16MB}, ExecutionInfoSize: {metricName: "execution_info_size", metricType: Timer}, @@ -3860,7 +3910,9 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ ReplicationDLQValidationFailed: {metricName: "replication_dlq_validation_failed", metricType: Counter}, ReplicationMessageTooLargePerShard: {metricName: 
"replication_message_too_large_per_shard", metricType: Counter}, GetReplicationMessagesForShardLatency: {metricName: "get_replication_messages_for_shard", metricType: Timer}, + GetReplicationMessagesForShardLatencyHistogram: {metricName: "get_replication_messages_for_shard_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, GetDLQReplicationMessagesLatency: {metricName: "get_dlq_replication_messages", metricType: Timer}, + GetDLQReplicationMessagesLatencyHistogram: {metricName: "get_dlq_replication_messages_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, EventReapplySkippedCount: {metricName: "event_reapply_skipped_count", metricType: Counter}, DirectQueryDispatchLatency: {metricName: "direct_query_dispatch_latency", metricType: Timer}, DirectQueryDispatchLatencyHistogram: {metricName: "direct_query_dispatch_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, @@ -3875,6 +3927,7 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ DirectQueryDispatchClearStickinessSuccessCount: {metricName: "direct_query_dispatch_clear_stickiness_success", metricType: Counter}, DirectQueryDispatchTimeoutBeforeNonStickyCount: {metricName: "direct_query_dispatch_timeout_before_non_sticky", metricType: Counter}, DecisionTaskQueryLatency: {metricName: "decision_task_query_latency", metricType: Timer}, + DecisionTaskQueryLatencyHistogram: {metricName: "decision_task_query_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, ConsistentQueryPerShard: {metricName: "consistent_query_per_shard", metricType: Counter}, ConsistentQueryTimeoutCount: {metricName: "consistent_query_timeout", metricType: Counter}, QueryBeforeFirstDecisionCount: {metricName: "query_before_first_decision", metricType: Counter}, @@ -3899,7 +3952,6 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ WorkflowRepairTimeout: {metricName: "workflow_repair_timeout", metricType: Counter}, WorkflowRepairDuration: {metricName: "workflow_repair_duration_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, FailoverMarkerCount: {metricName: "failover_marker_count", metricType: Counter}, - FailoverMarkerReplicationLatency: {metricName: "failover_marker_replication_latency", metricType: Timer}, FailoverMarkerInsertFailure: {metricName: "failover_marker_insert_failures", metricType: Counter}, FailoverMarkerNotificationFailure: {metricName: "failover_marker_notification_failures", metricType: Counter}, FailoverMarkerUpdateShardFailure: {metricName: "failover_marker_update_shard_failures", metricType: Counter}, @@ -3930,84 +3982,91 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ VirtualQueueRunningGauge: {metricName: "virtual_queue_running", metricType: Gauge}, }, Matching: { - PollSuccessPerTaskListCounter: {metricName: "poll_success_per_tl", metricRollupName: "poll_success"}, - PollTimeoutPerTaskListCounter: {metricName: "poll_timeouts_per_tl", metricRollupName: "poll_timeouts"}, - PollSuccessWithSyncPerTaskListCounter: {metricName: "poll_success_sync_per_tl", metricRollupName: "poll_success_sync"}, - LeaseRequestPerTaskListCounter: {metricName: "lease_requests_per_tl", metricRollupName: "lease_requests"}, - LeaseFailurePerTaskListCounter: {metricName: "lease_failures_per_tl", metricRollupName: "lease_failures"}, - ConditionFailedErrorPerTaskListCounter: {metricName: "condition_failed_errors_per_tl", metricRollupName: "condition_failed_errors"}, - RespondQueryTaskFailedPerTaskListCounter: {metricName: "respond_query_failed_per_tl", metricRollupName: 
"respond_query_failed"}, - SyncThrottlePerTaskListCounter: {metricName: "sync_throttle_count_per_tl", metricRollupName: "sync_throttle_count"}, - BufferThrottlePerTaskListCounter: {metricName: "buffer_throttle_count_per_tl", metricRollupName: "buffer_throttle_count"}, - BufferUnknownTaskDispatchError: {metricName: "buffer_unknown_task_dispatch_error_per_tl", metricRollupName: "buffer_unknown_task_dispatch_error"}, - BufferIsolationGroupRedirectCounter: {metricName: "buffer_isolation_group_redirected_per_tl", metricRollupName: "buffer_isolation_group_redirected"}, - BufferIsolationGroupRedirectFailureCounter: {metricName: "buffer_isolation_group_redirect_failure_per_tl", metricRollupName: "buffer_isolation_group_redirect_failure"}, - BufferIsolationGroupMisconfiguredCounter: {metricName: "buffer_isolation_group_misconfigured_failure_per_tl", metricRollupName: "buffer_isolation_group_misconfigured_failure"}, - ExpiredTasksPerTaskListCounter: {metricName: "tasks_expired_per_tl", metricRollupName: "tasks_expired"}, - ForwardedPerTaskListCounter: {metricName: "forwarded_per_tl", metricRollupName: "forwarded"}, - ForwardTaskCallsPerTaskList: {metricName: "forward_task_calls_per_tl", metricRollupName: "forward_task_calls"}, - ForwardTaskErrorsPerTaskList: {metricName: "forward_task_errors_per_tl", metricRollupName: "forward_task_errors"}, - SyncMatchForwardTaskThrottleErrorPerTasklist: {metricName: "sync_forward_task_throttle_errors_per_tl", metricRollupName: "sync_forward_task_throttle_errors"}, - AsyncMatchForwardTaskThrottleErrorPerTasklist: {metricName: "async_forward_task_throttle_errors_per_tl", metricRollupName: "async_forward_task_throttle_errors"}, - ForwardQueryCallsPerTaskList: {metricName: "forward_query_calls_per_tl", metricRollupName: "forward_query_calls"}, - ForwardQueryErrorsPerTaskList: {metricName: "forward_query_errors_per_tl", metricRollupName: "forward_query_errors"}, - ForwardPollCallsPerTaskList: {metricName: "forward_poll_calls_per_tl", metricRollupName: "forward_poll_calls"}, - ForwardPollErrorsPerTaskList: {metricName: "forward_poll_errors_per_tl", metricRollupName: "forward_poll_errors"}, - SyncMatchLatencyPerTaskList: {metricName: "syncmatch_latency_per_tl", metricRollupName: "syncmatch_latency", metricType: Timer}, - AsyncMatchLatencyPerTaskList: {metricName: "asyncmatch_latency_per_tl", metricRollupName: "asyncmatch_latency", metricType: Timer}, - AsyncMatchDispatchLatencyPerTaskList: {metricName: "asyncmatch_dispatch_latency_per_tl", metricRollupName: "asyncmatch_dispatch_latency", metricType: Timer}, - AsyncMatchDispatchTimeoutCounterPerTaskList: {metricName: "asyncmatch_dispatch_timeouts_per_tl", metricRollupName: "asyncmatch_dispatch_timeouts"}, - ForwardTaskLatencyPerTaskList: {metricName: "forward_task_latency_per_tl", metricRollupName: "forward_task_latency"}, - ForwardQueryLatencyPerTaskList: {metricName: "forward_query_latency_per_tl", metricRollupName: "forward_query_latency"}, - ForwardPollLatencyPerTaskList: {metricName: "forward_poll_latency_per_tl", metricRollupName: "forward_poll_latency"}, - LocalToLocalMatchPerTaskListCounter: {metricName: "local_to_local_matches_per_tl", metricRollupName: "local_to_local_matches"}, - LocalToRemoteMatchPerTaskListCounter: {metricName: "local_to_remote_matches_per_tl", metricRollupName: "local_to_remote_matches"}, - RemoteToLocalMatchPerTaskListCounter: {metricName: "remote_to_local_matches_per_tl", metricRollupName: "remote_to_local_matches"}, - RemoteToRemoteMatchPerTaskListCounter: {metricName: 
"remote_to_remote_matches_per_tl", metricRollupName: "remote_to_remote_matches"}, - IsolationTaskMatchPerTaskListCounter: {metricName: "isolation_task_matches_per_tl", metricType: Counter}, - IsolationSuccessPerTaskListCounter: {metricName: "isolation_success_per_tl", metricRollupName: "isolation_success"}, - PollerPerTaskListCounter: {metricName: "poller_count_per_tl", metricRollupName: "poller_count"}, - PollerInvalidIsolationGroupCounter: {metricName: "poller_invalid_isolation_group_per_tl", metricType: Counter}, - TaskListPartitionUpdateFailedCounter: {metricName: "tasklist_partition_update_failed_per_tl", metricType: Counter}, - TaskListManagersGauge: {metricName: "tasklist_managers", metricType: Gauge}, - TaskLagPerTaskListGauge: {metricName: "task_lag_per_tl", metricType: Gauge}, - TaskBacklogPerTaskListGauge: {metricName: "task_backlog_per_tl", metricType: Gauge}, - TaskCountPerTaskListGauge: {metricName: "task_count_per_tl", metricType: Gauge}, - RateLimitPerTaskListGauge: {metricName: "rate_limit_per_tl", metricType: Gauge}, - SyncMatchLocalPollLatencyPerTaskList: {metricName: "syncmatch_local_poll_latency_per_tl", metricRollupName: "syncmatch_local_poll_latency"}, - SyncMatchForwardPollLatencyPerTaskList: {metricName: "syncmatch_forward_poll_latency_per_tl", metricRollupName: "syncmatch_forward_poll_latency"}, - AsyncMatchLocalPollCounterPerTaskList: {metricName: "asyncmatch_local_poll_per_tl", metricRollupName: "asyncmatch_local_poll"}, - AsyncMatchLocalPollAttemptPerTaskList: {metricName: "asyncmatch_local_poll_attempt_per_tl", metricRollupName: "asyncmatch_local_poll_attempt", metricType: Timer}, - AsyncMatchLocalPollLatencyPerTaskList: {metricName: "asyncmatch_local_poll_latency_per_tl", metricRollupName: "asyncmatch_local_poll_latency"}, - AsyncMatchForwardPollCounterPerTaskList: {metricName: "asyncmatch_forward_poll_per_tl", metricRollupName: "asyncmatch_forward_poll"}, - AsyncMatchForwardPollAttemptPerTaskList: {metricName: "asyncmatch_forward_poll_attempt_per_tl", metricRollupName: "asyncmatch_forward_poll_attempt", metricType: Timer}, - AsyncMatchForwardPollLatencyPerTaskList: {metricName: "asyncmatch_forward_poll_latency_per_tl", metricRollupName: "asyncmatch_forward_poll_latency"}, - AsyncMatchLocalPollAfterForwardFailedCounterPerTaskList: {metricName: "asyncmatch_local_poll_after_forward_failed_per_tl", metricRollupName: "asyncmatch_local_poll_after_forward_failed"}, - AsyncMatchLocalPollAfterForwardFailedAttemptPerTaskList: {metricName: "asyncmatch_local_poll_after_forward_failed_attempt_per_tl", metricRollupName: "asyncmatch_local_poll_after_forward_failed_attempt", metricType: Timer}, - AsyncMatchLocalPollAfterForwardFailedLatencyPerTaskList: {metricName: "asyncmatch_local_poll_after_forward_failed_latency_per_tl", metricRollupName: "asyncmatch_local_poll_after_forward_failed_latency"}, - PollLocalMatchLatencyPerTaskList: {metricName: "poll_local_match_latency_per_tl", metricRollupName: "poll_local_match_latency", metricType: Timer}, - PollForwardMatchLatencyPerTaskList: {metricName: "poll_forward_match_latency_per_tl", metricRollupName: "poll_forward_match_latency", metricType: Timer}, - PollLocalMatchAfterForwardFailedLatencyPerTaskList: {metricName: "poll_local_match_after_forward_failed_latency_per_tl", metricRollupName: "poll_local_match_after_forward_failed_latency", metricType: Timer}, - PollDecisionTaskAlreadyStartedCounterPerTaskList: {metricName: "poll_decision_task_already_started_per_tl", metricType: Counter}, - 
PollActivityTaskAlreadyStartedCounterPerTaskList: {metricName: "poll_activity_task_already_started_per_tl", metricType: Counter}, - TaskListReadWritePartitionMismatchGauge: {metricName: "tasklist_read_write_partition_mismatch", metricType: Gauge}, - TaskListPollerPartitionMismatchGauge: {metricName: "tasklist_poller_partition_mismatch", metricType: Gauge}, - EstimatedAddTaskQPSGauge: {metricName: "estimated_add_task_qps_per_tl", metricType: Gauge}, - TaskListPartitionUpscaleThresholdGauge: {metricName: "tasklist_partition_upscale_threshold", metricType: Gauge}, - TaskListPartitionDownscaleThresholdGauge: {metricName: "tasklist_partition_downscale_threshold", metricType: Gauge}, - StandbyClusterTasksCompletedCounterPerTaskList: {metricName: "standby_cluster_tasks_completed_per_tl", metricType: Counter}, - StandbyClusterTasksNotStartedCounterPerTaskList: {metricName: "standby_cluster_tasks_not_started_per_tl", metricType: Counter}, - StandbyClusterTasksCompletionFailurePerTaskList: {metricName: "standby_cluster_tasks_completion_failure_per_tl", metricType: Counter}, - TaskIsolationLeakPerTaskList: {metricName: "task_isolation_leak_per_tl", metricRollupName: "task_isolation_leak"}, - PartitionUpscale: {metricName: "partition_upscale_per_tl", metricRollupName: "partition_upscale"}, - PartitionDownscale: {metricName: "partition_downscale_per_tl", metricRollupName: "partition_downscale"}, - PartitionDrained: {metricName: "partition_drained_per_tl", metricRollupName: "partition_drained"}, - IsolationRebalance: {metricName: "isolation_rebalance_per_tl", metricRollupName: "isolation_rebalance"}, - IsolationGroupStartedPolling: {metricName: "ig_started_polling_per_tl", metricRollupName: "ig_started_polling"}, - IsolationGroupStoppedPolling: {metricName: "ig_stopped_polling_per_tl", metricRollupName: "ig_stopped_polling"}, - IsolationGroupUpscale: {metricName: "ig_upscale_per_tl", metricRollupName: "ig_upscale"}, - IsolationGroupDownscale: {metricName: "ig_downscale_per_tl", metricRollupName: "ig_downscale"}, - IsolationGroupPartitionsGauge: {metricName: "ig_partitions_per_tl", metricType: Gauge}, + PollSuccessPerTaskListCounter: {metricName: "poll_success_per_tl", metricRollupName: "poll_success"}, + PollTimeoutPerTaskListCounter: {metricName: "poll_timeouts_per_tl", metricRollupName: "poll_timeouts"}, + PollSuccessWithSyncPerTaskListCounter: {metricName: "poll_success_sync_per_tl", metricRollupName: "poll_success_sync"}, + LeaseRequestPerTaskListCounter: {metricName: "lease_requests_per_tl", metricRollupName: "lease_requests"}, + LeaseFailurePerTaskListCounter: {metricName: "lease_failures_per_tl", metricRollupName: "lease_failures"}, + ConditionFailedErrorPerTaskListCounter: {metricName: "condition_failed_errors_per_tl", metricRollupName: "condition_failed_errors"}, + RespondQueryTaskFailedPerTaskListCounter: {metricName: "respond_query_failed_per_tl", metricRollupName: "respond_query_failed"}, + SyncThrottlePerTaskListCounter: {metricName: "sync_throttle_count_per_tl", metricRollupName: "sync_throttle_count"}, + BufferThrottlePerTaskListCounter: {metricName: "buffer_throttle_count_per_tl", metricRollupName: "buffer_throttle_count"}, + BufferUnknownTaskDispatchError: {metricName: "buffer_unknown_task_dispatch_error_per_tl", metricRollupName: "buffer_unknown_task_dispatch_error"}, + BufferIsolationGroupRedirectCounter: {metricName: "buffer_isolation_group_redirected_per_tl", metricRollupName: "buffer_isolation_group_redirected"}, + BufferIsolationGroupRedirectFailureCounter: {metricName: 
"buffer_isolation_group_redirect_failure_per_tl", metricRollupName: "buffer_isolation_group_redirect_failure"}, + BufferIsolationGroupMisconfiguredCounter: {metricName: "buffer_isolation_group_misconfigured_failure_per_tl", metricRollupName: "buffer_isolation_group_misconfigured_failure"}, + ExpiredTasksPerTaskListCounter: {metricName: "tasks_expired_per_tl", metricRollupName: "tasks_expired"}, + ForwardedPerTaskListCounter: {metricName: "forwarded_per_tl", metricRollupName: "forwarded"}, + ForwardTaskCallsPerTaskList: {metricName: "forward_task_calls_per_tl", metricRollupName: "forward_task_calls"}, + ForwardTaskErrorsPerTaskList: {metricName: "forward_task_errors_per_tl", metricRollupName: "forward_task_errors"}, + SyncMatchForwardTaskThrottleErrorPerTasklist: {metricName: "sync_forward_task_throttle_errors_per_tl", metricRollupName: "sync_forward_task_throttle_errors"}, + AsyncMatchForwardTaskThrottleErrorPerTasklist: {metricName: "async_forward_task_throttle_errors_per_tl", metricRollupName: "async_forward_task_throttle_errors"}, + ForwardQueryCallsPerTaskList: {metricName: "forward_query_calls_per_tl", metricRollupName: "forward_query_calls"}, + ForwardQueryErrorsPerTaskList: {metricName: "forward_query_errors_per_tl", metricRollupName: "forward_query_errors"}, + ForwardPollCallsPerTaskList: {metricName: "forward_poll_calls_per_tl", metricRollupName: "forward_poll_calls"}, + ForwardPollErrorsPerTaskList: {metricName: "forward_poll_errors_per_tl", metricRollupName: "forward_poll_errors"}, + SyncMatchLatencyPerTaskList: {metricName: "syncmatch_latency_per_tl", metricRollupName: "syncmatch_latency", metricType: Timer}, + SyncMatchLatencyPerTaskListHistogram: {metricName: "syncmatch_latency_per_tl_ns", metricRollupName: "syncmatch_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, + AsyncMatchLatencyPerTaskList: {metricName: "asyncmatch_latency_per_tl", metricRollupName: "asyncmatch_latency", metricType: Timer}, + AsyncMatchLatencyPerTaskListHistogram: {metricName: "asyncmatch_latency_per_tl_ns", metricRollupName: "asyncmatch_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, + AsyncMatchDispatchTimeoutCounterPerTaskList: {metricName: "asyncmatch_dispatch_timeouts_per_tl", metricRollupName: "asyncmatch_dispatch_timeouts"}, + ForwardTaskLatencyPerTaskList: {metricName: "forward_task_latency_per_tl", metricRollupName: "forward_task_latency"}, + ForwardQueryLatencyPerTaskList: {metricName: "forward_query_latency_per_tl", metricRollupName: "forward_query_latency"}, + ForwardPollLatencyPerTaskList: {metricName: "forward_poll_latency_per_tl", metricRollupName: "forward_poll_latency"}, + LocalToLocalMatchPerTaskListCounter: {metricName: "local_to_local_matches_per_tl", metricRollupName: "local_to_local_matches"}, + LocalToRemoteMatchPerTaskListCounter: {metricName: "local_to_remote_matches_per_tl", metricRollupName: "local_to_remote_matches"}, + RemoteToLocalMatchPerTaskListCounter: {metricName: "remote_to_local_matches_per_tl", metricRollupName: "remote_to_local_matches"}, + RemoteToRemoteMatchPerTaskListCounter: {metricName: "remote_to_remote_matches_per_tl", metricRollupName: "remote_to_remote_matches"}, + IsolationTaskMatchPerTaskListCounter: {metricName: "isolation_task_matches_per_tl", metricType: Counter}, + IsolationSuccessPerTaskListCounter: {metricName: "isolation_success_per_tl", metricRollupName: "isolation_success"}, + PollerPerTaskListCounter: {metricName: "poller_count_per_tl", metricRollupName: "poller_count"}, + 
PollerInvalidIsolationGroupCounter: {metricName: "poller_invalid_isolation_group_per_tl", metricType: Counter}, + TaskListPartitionUpdateFailedCounter: {metricName: "tasklist_partition_update_failed_per_tl", metricType: Counter}, + TaskListManagersGauge: {metricName: "tasklist_managers", metricType: Gauge}, + TaskLagPerTaskListGauge: {metricName: "task_lag_per_tl", metricType: Gauge}, + TaskBacklogPerTaskListGauge: {metricName: "task_backlog_per_tl", metricType: Gauge}, + TaskCountPerTaskListGauge: {metricName: "task_count_per_tl", metricType: Gauge}, + RateLimitPerTaskListGauge: {metricName: "rate_limit_per_tl", metricType: Gauge}, + SyncMatchLocalPollLatencyPerTaskList: {metricName: "syncmatch_local_poll_latency_per_tl", metricRollupName: "syncmatch_local_poll_latency"}, + SyncMatchForwardPollLatencyPerTaskList: {metricName: "syncmatch_forward_poll_latency_per_tl", metricRollupName: "syncmatch_forward_poll_latency"}, + AsyncMatchLocalPollCounterPerTaskList: {metricName: "asyncmatch_local_poll_per_tl", metricRollupName: "asyncmatch_local_poll"}, + AsyncMatchLocalPollAttemptPerTaskList: {metricName: "asyncmatch_local_poll_attempt_per_tl", metricRollupName: "asyncmatch_local_poll_attempt", metricType: Timer}, + AsyncMatchLocalPollAttemptPerTaskListHistogram: {metricName: "asyncmatch_local_poll_attempt_per_tl_counts", metricRollupName: "asyncmatch_local_poll_attempt_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k}, + AsyncMatchLocalPollLatencyPerTaskList: {metricName: "asyncmatch_local_poll_latency_per_tl", metricRollupName: "asyncmatch_local_poll_latency"}, + AsyncMatchForwardPollCounterPerTaskList: {metricName: "asyncmatch_forward_poll_per_tl", metricRollupName: "asyncmatch_forward_poll"}, + AsyncMatchForwardPollAttemptPerTaskList: {metricName: "asyncmatch_forward_poll_attempt_per_tl", metricRollupName: "asyncmatch_forward_poll_attempt", metricType: Timer}, + AsyncMatchForwardPollAttemptPerTaskListHistogram: {metricName: "asyncmatch_forward_poll_attempt_per_tl_counts", metricRollupName: "asyncmatch_forward_poll_attempt_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k}, + AsyncMatchForwardPollLatencyPerTaskList: {metricName: "asyncmatch_forward_poll_latency_per_tl", metricRollupName: "asyncmatch_forward_poll_latency"}, + AsyncMatchLocalPollAfterForwardFailedCounterPerTaskList: {metricName: "asyncmatch_local_poll_after_forward_failed_per_tl", metricRollupName: "asyncmatch_local_poll_after_forward_failed"}, + AsyncMatchLocalPollAfterForwardFailedAttemptPerTaskList: {metricName: "asyncmatch_local_poll_after_forward_failed_attempt_per_tl", metricRollupName: "asyncmatch_local_poll_after_forward_failed_attempt", metricType: Timer}, + AsyncMatchLocalPollAfterForwardFailedAttemptPerTaskListHistogram: {metricName: "asyncmatch_local_poll_after_forward_failed_attempt_per_tl_counts", metricRollupName: "asyncmatch_local_poll_after_forward_failed_attempt_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k}, + AsyncMatchLocalPollAfterForwardFailedLatencyPerTaskList: {metricName: "asyncmatch_local_poll_after_forward_failed_latency_per_tl", metricRollupName: "asyncmatch_local_poll_after_forward_failed_latency"}, + PollLocalMatchLatencyPerTaskList: {metricName: "poll_local_match_latency_per_tl", metricRollupName: "poll_local_match_latency", metricType: Timer}, + PollLocalMatchLatencyPerTaskListHistogram: {metricName: "poll_local_match_latency_per_tl_ns", metricRollupName: "poll_local_match_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, + 
PollForwardMatchLatencyPerTaskList: {metricName: "poll_forward_match_latency_per_tl", metricRollupName: "poll_forward_match_latency", metricType: Timer}, + PollForwardMatchLatencyPerTaskListHistogram: {metricName: "poll_forward_match_latency_per_tl_ns", metricRollupName: "poll_forward_match_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, + PollLocalMatchAfterForwardFailedLatencyPerTaskList: {metricName: "poll_local_match_after_forward_failed_latency_per_tl", metricRollupName: "poll_local_match_after_forward_failed_latency", metricType: Timer}, + PollLocalMatchAfterForwardFailedLatencyPerTaskListHistogram: {metricName: "poll_local_match_after_forward_failed_latency_per_tl_ns", metricRollupName: "poll_local_match_after_forward_failed_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, + PollDecisionTaskAlreadyStartedCounterPerTaskList: {metricName: "poll_decision_task_already_started_per_tl", metricType: Counter}, + PollActivityTaskAlreadyStartedCounterPerTaskList: {metricName: "poll_activity_task_already_started_per_tl", metricType: Counter}, + TaskListReadWritePartitionMismatchGauge: {metricName: "tasklist_read_write_partition_mismatch", metricType: Gauge}, + TaskListPollerPartitionMismatchGauge: {metricName: "tasklist_poller_partition_mismatch", metricType: Gauge}, + EstimatedAddTaskQPSGauge: {metricName: "estimated_add_task_qps_per_tl", metricType: Gauge}, + TaskListPartitionUpscaleThresholdGauge: {metricName: "tasklist_partition_upscale_threshold", metricType: Gauge}, + TaskListPartitionDownscaleThresholdGauge: {metricName: "tasklist_partition_downscale_threshold", metricType: Gauge}, + StandbyClusterTasksCompletedCounterPerTaskList: {metricName: "standby_cluster_tasks_completed_per_tl", metricType: Counter}, + StandbyClusterTasksNotStartedCounterPerTaskList: {metricName: "standby_cluster_tasks_not_started_per_tl", metricType: Counter}, + StandbyClusterTasksCompletionFailurePerTaskList: {metricName: "standby_cluster_tasks_completion_failure_per_tl", metricType: Counter}, + TaskIsolationLeakPerTaskList: {metricName: "task_isolation_leak_per_tl", metricRollupName: "task_isolation_leak"}, + PartitionUpscale: {metricName: "partition_upscale_per_tl", metricRollupName: "partition_upscale"}, + PartitionDownscale: {metricName: "partition_downscale_per_tl", metricRollupName: "partition_downscale"}, + PartitionDrained: {metricName: "partition_drained_per_tl", metricRollupName: "partition_drained"}, + IsolationRebalance: {metricName: "isolation_rebalance_per_tl", metricRollupName: "isolation_rebalance"}, + IsolationGroupStartedPolling: {metricName: "ig_started_polling_per_tl", metricRollupName: "ig_started_polling"}, + IsolationGroupStoppedPolling: {metricName: "ig_stopped_polling_per_tl", metricRollupName: "ig_stopped_polling"}, + IsolationGroupUpscale: {metricName: "ig_upscale_per_tl", metricRollupName: "ig_upscale"}, + IsolationGroupDownscale: {metricName: "ig_downscale_per_tl", metricRollupName: "ig_downscale"}, + IsolationGroupPartitionsGauge: {metricName: "ig_partitions_per_tl", metricType: Gauge}, }, Worker: { ReplicatorMessages: {metricName: "replicator_messages"}, @@ -4020,8 +4079,10 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ ESProcessorFailures: {metricName: "es_processor_errors"}, ESProcessorCorruptedData: {metricName: "es_processor_corrupted_data"}, ESProcessorProcessMsgLatency: {metricName: "es_processor_process_msg_latency", metricType: Timer}, + ESProcessorProcessMsgLatencyHistogram: {metricName: 
"es_processor_process_msg_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, IndexProcessorCorruptedData: {metricName: "index_processor_corrupted_data"}, IndexProcessorProcessMsgLatency: {metricName: "index_processor_process_msg_latency", metricType: Timer}, + IndexProcessorProcessMsgLatencyHistogram: {metricName: "index_processor_process_msg_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, ArchiverNonRetryableErrorCount: {metricName: "archiver_non_retryable_error"}, ArchiverStartedCount: {metricName: "archiver_started"}, ArchiverStoppedCount: {metricName: "archiver_stopped"}, @@ -4088,12 +4149,14 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ ESAnalyzerNumLongRunningWorkflows: {metricName: "es_analyzer_num_long_running_workflows", metricType: Counter}, AsyncWorkflowConsumerCount: {metricName: "async_workflow_consumer_count", metricType: Gauge}, AsyncWorkflowProcessMsgLatency: {metricName: "async_workflow_process_msg_latency", metricType: Timer}, + AsyncWorkflowProcessMsgLatencyHistogram: {metricName: "async_workflow_process_msg_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, AsyncWorkflowFailureCorruptMsgCount: {metricName: "async_workflow_failure_corrupt_msg", metricType: Counter}, AsyncWorkflowFailureByFrontendCount: {metricName: "async_workflow_failure_by_frontend", metricType: Counter}, AsyncWorkflowSuccessCount: {metricName: "async_workflow_success", metricType: Counter}, DiagnosticsWorkflowStartedCount: {metricName: "diagnostics_workflow_count", metricType: Counter}, DiagnosticsWorkflowSuccess: {metricName: "diagnostics_workflow_success", metricType: Counter}, DiagnosticsWorkflowExecutionLatency: {metricName: "diagnostics_workflow_execution_latency", metricType: Timer}, + DiagnosticsWorkflowExecutionLatencyHistogram: {metricName: "diagnostics_workflow_execution_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, SchedulerWorkerActiveGauge: {metricName: "scheduler_worker_active_gauge", metricType: Gauge}, SchedulerWorkerStartedCount: {metricName: "scheduler_worker_started_count", metricType: Counter}, SchedulerWorkerStoppedCount: {metricName: "scheduler_worker_stopped_count", metricType: Counter}, @@ -4115,6 +4178,7 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ ShardDistributorErrContextTimeoutCounter: {metricName: "shard_distributor_err_context_timeout", metricType: Counter}, ShardDistributorFailures: {metricName: "shard_distributor_failures", metricType: Counter}, ShardDistributorLatency: {metricName: "shard_distributor_latency", metricType: Timer}, + ShardDistributorLatencyHistogram: {metricName: "shard_distributor_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, ShardDistributorErrNamespaceNotFound: {metricName: "shard_distributor_err_namespace_not_found", metricType: Counter}, ShardDistributorErrShardNotFound: {metricName: "shard_distributor_err_shard_not_found", metricType: Counter}, ShardDistributorAssignLoopShardRebalanceLatency: {metricName: "shard_distrubutor_shard_assign_latency", metricType: Histogram}, diff --git a/common/metrics/scope.go b/common/metrics/scope.go index 43965b6910e..9f14d9fb8c4 100644 --- a/common/metrics/scope.go +++ b/common/metrics/scope.go @@ -123,10 +123,15 @@ func (m *metricsScope) RecordHistogramDuration(id MetricIdx, value time.Duration if m.migrationConfig.Histogram.EmitHistogram(def.metricName.String()) { m.scope.Histogram(def.metricName.String(), m.getBuckets(id)).RecordDuration(value) } - if 
diff --git a/common/metrics/scope.go b/common/metrics/scope.go
index 43965b6910e..9f14d9fb8c4 100644
--- a/common/metrics/scope.go
+++ b/common/metrics/scope.go
@@ -123,10 +123,15 @@ func (m *metricsScope) RecordHistogramDuration(id MetricIdx, value time.Duration
  if m.migrationConfig.Histogram.EmitHistogram(def.metricName.String()) {
  	m.scope.Histogram(def.metricName.String(), m.getBuckets(id)).RecordDuration(value)
  }
- if !def.metricRollupName.Empty() {
+ switch {
+ case !def.metricRollupName.Empty():
  	if m.migrationConfig.Histogram.EmitHistogram(def.metricRollupName.String()) {
  		m.rootScope.Histogram(def.metricRollupName.String(), m.getBuckets(id)).RecordDuration(value)
  	}
+ case m.isDomainTagged:
+ 	if m.migrationConfig.Histogram.EmitHistogram(def.metricName.String()) {
+ 		m.scope.Tagged(map[string]string{domain: allValue}).Histogram(def.metricName.String(), m.getBuckets(id)).RecordDuration(value)
+ 	}
  }
 }
diff --git a/common/metrics/scope_test.go b/common/metrics/scope_test.go
index c9927f6b941..c37eee29b2c 100644
--- a/common/metrics/scope_test.go
+++ b/common/metrics/scope_test.go
@@ -426,3 +426,38 @@ func TestGaugeRollupUsesRootScope(t *testing.T) {
  }
  assert.True(t, foundInRoot, "rollup gauge should be emitted on root scope")
 }
+
+func TestRecordHistogramDurationDomainTaggedDualEmit(t *testing.T) {
+	rootScope := tally.NewTestScope("", nil)
+	childScope := tally.NewTestScope("", map[string]string{domain: "test-domain"})
+
+	defs := map[MetricIdx]metricDefinition{
+		CadenceLatencyHistogram: {
+			metricName: "cadence_latency_ns",
+			metricType: Histogram,
+		},
+	}
+
+	scope := newMetricsScope(rootScope, childScope, defs, true, MigrationConfig{
+		Histogram: HistogramMigration{Default: "histogram"},
+	})
+
+	scope.RecordHistogramDuration(CadenceLatencyHistogram, 5*time.Millisecond)
+
+	// per-domain series emitted on child scope
+	domainFound := false
+	allFound := false
+	for _, h := range childScope.Snapshot().Histograms() {
+		if h.Name() == "cadence_latency_ns" {
+			tags := h.Tags()
+			if tags[domain] == "test-domain" {
+				domainFound = true
+			}
+			if tags[domain] == allValue {
+				allFound = true
+			}
+		}
+	}
+	assert.True(t, domainFound, "per-domain histogram series should be emitted")
+	assert.True(t, allFound, "aggregate domain=all histogram series should be emitted")
+}
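Reviewer note on the scope.go change above: the new switch keeps the existing rollup path for metrics that define a `metricRollupName`, and otherwise, on domain-tagged scopes, re-emits the same histogram with the domain tag forced to the `all` value, so dashboards get a pre-aggregated series without having to sum buckets across every domain. A minimal tally sketch of that dual emission, using literal `"domain"`/`"all"` strings in place of the package constants:

```go
package main

import (
	"fmt"
	"time"

	"github.com/uber-go/tally"
)

func main() {
	scope := tally.NewTestScope("", map[string]string{"domain": "test-domain"})
	buckets := tally.MustMakeExponentialDurationBuckets(time.Millisecond, 2, 18)

	// Per-domain series, tagged domain=test-domain.
	scope.Histogram("cadence_latency_ns", buckets).RecordDuration(5 * time.Millisecond)
	// Aggregate series: same name, domain tag overridden to "all".
	scope.Tagged(map[string]string{"domain": "all"}).
		Histogram("cadence_latency_ns", buckets).RecordDuration(5 * time.Millisecond)

	for _, h := range scope.Snapshot().Histograms() {
		fmt.Println(h.Name(), "domain="+h.Tags()["domain"])
	}
}
```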
diff --git a/common/persistence/pinot/pinot_visibility_metric_clients.go b/common/persistence/pinot/pinot_visibility_metric_clients.go
index 98011ea737c..a3dbfe827fd 100644
--- a/common/persistence/pinot/pinot_visibility_metric_clients.go
+++ b/common/persistence/pinot/pinot_visibility_metric_clients.go
@@ -22,6 +22,7 @@ package pinotvisibility
 import (
 	"context"
+	"time"
 	"github.com/uber/cadence/common/log"
 	"github.com/uber/cadence/common/log/tag"
@@ -59,8 +60,12 @@ func (p *pinotVisibilityMetricsClient) RecordWorkflowExecutionStarted(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotRecordWorkflowExecutionStartedScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	err := p.persistence.RecordWorkflowExecutionStarted(ctx, request)
 	if err != nil {
@@ -78,8 +83,12 @@ func (p *pinotVisibilityMetricsClient) RecordWorkflowExecutionClosed(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotRecordWorkflowExecutionClosedScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	err := p.persistence.RecordWorkflowExecutionClosed(ctx, request)
 	if err != nil {
@@ -97,8 +106,12 @@ func (p *pinotVisibilityMetricsClient) RecordWorkflowExecutionUninitialized(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotRecordWorkflowExecutionUninitializedScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	err := p.persistence.RecordWorkflowExecutionUninitialized(ctx, request)
 	if err != nil {
@@ -116,8 +129,12 @@ func (p *pinotVisibilityMetricsClient) UpsertWorkflowExecution(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotUpsertWorkflowExecutionScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	err := p.persistence.UpsertWorkflowExecution(ctx, request)
 	if err != nil {
@@ -135,8 +152,12 @@ func (p *pinotVisibilityMetricsClient) ListOpenWorkflowExecutions(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotListOpenWorkflowExecutionsScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	response, err := p.persistence.ListOpenWorkflowExecutions(ctx, request)
 	if err != nil {
@@ -154,8 +175,12 @@ func (p *pinotVisibilityMetricsClient) ListClosedWorkflowExecutions(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotListClosedWorkflowExecutionsScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	response, err := p.persistence.ListClosedWorkflowExecutions(ctx, request)
 	if err != nil {
@@ -173,8 +198,12 @@ func (p *pinotVisibilityMetricsClient) ListOpenWorkflowExecutionsByType(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotListOpenWorkflowExecutionsByTypeScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	response, err := p.persistence.ListOpenWorkflowExecutionsByType(ctx, request)
 	if err != nil {
@@ -191,8 +220,12 @@ func (p *pinotVisibilityMetricsClient) ListClosedWorkflowExecutionsByType(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotListClosedWorkflowExecutionsByTypeScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	response, err := p.persistence.ListClosedWorkflowExecutionsByType(ctx, request)
 	if err != nil {
@@ -209,8 +242,12 @@ func (p *pinotVisibilityMetricsClient) ListOpenWorkflowExecutionsByWorkflowID(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotListOpenWorkflowExecutionsByWorkflowIDScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	response, err := p.persistence.ListOpenWorkflowExecutionsByWorkflowID(ctx, request)
 	if err != nil {
@@ -227,8 +264,12 @@ func (p *pinotVisibilityMetricsClient) ListClosedWorkflowExecutionsByWorkflowID(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotListClosedWorkflowExecutionsByWorkflowIDScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	response, err := p.persistence.ListClosedWorkflowExecutionsByWorkflowID(ctx, request)
 	if err != nil {
@@ -245,8 +286,12 @@ func (p *pinotVisibilityMetricsClient) ListClosedWorkflowExecutionsByStatus(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotListClosedWorkflowExecutionsByStatusScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	response, err := p.persistence.ListClosedWorkflowExecutionsByStatus(ctx, request)
 	if err != nil {
@@ -263,8 +308,12 @@ func (p *pinotVisibilityMetricsClient) GetClosedWorkflowExecution(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotGetClosedWorkflowExecutionScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	response, err := p.persistence.GetClosedWorkflowExecution(ctx, request)
 	if err != nil {
@@ -281,8 +330,12 @@ func (p *pinotVisibilityMetricsClient) ListWorkflowExecutions(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotListWorkflowExecutionsScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	response, err := p.persistence.ListWorkflowExecutions(ctx, request)
 	if err != nil {
@@ -299,8 +352,12 @@ func (p *pinotVisibilityMetricsClient) ScanWorkflowExecutions(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotScanWorkflowExecutionsScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	response, err := p.persistence.ScanWorkflowExecutions(ctx, request)
 	if err != nil {
@@ -317,8 +374,12 @@ func (p *pinotVisibilityMetricsClient) CountWorkflowExecutions(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotCountWorkflowExecutionsScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	response, err := p.persistence.CountWorkflowExecutions(ctx, request)
 	if err != nil {
@@ -335,8 +396,12 @@ func (p *pinotVisibilityMetricsClient) DeleteWorkflowExecution(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotDeleteWorkflowExecutionsScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	err := p.persistence.DeleteWorkflowExecution(ctx, request)
 	if err != nil {
@@ -353,8 +418,12 @@ func (p *pinotVisibilityMetricsClient) DeleteUninitializedWorkflowExecution(
 	scopeWithDomainTag := p.metricClient.Scope(metrics.PinotDeleteWorkflowExecutionsScope, metrics.DomainTag(request.Domain))
 	scopeWithDomainTag.IncCounter(metrics.PinotRequestsPerDomain)
+	pinotStart := time.Now()
 	sw := scopeWithDomainTag.StartTimer(metrics.PinotLatencyPerDomain)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scopeWithDomainTag.RecordHistogramDuration(metrics.PinotLatencyPerDomainHistogram, time.Since(pinotStart))
+	}()
 	err := p.persistence.DeleteWorkflowExecution(ctx, request)
 	if err != nil {
diff --git a/common/persistence/pinot/pinot_visibility_metric_clients_test.go b/common/persistence/pinot/pinot_visibility_metric_clients_test.go
index b79fa67a417..50e601372ea 100644
--- a/common/persistence/pinot/pinot_visibility_metric_clients_test.go
+++ b/common/persistence/pinot/pinot_visibility_metric_clients_test.go
@@ -128,6 +128,7 @@ func TestMetricClientRecordWorkflowExecutionStarted(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.producerMockAffordance(mockProducer)
 	test.scopeMockAffordance(mockScope)
@@ -214,6 +215,7 @@ func TestMetricClientRecordWorkflowExecutionClosed(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.producerMockAffordance(mockProducer)
 	test.scopeMockAffordance(mockScope)
@@ -290,6 +292,7 @@ func TestMetricClientRecordWorkflowExecutionUninitialized(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.producerMockAffordance(mockProducer)
 	test.scopeMockAffordance(mockScope)
@@ -366,6 +369,7 @@ func TestMetricClientUpsertWorkflowExecution(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.producerMockAffordance(mockProducer)
 	test.scopeMockAffordance(mockScope)
@@ -440,6 +444,7 @@ func TestMetricClientListOpenWorkflowExecutions(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.pinotClientMockAffordance(mockPinotClient)
 	test.scopeMockAffordance(mockScope)
@@ -514,6 +519,7 @@ func TestMetricClientListClosedWorkflowExecutions(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.pinotClientMockAffordance(mockPinotClient)
 	test.scopeMockAffordance(mockScope)
@@ -589,6 +595,7 @@ func TestMetricClientListOpenWorkflowExecutionsByType(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.pinotClientMockAffordance(mockPinotClient)
 	test.scopeMockAffordance(mockScope)
@@ -664,6 +671,7 @@ func TestMetricClientListClosedWorkflowExecutionsByType(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.pinotClientMockAffordance(mockPinotClient)
 	test.scopeMockAffordance(mockScope)
@@ -738,6 +746,7 @@ func TestMetricClientListOpenWorkflowExecutionsByWorkflowID(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.pinotClientMockAffordance(mockPinotClient)
 	test.scopeMockAffordance(mockScope)
@@ -812,6 +821,7 @@ func TestMetricClientListClosedWorkflowExecutionsByWorkflowID(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.pinotClientMockAffordance(mockPinotClient)
 	test.scopeMockAffordance(mockScope)
@@ -890,6 +900,7 @@ func TestMetricClientListClosedWorkflowExecutionsByStatus(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.pinotClientMockAffordance(mockPinotClient)
 	test.scopeMockAffordance(mockScope)
@@ -966,6 +977,7 @@ func TestMetricClientGetClosedWorkflowExecution(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.pinotClientMockAffordance(mockPinotClient)
 	test.scopeMockAffordance(mockScope)
@@ -1038,6 +1050,7 @@ func TestMetricClientListWorkflowExecutions(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.pinotClientMockAffordance(mockPinotClient)
 	test.scopeMockAffordance(mockScope)
@@ -1106,6 +1119,7 @@ func TestMetricClientScanWorkflowExecutions(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.pinotClientMockAffordance(mockPinotClient)
 	test.scopeMockAffordance(mockScope)
@@ -1174,6 +1188,7 @@ func TestMetricClientCountWorkflowExecutions(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.pinotClientMockAffordance(mockPinotClient)
 	test.scopeMockAffordance(mockScope)
@@ -1250,6 +1265,7 @@ func TestMetricClientDeleteWorkflowExecution(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.producerMockAffordance(mockProducer)
 	test.scopeMockAffordance(mockScope)
@@ -1322,6 +1338,7 @@ func TestMetricClientDeleteUninitializedWorkflowExecution(t *testing.T) {
 	// mock behaviors
 	mockMetricClient.On("Scope", mock.Anything, mock.Anything).Return(mockScope).Once()
 	mockScope.On("StartTimer", mock.Anything, mock.Anything).Return(testStopwatch).Once()
+	mockScope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()
 	test.producerMockAffordance(mockProducer)
 	test.scopeMockAffordance(mockScope)
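Reviewer note on the test updates above: every metered-client test registers exactly one expectation for the new histogram call, matching any metric id and any `time.Duration`. A self-contained testify sketch of that expectation style; `scopeMock` and `noopT` here are stand-ins for the generated mocks, not the project's types:

```go
package main

import (
	"fmt"
	"time"

	"github.com/stretchr/testify/mock"
)

// scopeMock models only the method under discussion.
type scopeMock struct{ mock.Mock }

func (m *scopeMock) RecordHistogramDuration(id int, d time.Duration) {
	m.Called(id, d)
}

// noopT satisfies mock.TestingT so AssertExpectations can run outside a test.
type noopT struct{}

func (noopT) Logf(string, ...interface{})   {}
func (noopT) Errorf(string, ...interface{}) {}
func (noopT) FailNow()                      {}

func main() {
	s := &scopeMock{}
	// Same shape as the expectations added in this PR: any id, any duration,
	// exactly once.
	s.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Once()

	s.RecordHistogramDuration(42, 5*time.Millisecond)
	fmt.Println("expectations met:", s.AssertExpectations(noopT{}))
}
```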
diff --git a/common/quotas/global/collection/collection.go b/common/quotas/global/collection/collection.go
index d09a13bbc9f..764c6283236 100644
--- a/common/quotas/global/collection/collection.go
+++ b/common/quotas/global/collection/collection.go
@@ -407,9 +407,11 @@ func (c *Collection) backgroundUpdateLoop() {
 	c.scope.RecordHistogramValue(metrics.GlobalRatelimiterGlobalUsageHistogram, float64(globals))
 	if len(usage) > 0 {
+		ratelimiterUpdateStart := time.Now()
 		sw := c.scope.StartTimer(metrics.GlobalRatelimiterUpdateLatency)
 		c.doUpdate(now.Sub(lastGatherTime), usage)
 		sw.Stop()
+		c.scope.RecordHistogramDuration(metrics.GlobalRatelimiterUpdateLatencyHistogram, time.Since(ratelimiterUpdateStart))
 	}
 	<-localMetricsDone // should be much faster than doUpdate, unless it's no-opped
diff --git a/common/task/hierarchical_weighted_round_robin_task_scheduler.go b/common/task/hierarchical_weighted_round_robin_task_scheduler.go
index 3aba59f3f23..54e4316972e 100644
--- a/common/task/hierarchical_weighted_round_robin_task_scheduler.go
+++ b/common/task/hierarchical_weighted_round_robin_task_scheduler.go
@@ -116,8 +116,12 @@ func (w *hierarchicalWeightedRoundRobinTaskSchedulerImpl[K, T]) Stop() {
 func (w *hierarchicalWeightedRoundRobinTaskSchedulerImpl[K, T]) Submit(task T) error {
 	w.metricsScope.IncCounter(metrics.PriorityTaskSubmitRequest)
+	submitStart := time.Now()
 	sw := w.metricsScope.StartTimer(metrics.PriorityTaskSubmitLatency)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		w.metricsScope.RecordHistogramDuration(metrics.PriorityTaskSubmitLatencyHistogram, time.Since(submitStart))
+	}()
 	w.RLock()
 	defer w.RUnlock()
@@ -137,8 +141,12 @@ func (w *hierarchicalWeightedRoundRobinTaskSchedulerImpl[K, T]) TrySubmit(
 	task T,
 ) (bool, error) {
 	w.metricsScope.IncCounter(metrics.PriorityTaskSubmitRequest)
+	submitStart := time.Now()
 	sw := w.metricsScope.StartTimer(metrics.PriorityTaskSubmitLatency)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		w.metricsScope.RecordHistogramDuration(metrics.PriorityTaskSubmitLatencyHistogram, time.Since(submitStart))
+	}()
 	w.RLock()
 	defer w.RUnlock()
diff --git a/common/task/sequential_task_processor.go b/common/task/sequential_task_processor.go
index ff9a2fa5126..f3cb389161a 100644
--- a/common/task/sequential_task_processor.go
+++ b/common/task/sequential_task_processor.go
@@ -96,8 +96,12 @@ func (t *sequentialTaskProcessorImpl) Stop() {
 func (t *sequentialTaskProcessorImpl) Submit(task Task) error {
 	t.metricsScope.IncCounter(metrics.SequentialTaskSubmitRequest)
+	submitStart := time.Now()
 	metricsTimer := t.metricsScope.StartTimer(metrics.SequentialTaskSubmitLatency)
-	defer metricsTimer.Stop()
+	defer func() {
+		metricsTimer.Stop()
+		t.metricsScope.RecordHistogramDuration(metrics.SequentialTaskSubmitLatencyHistogram, time.Since(submitStart))
+	}()
 	taskqueue := t.taskQueueFactory(task)
 	taskqueue.Add(task)
@@ -139,9 +143,11 @@ func (t *sequentialTaskProcessorImpl) pollAndProcessTaskQueue() {
 		case <-t.shutdownChan:
 			return
 		case taskqueue := <-t.taskqueueChan:
+			queueProcessingStart := time.Now()
 			metricsTimer := t.metricsScope.StartTimer(metrics.SequentialTaskQueueProcessingLatency)
 			t.processTaskQueue(taskqueue)
 			metricsTimer.Stop()
+			t.metricsScope.RecordHistogramDuration(metrics.SequentialTaskQueueProcessingLatencyHistogram, time.Since(queueProcessingStart))
 		}
 	}
 }
@@ -154,6 +160,7 @@ func (t *sequentialTaskProcessorImpl) processTaskQueue(taskqueue SequentialTaskQ
 		default:
 			queueSize := taskqueue.Len()
 			t.metricsScope.RecordTimer(metrics.SequentialTaskQueueSize, time.Duration(queueSize))
+			t.metricsScope.IntExponentialHistogram(metrics.SequentialTaskQueueSizeHistogram, queueSize)
 			if queueSize > 0 {
 				t.processTaskOnce(taskqueue)
 			}
@@ -173,8 +180,12 @@ func (t *sequentialTaskProcessorImpl) processTaskQueue(taskqueue SequentialTaskQ
 }
 func (t *sequentialTaskProcessorImpl) processTaskOnce(taskqueue SequentialTaskQueue) {
+	taskProcessingStart := time.Now()
 	metricsTimer := t.metricsScope.StartTimer(metrics.SequentialTaskTaskProcessingLatency)
-	defer metricsTimer.Stop()
+	defer func() {
+		metricsTimer.Stop()
+		t.metricsScope.RecordHistogramDuration(metrics.SequentialTaskTaskProcessingLatencyHistogram, time.Since(taskProcessingStart))
+	}()
 	task := taskqueue.Remove()
 	err := task.Execute()
diff --git a/common/task/weighted_round_robin_task_scheduler.go b/common/task/weighted_round_robin_task_scheduler.go
index b3a753f8cd4..7ea2f8dd7a4 100644
--- a/common/task/weighted_round_robin_task_scheduler.go
+++ b/common/task/weighted_round_robin_task_scheduler.go
@@ -125,8 +125,12 @@ func (w *weightedRoundRobinTaskSchedulerImpl[K, T]) Stop() {
 func (w *weightedRoundRobinTaskSchedulerImpl[K, T]) Submit(task T) error {
 	w.metricsScope.IncCounter(metrics.PriorityTaskSubmitRequest)
+	submitStart := time.Now()
 	sw := w.metricsScope.StartTimer(metrics.PriorityTaskSubmitLatency)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		w.metricsScope.RecordHistogramDuration(metrics.PriorityTaskSubmitLatencyHistogram, time.Since(submitStart))
+	}()
 	if w.isStopped() {
 		return ErrTaskSchedulerClosed
diff --git a/service/frontend/api/handler.go b/service/frontend/api/handler.go
index 581badf7669..e618dc15f79 100644
--- a/service/frontend/api/handler.go
+++ b/service/frontend/api/handler.go
@@ -1622,6 +1622,7 @@ func (wh *WorkflowHandler) StartWorkflowExecutionAsync(
 		return nil, fmt.Errorf("failed to encode StartWorkflowExecutionAsyncRequest: %v", err)
 	}
 	scope.RecordTimer(metrics.AsyncRequestPayloadSize, time.Duration(len(payload)))
+	scope.IntExponentialHistogram(metrics.AsyncRequestPayloadSizeHistogram, len(payload))
 	// propagate the headers from the context to the message
 	header := &shared.Header{
@@ -2247,6 +2248,7 @@ func (wh *WorkflowHandler) SignalWithStartWorkflowExecutionAsync(
 		return nil, fmt.Errorf("failed to encode SignalWithStartWorkflowExecutionAsyncRequest: %v", err)
 	}
 	scope.RecordTimer(metrics.AsyncRequestPayloadSize, time.Duration(len(payload)))
+	scope.IntExponentialHistogram(metrics.AsyncRequestPayloadSizeHistogram, len(payload))
 	// propagate the headers from the context to the message
 	header := &shared.Header{
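Reviewer note: queue sizes and async payload sizes are unitless integers, so the pre-existing `RecordTimer(..., time.Duration(n))` calls press nanosecond units into service for a plain count; the added `IntExponentialHistogram` calls record the raw integer into exponential buckets instead. A tally sketch of recording an integer size that way; the bucket layout is an assumption standing in for `Mid1To16k`:

```go
package main

import (
	"fmt"

	"github.com/uber-go/tally"
)

func main() {
	scope := tally.NewTestScope("", nil)
	// Exponential value buckets 1..16384, a plausible stand-in for the
	// Mid1To16k layout used by the size histograms in this PR.
	buckets := tally.MustMakeExponentialValueBuckets(1, 2, 15)

	// Record a payload size of 1234 bytes as a value, not a duration.
	scope.Histogram("async_request_payload_size", buckets).RecordValue(1234)

	for _, h := range scope.Snapshot().Histograms() {
		fmt.Println(h.Name(), h.Values())
	}
}
```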
diff --git a/service/frontend/templates/metered.tmpl b/service/frontend/templates/metered.tmpl
index da3c23f64e4..91d411f3836 100644
--- a/service/frontend/templates/metered.tmpl
+++ b/service/frontend/templates/metered.tmpl
@@ -1,5 +1,6 @@
 import (
 	"context"
+	"time"
 	"github.com/uber/cadence/common/log"
 	"github.com/uber/cadence/common/log/tag"
@@ -85,17 +86,20 @@ func (h *{{$decorator}}) {{$method.Declaration}} {
 	{{- if has $method.Name $taskListAPIs}}
 	scope := common.NewPerTaskListScope({{(index $method.Params 1).Name}}.Domain, {{(index $method.Params 1).Name}}.TaskList.GetName(), {{(index $method.Params 1).Name}}.TaskList.GetKind(), h.metricsClient, {{$scope}}).Tagged(metrics.GetContextTags(ctx)...)
 	scope.IncCounter(metrics.CadenceRequestsPerTaskListWithoutRollup)
+	swPerTLStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatencyPerTaskList)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swPerTLStart)) }()
 	scopePerDomain := h.metricsClient.Scope({{$scope}}).Tagged(append(metrics.GetContextTags(ctx), {{$domainMetricTag}})...)
 	scopePerDomain.IncCounter(metrics.CadenceRequests)
+	swPerDomainStart := time.Now()
 	swPerDomain := scopePerDomain.StartTimer(metrics.CadenceLatency)
-	defer swPerDomain.Stop()
+	defer func() { swPerDomain.Stop(); scopePerDomain.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swPerDomainStart)) }()
 	{{- else}}
 	scope := h.metricsClient.Scope({{$scope}}).Tagged(append(metrics.GetContextTags(ctx), {{$domainMetricTag}})...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	{{- end}}
 	logger := h.logger.WithTags(tags...)
diff --git a/service/frontend/wrappers/accesscontrolled/access_controlled.go b/service/frontend/wrappers/accesscontrolled/access_controlled.go
index 56d68d7734d..44af8761a6d 100644
--- a/service/frontend/wrappers/accesscontrolled/access_controlled.go
+++ b/service/frontend/wrappers/accesscontrolled/access_controlled.go
@@ -24,6 +24,7 @@ package accesscontrolled
 import (
 	"context"
+	"time"
 	"github.com/uber/cadence/common/authorization"
 	"github.com/uber/cadence/common/metrics"
@@ -46,8 +47,12 @@ func (a *apiHandler) isAuthorized(
 	attr *authorization.Attributes,
 	scope metrics.Scope,
 ) (bool, error) {
+	authStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceAuthorizationLatency)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scope.RecordHistogramDuration(metrics.CadenceAuthorizationLatencyHistogram, time.Since(authStart))
+	}()
 	result, err := a.authorizer.Authorize(ctx, attr)
 	if err != nil {
diff --git a/service/frontend/wrappers/accesscontrolled/access_controlled_test.go b/service/frontend/wrappers/accesscontrolled/access_controlled_test.go
index b25f32a7450..2b97f19fa0d 100644
--- a/service/frontend/wrappers/accesscontrolled/access_controlled_test.go
+++ b/service/frontend/wrappers/accesscontrolled/access_controlled_test.go
@@ -28,6 +28,7 @@ import (
 	"testing"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
 	"go.uber.org/mock/gomock"
 	"github.com/uber/cadence/common/authorization"
@@ -49,6 +50,7 @@ func TestIsAuthorized(t *testing.T) {
 			mockSetup: func(authorizer *authorization.MockAuthorizer, scope *mocks.Scope) {
 				authorizer.EXPECT().Authorize(gomock.Any(), gomock.Any()).Return(authorization.Result{Decision: authorization.DecisionAllow}, nil)
 				scope.On("StartTimer", metrics.CadenceAuthorizationLatency).Return(metrics.NewTestStopwatch()).Once()
+				scope.On("RecordHistogramDuration", metrics.CadenceAuthorizationLatencyHistogram, mock.AnythingOfType("time.Duration")).Return().Once()
 			},
 			isAuthorized: true,
 			wantErr: false,
@@ -58,6 +60,7 @@ func TestIsAuthorized(t *testing.T) {
 			mockSetup: func(authorizer *authorization.MockAuthorizer, scope *mocks.Scope) {
 				authorizer.EXPECT().Authorize(gomock.Any(), gomock.Any()).Return(authorization.Result{Decision: authorization.DecisionDeny}, nil)
 				scope.On("StartTimer", metrics.CadenceAuthorizationLatency).Return(metrics.NewTestStopwatch()).Once()
+				scope.On("RecordHistogramDuration", metrics.CadenceAuthorizationLatencyHistogram, mock.AnythingOfType("time.Duration")).Return().Once()
 				scope.On("IncCounter", metrics.CadenceErrUnauthorizedCounter).Return().Once()
 			},
 			isAuthorized: false,
@@ -68,6 +71,7 @@ func TestIsAuthorized(t *testing.T) {
 			mockSetup: func(authorizer *authorization.MockAuthorizer, scope *mocks.Scope) {
 				authorizer.EXPECT().Authorize(gomock.Any(), gomock.Any()).Return(authorization.Result{}, errors.New("some random error"))
 				scope.On("StartTimer", metrics.CadenceAuthorizationLatency).Return(metrics.NewTestStopwatch()).Once()
+				scope.On("RecordHistogramDuration", metrics.CadenceAuthorizationLatencyHistogram, mock.AnythingOfType("time.Duration")).Return().Once()
 				scope.On("IncCounter", metrics.CadenceErrAuthorizeFailedCounter).Return().Once()
 			},
 			isAuthorized: false,
diff --git a/service/frontend/wrappers/clusterredirection/callwrappers.go b/service/frontend/wrappers/clusterredirection/callwrappers.go
index d5dda1f8edf..a31a38e78b5 100644
--- a/service/frontend/wrappers/clusterredirection/callwrappers.go
+++ b/service/frontend/wrappers/clusterredirection/callwrappers.go
@@ -60,7 +60,9 @@ func (handler *clusterRedirectionHandler) afterCall(
 	scope = scope.Tagged(metrics.TargetClusterTag(cluster))
 	scope.IncCounter(metrics.CadenceDcRedirectionClientRequests)
-	scope.RecordTimer(metrics.CadenceDcRedirectionClientLatency, handler.GetTimeSource().Now().Sub(startTime))
+	elapsed := handler.GetTimeSource().Now().Sub(startTime)
+	scope.RecordTimer(metrics.CadenceDcRedirectionClientLatency, elapsed)
+	scope.RecordHistogramDuration(metrics.CadenceDcRedirectionClientLatencyHistogram, elapsed)
 	if *retError != nil {
 		scope.IncCounter(metrics.CadenceDcRedirectionClientFailures)
 	}
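Reviewer note on the `afterCall` change above: the elapsed duration is computed once and fed to both the Timer and the Histogram, so the two series cannot drift apart the way two independent clock reads could. A runnable sketch of that measure-once pattern with a stand-in recorder in place of `metrics.Scope`:

```go
package main

import (
	"fmt"
	"time"
)

// recorder abstracts the two emit paths; a stand-in for metrics.Scope.
type recorder interface {
	RecordTimer(name string, d time.Duration)
	RecordHistogramDuration(name string, d time.Duration)
}

type printRecorder struct{}

func (printRecorder) RecordTimer(name string, d time.Duration) {
	fmt.Println("timer:", name, d)
}

func (printRecorder) RecordHistogramDuration(name string, d time.Duration) {
	fmt.Println("histogram:", name, d)
}

// emitLatency mirrors afterCall: one measurement, two emissions.
func emitLatency(r recorder, start time.Time) {
	elapsed := time.Since(start)
	r.RecordTimer("dc_redirection_latency", elapsed)
	r.RecordHistogramDuration("dc_redirection_latency_ns", elapsed)
}

func main() {
	start := time.Now()
	time.Sleep(10 * time.Millisecond)
	emitLatency(printRecorder{}, start)
}
```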
diff --git a/service/frontend/wrappers/metered/api_generated.go b/service/frontend/wrappers/metered/api_generated.go
index 1ee81bd4ae9..f559d739e72 100644
--- a/service/frontend/wrappers/metered/api_generated.go
+++ b/service/frontend/wrappers/metered/api_generated.go
@@ -6,6 +6,7 @@ package metered
 import (
 	"context"
+	"time"
 	"github.com/uber/cadence/common"
 	"github.com/uber/cadence/common/cache"
@@ -45,8 +46,9 @@ func (h *apiHandler) BackfillSchedule(ctx context.Context, bp1 *types.BackfillSc
 	tags = append(tags, toBackfillScheduleRequestTags(bp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendBackfillScheduleScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(bp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	bp2, err = h.handler.BackfillSchedule(ctx, bp1)
@@ -61,8 +63,9 @@ func (h *apiHandler) CountWorkflowExecutions(ctx context.Context, cp1 *types.Cou
 	tags = append(tags, toCountWorkflowExecutionsRequestTags(cp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendCountWorkflowExecutionsScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(cp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	cp2, err = h.handler.CountWorkflowExecutions(ctx, cp1)
@@ -77,8 +80,9 @@ func (h *apiHandler) CreateSchedule(ctx context.Context, cp1 *types.CreateSchedu
 	tags = append(tags, toCreateScheduleRequestTags(cp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendCreateScheduleScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(cp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	cp2, err = h.handler.CreateSchedule(ctx, cp1)
@@ -92,8 +96,9 @@ func (h *apiHandler) DeleteDomain(ctx context.Context, dp1 *types.DeleteDomainRe
 	tags := []tag.Tag{tag.WorkflowHandlerName("DeleteDomain")}
 	scope := h.metricsClient.Scope(metrics.FrontendDeleteDomainScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainUnknownTag())...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	err = h.handler.DeleteDomain(ctx, dp1)
@@ -108,8 +113,9 @@ func (h *apiHandler) DeleteSchedule(ctx context.Context, dp1 *types.DeleteSchedu
 	tags = append(tags, toDeleteScheduleRequestTags(dp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendDeleteScheduleScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(dp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	dp2, err = h.handler.DeleteSchedule(ctx, dp1)
@@ -123,8 +129,9 @@ func (h *apiHandler) DeprecateDomain(ctx context.Context, dp1 *types.DeprecateDo
 	tags := []tag.Tag{tag.WorkflowHandlerName("DeprecateDomain")}
 	scope := h.metricsClient.Scope(metrics.FrontendDeprecateDomainScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainUnknownTag())...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	err = h.handler.DeprecateDomain(ctx, dp1)
@@ -138,8 +145,9 @@ func (h *apiHandler) DescribeDomain(ctx context.Context, dp1 *types.DescribeDoma
 	tags := []tag.Tag{tag.WorkflowHandlerName("DescribeDomain")}
 	scope := h.metricsClient.Scope(metrics.FrontendDescribeDomainScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainUnknownTag())...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	dp2, err = h.handler.DescribeDomain(ctx, dp1)
@@ -154,8 +162,9 @@ func (h *apiHandler) DescribeSchedule(ctx context.Context, dp1 *types.DescribeSc
 	tags = append(tags, toDescribeScheduleRequestTags(dp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendDescribeScheduleScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(dp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	dp2, err = h.handler.DescribeSchedule(ctx, dp1)
@@ -170,12 +179,20 @@ func (h *apiHandler) DescribeTaskList(ctx context.Context, dp1 *types.DescribeTa
 	tags = append(tags, toDescribeTaskListRequestTags(dp1)...)
 	scope := common.NewPerTaskListScope(dp1.Domain, dp1.TaskList.GetName(), dp1.TaskList.GetKind(), h.metricsClient, metrics.FrontendDescribeTaskListScope).Tagged(metrics.GetContextTags(ctx)...)
 	scope.IncCounter(metrics.CadenceRequestsPerTaskListWithoutRollup)
+	swPerTLStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatencyPerTaskList)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swPerTLStart))
+	}()
 	scopePerDomain := h.metricsClient.Scope(metrics.FrontendDescribeTaskListScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(dp1.GetDomain()))...)
 	scopePerDomain.IncCounter(metrics.CadenceRequests)
+	swPerDomainStart := time.Now()
 	swPerDomain := scopePerDomain.StartTimer(metrics.CadenceLatency)
-	defer swPerDomain.Stop()
+	defer func() {
+		swPerDomain.Stop()
+		scopePerDomain.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swPerDomainStart))
+	}()
 	logger := h.logger.WithTags(tags...)
 	dp2, err = h.handler.DescribeTaskList(ctx, dp1)
@@ -190,8 +207,9 @@ func (h *apiHandler) DescribeWorkflowExecution(ctx context.Context, dp1 *types.D
 	tags = append(tags, toDescribeWorkflowExecutionRequestTags(dp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendDescribeWorkflowExecutionScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(dp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	dp2, err = h.handler.DescribeWorkflowExecution(ctx, dp1)
@@ -206,8 +224,9 @@ func (h *apiHandler) DiagnoseWorkflowExecution(ctx context.Context, dp1 *types.D
 	tags = append(tags, toDiagnoseWorkflowExecutionRequestTags(dp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendDiagnoseWorkflowExecutionScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(dp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	dp2, err = h.handler.DiagnoseWorkflowExecution(ctx, dp1)
@@ -222,8 +241,9 @@ func (h *apiHandler) FailoverDomain(ctx context.Context, fp1 *types.FailoverDoma
 	tags = append(tags, toFailoverDomainRequestTags(fp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendFailoverDomainScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(fp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	fp2, err = h.handler.FailoverDomain(ctx, fp1)
@@ -237,8 +257,9 @@ func (h *apiHandler) GetClusterInfo(ctx context.Context) (cp1 *types.ClusterInfo
 	tags := []tag.Tag{tag.WorkflowHandlerName("GetClusterInfo")}
 	scope := h.metricsClient.Scope(metrics.FrontendGetClusterInfoScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainUnknownTag())...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	cp1, err = h.handler.GetClusterInfo(ctx)
@@ -252,8 +273,9 @@ func (h *apiHandler) GetSearchAttributes(ctx context.Context) (gp1 *types.GetSea
 	tags := []tag.Tag{tag.WorkflowHandlerName("GetSearchAttributes")}
 	scope := h.metricsClient.Scope(metrics.FrontendGetSearchAttributesScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainUnknownTag())...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	gp1, err = h.handler.GetSearchAttributes(ctx)
@@ -268,8 +290,9 @@ func (h *apiHandler) GetTaskListsByDomain(ctx context.Context, gp1 *types.GetTas
 	tags = append(tags, toGetTaskListsByDomainRequestTags(gp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendGetTaskListsByDomainScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(gp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	gp2, err = h.handler.GetTaskListsByDomain(ctx, gp1)
@@ -284,8 +307,9 @@ func (h *apiHandler) GetWorkflowExecutionHistory(ctx context.Context, gp1 *types
 	tags = append(tags, toGetWorkflowExecutionHistoryRequestTags(gp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendGetWorkflowExecutionHistoryScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(gp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	gp2, err = h.handler.GetWorkflowExecutionHistory(ctx, gp1)
@@ -303,8 +327,9 @@ func (h *apiHandler) ListArchivedWorkflowExecutions(ctx context.Context, lp1 *ty
 	tags = append(tags, toListArchivedWorkflowExecutionsRequestTags(lp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendListArchivedWorkflowExecutionsScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(lp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	lp2, err = h.handler.ListArchivedWorkflowExecutions(ctx, lp1)
@@ -319,8 +344,9 @@ func (h *apiHandler) ListClosedWorkflowExecutions(ctx context.Context, lp1 *type
 	tags = append(tags, toListClosedWorkflowExecutionsRequestTags(lp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendListClosedWorkflowExecutionsScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(lp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	lp2, err = h.handler.ListClosedWorkflowExecutions(ctx, lp1)
@@ -334,8 +360,9 @@ func (h *apiHandler) ListDomains(ctx context.Context, lp1 *types.ListDomainsRequ
 	tags := []tag.Tag{tag.WorkflowHandlerName("ListDomains")}
 	scope := h.metricsClient.Scope(metrics.FrontendListDomainsScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainUnknownTag())...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	lp2, err = h.handler.ListDomains(ctx, lp1)
@@ -349,8 +376,9 @@ func (h *apiHandler) ListFailoverHistory(ctx context.Context, lp1 *types.ListFai
 	tags := []tag.Tag{tag.WorkflowHandlerName("ListFailoverHistory")}
 	scope := h.metricsClient.Scope(metrics.FrontendListFailoverHistoryScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainUnknownTag())...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	lp2, err = h.handler.ListFailoverHistory(ctx, lp1)
@@ -365,8 +393,9 @@ func (h *apiHandler) ListOpenWorkflowExecutions(ctx context.Context, lp1 *types.
 	tags = append(tags, toListOpenWorkflowExecutionsRequestTags(lp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendListOpenWorkflowExecutionsScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(lp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	lp2, err = h.handler.ListOpenWorkflowExecutions(ctx, lp1)
@@ -381,8 +410,9 @@ func (h *apiHandler) ListSchedules(ctx context.Context, lp1 *types.ListSchedules
 	tags = append(tags, toListSchedulesRequestTags(lp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendListSchedulesScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(lp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	lp2, err = h.handler.ListSchedules(ctx, lp1)
@@ -397,12 +427,20 @@ func (h *apiHandler) ListTaskListPartitions(ctx context.Context, lp1 *types.List
 	tags = append(tags, toListTaskListPartitionsRequestTags(lp1)...)
 	scope := common.NewPerTaskListScope(lp1.Domain, lp1.TaskList.GetName(), lp1.TaskList.GetKind(), h.metricsClient, metrics.FrontendListTaskListPartitionsScope).Tagged(metrics.GetContextTags(ctx)...)
 	scope.IncCounter(metrics.CadenceRequestsPerTaskListWithoutRollup)
+	swPerTLStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatencyPerTaskList)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swPerTLStart))
+	}()
 	scopePerDomain := h.metricsClient.Scope(metrics.FrontendListTaskListPartitionsScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(lp1.GetDomain()))...)
 	scopePerDomain.IncCounter(metrics.CadenceRequests)
+	swPerDomainStart := time.Now()
 	swPerDomain := scopePerDomain.StartTimer(metrics.CadenceLatency)
-	defer swPerDomain.Stop()
+	defer func() {
+		swPerDomain.Stop()
+		scopePerDomain.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swPerDomainStart))
+	}()
 	logger := h.logger.WithTags(tags...)
 	lp2, err = h.handler.ListTaskListPartitions(ctx, lp1)
@@ -417,8 +455,9 @@ func (h *apiHandler) ListWorkflowExecutions(ctx context.Context, lp1 *types.List
 	tags = append(tags, toListWorkflowExecutionsRequestTags(lp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendListWorkflowExecutionsScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(lp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	lp2, err = h.handler.ListWorkflowExecutions(ctx, lp1)
@@ -433,8 +472,9 @@ func (h *apiHandler) PauseSchedule(ctx context.Context, pp1 *types.PauseSchedule
 	tags = append(tags, toPauseScheduleRequestTags(pp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendPauseScheduleScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(pp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	pp2, err = h.handler.PauseSchedule(ctx, pp1)
@@ -449,12 +489,20 @@ func (h *apiHandler) PollForActivityTask(ctx context.Context, pp1 *types.PollFor
 	tags = append(tags, toPollForActivityTaskRequestTags(pp1)...)
 	scope := common.NewPerTaskListScope(pp1.Domain, pp1.TaskList.GetName(), pp1.TaskList.GetKind(), h.metricsClient, metrics.FrontendPollForActivityTaskScope).Tagged(metrics.GetContextTags(ctx)...)
 	scope.IncCounter(metrics.CadenceRequestsPerTaskListWithoutRollup)
+	swPerTLStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatencyPerTaskList)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swPerTLStart))
+	}()
 	scopePerDomain := h.metricsClient.Scope(metrics.FrontendPollForActivityTaskScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(pp1.GetDomain()))...)
 	scopePerDomain.IncCounter(metrics.CadenceRequests)
+	swPerDomainStart := time.Now()
 	swPerDomain := scopePerDomain.StartTimer(metrics.CadenceLatency)
-	defer swPerDomain.Stop()
+	defer func() {
+		swPerDomain.Stop()
+		scopePerDomain.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swPerDomainStart))
+	}()
 	logger := h.logger.WithTags(tags...)
 	pp2, err = h.handler.PollForActivityTask(ctx, pp1)
@@ -469,12 +517,20 @@ func (h *apiHandler) PollForDecisionTask(ctx context.Context, pp1 *types.PollFor
 	tags = append(tags, toPollForDecisionTaskRequestTags(pp1)...)
 	scope := common.NewPerTaskListScope(pp1.Domain, pp1.TaskList.GetName(), pp1.TaskList.GetKind(), h.metricsClient, metrics.FrontendPollForDecisionTaskScope).Tagged(metrics.GetContextTags(ctx)...)
 	scope.IncCounter(metrics.CadenceRequestsPerTaskListWithoutRollup)
+	swPerTLStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatencyPerTaskList)
-	defer sw.Stop()
+	defer func() {
+		sw.Stop()
+		scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swPerTLStart))
+	}()
 	scopePerDomain := h.metricsClient.Scope(metrics.FrontendPollForDecisionTaskScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(pp1.GetDomain()))...)
 	scopePerDomain.IncCounter(metrics.CadenceRequests)
+	swPerDomainStart := time.Now()
 	swPerDomain := scopePerDomain.StartTimer(metrics.CadenceLatency)
-	defer swPerDomain.Stop()
+	defer func() {
+		swPerDomain.Stop()
+		scopePerDomain.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swPerDomainStart))
+	}()
 	logger := h.logger.WithTags(tags...)
 	pp2, err = h.handler.PollForDecisionTask(ctx, pp1)
@@ -489,8 +545,9 @@ func (h *apiHandler) QueryWorkflow(ctx context.Context, qp1 *types.QueryWorkflow
 	tags = append(tags, toQueryWorkflowRequestTags(qp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendQueryWorkflowScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(qp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	qp2, err = h.handler.QueryWorkflow(ctx, qp1)
@@ -513,8 +570,9 @@ func (h *apiHandler) RecordActivityTaskHeartbeat(ctx context.Context, rp1 *types
 	tags = append(tags, tag.WorkflowDomainName(domainName), tag.WorkflowID(token.WorkflowID), tag.WorkflowRunID(token.RunID))
 	scope := h.metricsClient.Scope(metrics.FrontendRecordActivityTaskHeartbeatScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(domainName))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	rp2, err = h.handler.RecordActivityTaskHeartbeat(ctx, rp1)
@@ -529,8 +587,9 @@ func (h *apiHandler) RecordActivityTaskHeartbeatByID(ctx context.Context, rp1 *t
 	tags = append(tags, toRecordActivityTaskHeartbeatByIDRequestTags(rp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendRecordActivityTaskHeartbeatByIDScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(rp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	rp2, err = h.handler.RecordActivityTaskHeartbeatByID(ctx, rp1)
@@ -545,8 +604,9 @@ func (h *apiHandler) RefreshWorkflowTasks(ctx context.Context, rp1 *types.Refres
 	tags = append(tags, toRefreshWorkflowTasksRequestTags(rp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendRefreshWorkflowTasksScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(rp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	err = h.handler.RefreshWorkflowTasks(ctx, rp1)
@@ -560,8 +620,9 @@ func (h *apiHandler) RegisterDomain(ctx context.Context, rp1 *types.RegisterDoma
 	tags := []tag.Tag{tag.WorkflowHandlerName("RegisterDomain")}
 	scope := h.metricsClient.Scope(metrics.FrontendRegisterDomainScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainUnknownTag())...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	err = h.handler.RegisterDomain(ctx, rp1)
@@ -576,8 +637,9 @@ func (h *apiHandler) RequestCancelWorkflowExecution(ctx context.Context, rp1 *ty
 	tags = append(tags, toRequestCancelWorkflowExecutionRequestTags(rp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendRequestCancelWorkflowExecutionScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(rp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	err = h.handler.RequestCancelWorkflowExecution(ctx, rp1)
@@ -592,8 +654,9 @@ func (h *apiHandler) ResetStickyTaskList(ctx context.Context, rp1 *types.ResetSt
 	tags = append(tags, toResetStickyTaskListRequestTags(rp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendResetStickyTaskListScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(rp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	rp2, err = h.handler.ResetStickyTaskList(ctx, rp1)
@@ -608,8 +671,9 @@ func (h *apiHandler) ResetWorkflowExecution(ctx context.Context, rp1 *types.Rese
 	tags = append(tags, toResetWorkflowExecutionRequestTags(rp1)...)
 	scope := h.metricsClient.Scope(metrics.FrontendResetWorkflowExecutionScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(rp1.GetDomain()))...)
 	scope.IncCounter(metrics.CadenceRequests)
+	swStart := time.Now()
 	sw := scope.StartTimer(metrics.CadenceLatency)
-	defer sw.Stop()
+	defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
 	logger := h.logger.WithTags(tags...)
 	rp2, err = h.handler.ResetWorkflowExecution(ctx, rp1)
@@ -632,8 +696,9 @@ func (h *apiHandler) RespondActivityTaskCanceled(ctx context.Context, rp1 *types
 	tags = append(tags, tag.WorkflowDomainName(domainName), tag.WorkflowID(token.WorkflowID), tag.WorkflowRunID(token.RunID))
 	scope := h.metricsClient.Scope(metrics.FrontendRespondActivityTaskCanceledScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(domainName))...)
scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) err = h.handler.RespondActivityTaskCanceled(ctx, rp1) @@ -648,8 +713,9 @@ func (h *apiHandler) RespondActivityTaskCanceledByID(ctx context.Context, rp1 *t tags = append(tags, toRespondActivityTaskCanceledByIDRequestTags(rp1)...) scope := h.metricsClient.Scope(metrics.FrontendRespondActivityTaskCanceledByIDScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(rp1.GetDomain()))...) scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) err = h.handler.RespondActivityTaskCanceledByID(ctx, rp1) @@ -672,8 +738,9 @@ func (h *apiHandler) RespondActivityTaskCompleted(ctx context.Context, rp1 *type tags = append(tags, tag.WorkflowDomainName(domainName), tag.WorkflowID(token.WorkflowID), tag.WorkflowRunID(token.RunID)) scope := h.metricsClient.Scope(metrics.FrontendRespondActivityTaskCompletedScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(domainName))...) scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) err = h.handler.RespondActivityTaskCompleted(ctx, rp1) @@ -688,8 +755,9 @@ func (h *apiHandler) RespondActivityTaskCompletedByID(ctx context.Context, rp1 * tags = append(tags, toRespondActivityTaskCompletedByIDRequestTags(rp1)...) scope := h.metricsClient.Scope(metrics.FrontendRespondActivityTaskCompletedByIDScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(rp1.GetDomain()))...) scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) err = h.handler.RespondActivityTaskCompletedByID(ctx, rp1) @@ -712,8 +780,9 @@ func (h *apiHandler) RespondActivityTaskFailed(ctx context.Context, rp1 *types.R tags = append(tags, tag.WorkflowDomainName(domainName), tag.WorkflowID(token.WorkflowID), tag.WorkflowRunID(token.RunID)) scope := h.metricsClient.Scope(metrics.FrontendRespondActivityTaskFailedScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(domainName))...) scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) err = h.handler.RespondActivityTaskFailed(ctx, rp1) @@ -728,8 +797,9 @@ func (h *apiHandler) RespondActivityTaskFailedByID(ctx context.Context, rp1 *typ tags = append(tags, toRespondActivityTaskFailedByIDRequestTags(rp1)...) scope := h.metricsClient.Scope(metrics.FrontendRespondActivityTaskFailedByIDScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(rp1.GetDomain()))...) 
scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) err = h.handler.RespondActivityTaskFailedByID(ctx, rp1) @@ -752,8 +822,9 @@ func (h *apiHandler) RespondDecisionTaskCompleted(ctx context.Context, rp1 *type tags = append(tags, tag.WorkflowDomainName(domainName), tag.WorkflowID(token.WorkflowID), tag.WorkflowRunID(token.RunID)) scope := h.metricsClient.Scope(metrics.FrontendRespondDecisionTaskCompletedScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(domainName))...) scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) rp2, err = h.handler.RespondDecisionTaskCompleted(ctx, rp1) @@ -776,8 +847,9 @@ func (h *apiHandler) RespondDecisionTaskFailed(ctx context.Context, rp1 *types.R tags = append(tags, tag.WorkflowDomainName(domainName), tag.WorkflowID(token.WorkflowID), tag.WorkflowRunID(token.RunID)) scope := h.metricsClient.Scope(metrics.FrontendRespondDecisionTaskFailedScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(domainName))...) scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) err = h.handler.RespondDecisionTaskFailed(ctx, rp1) @@ -800,8 +872,9 @@ func (h *apiHandler) RespondQueryTaskCompleted(ctx context.Context, rp1 *types.R tags = append(tags, tag.WorkflowDomainName(domainName)) scope := h.metricsClient.Scope(metrics.FrontendRespondQueryTaskCompletedScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(domainName))...) scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) err = h.handler.RespondQueryTaskCompleted(ctx, rp1) @@ -816,8 +889,9 @@ func (h *apiHandler) RestartWorkflowExecution(ctx context.Context, rp1 *types.Re tags = append(tags, toRestartWorkflowExecutionRequestTags(rp1)...) scope := h.metricsClient.Scope(metrics.FrontendRestartWorkflowExecutionScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(rp1.GetDomain()))...) scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) rp2, err = h.handler.RestartWorkflowExecution(ctx, rp1) @@ -832,8 +906,9 @@ func (h *apiHandler) ScanWorkflowExecutions(ctx context.Context, lp1 *types.List tags = append(tags, toScanWorkflowExecutionsRequestTags(lp1)...) scope := h.metricsClient.Scope(metrics.FrontendScanWorkflowExecutionsScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(lp1.GetDomain()))...) 
scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) lp2, err = h.handler.ScanWorkflowExecutions(ctx, lp1) @@ -848,12 +923,20 @@ func (h *apiHandler) SignalWithStartWorkflowExecution(ctx context.Context, sp1 * tags = append(tags, toSignalWithStartWorkflowExecutionRequestTags(sp1)...) scope := common.NewPerTaskListScope(sp1.Domain, sp1.TaskList.GetName(), sp1.TaskList.GetKind(), h.metricsClient, metrics.FrontendSignalWithStartWorkflowExecutionScope).Tagged(metrics.GetContextTags(ctx)...) scope.IncCounter(metrics.CadenceRequestsPerTaskListWithoutRollup) + swPerTLStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatencyPerTaskList) - defer sw.Stop() + defer func() { + sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swPerTLStart)) + }() scopePerDomain := h.metricsClient.Scope(metrics.FrontendSignalWithStartWorkflowExecutionScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(sp1.GetDomain()))...) scopePerDomain.IncCounter(metrics.CadenceRequests) + swPerDomainStart := time.Now() swPerDomain := scopePerDomain.StartTimer(metrics.CadenceLatency) - defer swPerDomain.Stop() + defer func() { + swPerDomain.Stop() + scopePerDomain.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swPerDomainStart)) + }() logger := h.logger.WithTags(tags...) sp2, err = h.handler.SignalWithStartWorkflowExecution(ctx, sp1) @@ -868,12 +951,20 @@ func (h *apiHandler) SignalWithStartWorkflowExecutionAsync(ctx context.Context, tags = append(tags, toSignalWithStartWorkflowExecutionAsyncRequestTags(sp1)...) scope := common.NewPerTaskListScope(sp1.Domain, sp1.TaskList.GetName(), sp1.TaskList.GetKind(), h.metricsClient, metrics.FrontendSignalWithStartWorkflowExecutionAsyncScope).Tagged(metrics.GetContextTags(ctx)...) scope.IncCounter(metrics.CadenceRequestsPerTaskListWithoutRollup) + swPerTLStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatencyPerTaskList) - defer sw.Stop() + defer func() { + sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swPerTLStart)) + }() scopePerDomain := h.metricsClient.Scope(metrics.FrontendSignalWithStartWorkflowExecutionAsyncScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(sp1.GetDomain()))...) scopePerDomain.IncCounter(metrics.CadenceRequests) + swPerDomainStart := time.Now() swPerDomain := scopePerDomain.StartTimer(metrics.CadenceLatency) - defer swPerDomain.Stop() + defer func() { + swPerDomain.Stop() + scopePerDomain.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swPerDomainStart)) + }() logger := h.logger.WithTags(tags...) sp2, err = h.handler.SignalWithStartWorkflowExecutionAsync(ctx, sp1) @@ -889,8 +980,9 @@ func (h *apiHandler) SignalWorkflowExecution(ctx context.Context, sp1 *types.Sig tags = append(tags, toSignalWorkflowExecutionRequestTags(sp1)...) scope := h.metricsClient.Scope(metrics.FrontendSignalWorkflowExecutionScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(sp1.GetDomain()))...) 
scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) err = h.handler.SignalWorkflowExecution(ctx, sp1) @@ -905,12 +997,20 @@ func (h *apiHandler) StartWorkflowExecution(ctx context.Context, sp1 *types.Star tags = append(tags, toStartWorkflowExecutionRequestTags(sp1)...) scope := common.NewPerTaskListScope(sp1.Domain, sp1.TaskList.GetName(), sp1.TaskList.GetKind(), h.metricsClient, metrics.FrontendStartWorkflowExecutionScope).Tagged(metrics.GetContextTags(ctx)...) scope.IncCounter(metrics.CadenceRequestsPerTaskListWithoutRollup) + swPerTLStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatencyPerTaskList) - defer sw.Stop() + defer func() { + sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swPerTLStart)) + }() scopePerDomain := h.metricsClient.Scope(metrics.FrontendStartWorkflowExecutionScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(sp1.GetDomain()))...) scopePerDomain.IncCounter(metrics.CadenceRequests) + swPerDomainStart := time.Now() swPerDomain := scopePerDomain.StartTimer(metrics.CadenceLatency) - defer swPerDomain.Stop() + defer func() { + swPerDomain.Stop() + scopePerDomain.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swPerDomainStart)) + }() logger := h.logger.WithTags(tags...) sp2, err = h.handler.StartWorkflowExecution(ctx, sp1) @@ -925,12 +1025,20 @@ func (h *apiHandler) StartWorkflowExecutionAsync(ctx context.Context, sp1 *types tags = append(tags, toStartWorkflowExecutionAsyncRequestTags(sp1)...) scope := common.NewPerTaskListScope(sp1.Domain, sp1.TaskList.GetName(), sp1.TaskList.GetKind(), h.metricsClient, metrics.FrontendStartWorkflowExecutionAsyncScope).Tagged(metrics.GetContextTags(ctx)...) scope.IncCounter(metrics.CadenceRequestsPerTaskListWithoutRollup) + swPerTLStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatencyPerTaskList) - defer sw.Stop() + defer func() { + sw.Stop() + scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swPerTLStart)) + }() scopePerDomain := h.metricsClient.Scope(metrics.FrontendStartWorkflowExecutionAsyncScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(sp1.GetDomain()))...) scopePerDomain.IncCounter(metrics.CadenceRequests) + swPerDomainStart := time.Now() swPerDomain := scopePerDomain.StartTimer(metrics.CadenceLatency) - defer swPerDomain.Stop() + defer func() { + swPerDomain.Stop() + scopePerDomain.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swPerDomainStart)) + }() logger := h.logger.WithTags(tags...) sp2, err = h.handler.StartWorkflowExecutionAsync(ctx, sp1) @@ -945,8 +1053,9 @@ func (h *apiHandler) TerminateWorkflowExecution(ctx context.Context, tp1 *types. tags = append(tags, toTerminateWorkflowExecutionRequestTags(tp1)...) scope := h.metricsClient.Scope(metrics.FrontendTerminateWorkflowExecutionScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(tp1.GetDomain()))...) scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) 
err = h.handler.TerminateWorkflowExecution(ctx, tp1) @@ -961,8 +1070,9 @@ func (h *apiHandler) UnpauseSchedule(ctx context.Context, up1 *types.UnpauseSche tags = append(tags, toUnpauseScheduleRequestTags(up1)...) scope := h.metricsClient.Scope(metrics.FrontendUnpauseScheduleScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(up1.GetDomain()))...) scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) up2, err = h.handler.UnpauseSchedule(ctx, up1) @@ -976,8 +1086,9 @@ func (h *apiHandler) UpdateDomain(ctx context.Context, up1 *types.UpdateDomainRe tags := []tag.Tag{tag.WorkflowHandlerName("UpdateDomain")} scope := h.metricsClient.Scope(metrics.FrontendUpdateDomainScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainUnknownTag())...) scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) up2, err = h.handler.UpdateDomain(ctx, up1) @@ -992,8 +1103,9 @@ func (h *apiHandler) UpdateSchedule(ctx context.Context, up1 *types.UpdateSchedu tags = append(tags, toUpdateScheduleRequestTags(up1)...) scope := h.metricsClient.Scope(metrics.FrontendUpdateScheduleScope).Tagged(append(metrics.GetContextTags(ctx), metrics.DomainTag(up1.GetDomain()))...) scope.IncCounter(metrics.CadenceRequests) + swStart := time.Now() sw := scope.StartTimer(metrics.CadenceLatency) - defer sw.Stop() + defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }() logger := h.logger.WithTags(tags...) 
up2, err = h.handler.UpdateSchedule(ctx, up1) diff --git a/service/history/engine/engineimpl/get_replication_messages.go b/service/history/engine/engineimpl/get_replication_messages.go index fda5362982f..63d9ee48c19 100644 --- a/service/history/engine/engineimpl/get_replication_messages.go +++ b/service/history/engine/engineimpl/get_replication_messages.go @@ -25,6 +25,7 @@ import ( "context" "encoding/json" "fmt" + "time" "github.com/uber/cadence/common" "github.com/uber/cadence/common/log/tag" @@ -40,8 +41,12 @@ func (e *historyEngineImpl) GetReplicationMessages( ) (*types.ReplicationMessages, error) { scope := metrics.HistoryGetReplicationMessagesScope + replMsgStart := time.Now() sw := e.metricsClient.StartTimer(scope, metrics.GetReplicationMessagesForShardLatency) - defer sw.Stop() + defer func() { + sw.Stop() + e.metricsClient.Scope(scope).RecordHistogramDuration(metrics.GetReplicationMessagesForShardLatencyHistogram, time.Since(replMsgStart)) + }() replicationMessages, err := e.replicationAckManager.GetTasks( ctx, @@ -78,8 +83,12 @@ func (e *historyEngineImpl) GetDLQReplicationMessages( ) ([]*types.ReplicationTask, error) { scope := metrics.HistoryGetDLQReplicationMessagesScope + dlqStart := time.Now() sw := e.metricsClient.StartTimer(scope, metrics.GetDLQReplicationMessagesLatency) - defer sw.Stop() + defer func() { + sw.Stop() + e.metricsClient.Scope(scope).RecordHistogramDuration(metrics.GetDLQReplicationMessagesLatencyHistogram, time.Since(dlqStart)) + }() tasks := make([]*types.ReplicationTask, 0, len(taskInfos)) for _, taskInfo := range taskInfos { diff --git a/service/history/engine/engineimpl/query_workflow.go b/service/history/engine/engineimpl/query_workflow.go index f03c4932e32..24e2894c5db 100644 --- a/service/history/engine/engineimpl/query_workflow.go +++ b/service/history/engine/engineimpl/query_workflow.go @@ -158,8 +158,12 @@ func (e *historyEngineImpl) QueryWorkflow( // If we get here it means query could not be dispatched through matching directly, so it must block // until either an result has been obtained on a decision task response or until it is safe to dispatch directly through matching. + decisionQueryStart := time.Now() sw := scope.StartTimer(metrics.DecisionTaskQueryLatency) - defer sw.Stop() + defer func() { + sw.Stop() + scope.RecordHistogramDuration(metrics.DecisionTaskQueryLatencyHistogram, time.Since(decisionQueryStart)) + }() queryReg := mutableState.GetQueryRegistry() if len(queryReg.GetBufferedIDs()) >= e.config.MaxBufferedQueryCount() { scope.IncCounter(metrics.QueryBufferExceededCount) diff --git a/service/history/execution/context.go b/service/history/execution/context.go index 667f988081d..d5ec67d7b40 100644 --- a/service/history/execution/context.go +++ b/service/history/execution/context.go @@ -295,6 +295,7 @@ func (c *contextImpl) Unlock() { } elapsed := time.Since(c.lockTime) c.metricsClient.RecordTimer(metrics.WorkflowContextScope, metrics.WorkflowContextLockLatency, elapsed) + c.metricsClient.Scope(metrics.WorkflowContextScope).RecordHistogramDuration(metrics.WorkflowContextLockLatencyHistogram, elapsed) if elapsed > c.maxLockDuration { c.maxLockDuration = elapsed c.logger.Info("workflow context lock is released. 
this is logged only when it's longer than maxLockDuration", tag.WorkflowContextLockLatency(elapsed)) diff --git a/service/history/failover/coordinator.go b/service/history/failover/coordinator.go index 2a15015927b..589146c72b2 100644 --- a/service/history/failover/coordinator.go +++ b/service/history/failover/coordinator.go @@ -327,6 +327,12 @@ func (c *coordinatorImpl) handleFailoverMarkers( metrics.GracefulFailoverLatency, now.Sub(time.Unix(0, marker.GetCreationTime())), ) + c.scope.Tagged( + metrics.DomainTag(domainName), + ).RecordHistogramDuration( + metrics.GracefulFailoverLatencyHistogram, + now.Sub(time.Unix(0, marker.GetCreationTime())), + ) c.logger.Info("Updated domain from pending-active to active", tag.WorkflowDomainName(domainName), tag.FailoverVersion(marker.FailoverVersion), diff --git a/service/history/replication/task_executor.go b/service/history/replication/task_executor.go index a668901b20b..b355dd21e22 100644 --- a/service/history/replication/task_executor.go +++ b/service/history/replication/task_executor.go @@ -22,6 +22,7 @@ package replication import ( "context" + "time" "github.com/uber/cadence/common" "github.com/uber/cadence/common/cache" @@ -130,8 +131,12 @@ func (e *taskExecutorImpl) handleActivityTask( return err } + replicationLatencyStart := time.Now() replicationStopWatch := e.metricsClient.StartTimer(metrics.SyncActivityTaskScope, metrics.CadenceLatency) - defer replicationStopWatch.Stop() + defer func() { + replicationStopWatch.Stop() + e.metricsClient.Scope(metrics.SyncActivityTaskScope).RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(replicationLatencyStart)) + }() request := &types.SyncActivityRequest{ DomainID: attr.DomainID, WorkflowID: attr.WorkflowID, @@ -171,8 +176,12 @@ func (e *taskExecutorImpl) handleActivityTask( } // Handle resend error e.metricsClient.IncCounter(metrics.HistoryRereplicationByActivityReplicationScope, metrics.CadenceClientRequests) + activityResendLatencyStart := time.Now() stopwatch := e.metricsClient.StartTimer(metrics.HistoryRereplicationByActivityReplicationScope, metrics.CadenceClientLatency) - defer stopwatch.Stop() + defer func() { + stopwatch.Stop() + e.metricsClient.Scope(metrics.HistoryRereplicationByActivityReplicationScope).RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(activityResendLatencyStart)) + }() resendErr := e.historyResender.SendSingleWorkflowHistory( e.sourceCluster, @@ -234,8 +243,12 @@ func (e *taskExecutorImpl) handleHistoryReplicationTaskV2( return err } + replicationV2LatencyStart := time.Now() replicationStopWatch := e.metricsClient.StartTimer(metrics.HistoryReplicationV2TaskScope, metrics.CadenceLatency) - defer replicationStopWatch.Stop() + defer func() { + replicationStopWatch.Stop() + e.metricsClient.Scope(metrics.HistoryReplicationV2TaskScope).RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(replicationV2LatencyStart)) + }() request := &types.ReplicateEventsV2Request{ DomainUUID: attr.DomainID, WorkflowExecution: &types.WorkflowExecution{ @@ -268,8 +281,12 @@ func (e *taskExecutorImpl) handleHistoryReplicationTaskV2( return err } e.metricsClient.IncCounter(metrics.HistoryRereplicationByHistoryReplicationScope, metrics.CadenceClientRequests) + historyResendLatencyStart := time.Now() resendStopWatch := e.metricsClient.StartTimer(metrics.HistoryRereplicationByHistoryReplicationScope, metrics.CadenceClientLatency) - defer resendStopWatch.Stop() + defer func() { + resendStopWatch.Stop() + 
e.metricsClient.Scope(metrics.HistoryRereplicationByHistoryReplicationScope).RecordHistogramDuration(metrics.CadenceClientLatencyHistogram, time.Since(historyResendLatencyStart)) + }() resendErr := e.historyResender.SendSingleWorkflowHistory( e.sourceCluster, diff --git a/service/history/task/redispatcher.go b/service/history/task/redispatcher.go index 344d9d39922..0cfc0e6c2ff 100644 --- a/service/history/task/redispatcher.go +++ b/service/history/task/redispatcher.go @@ -213,6 +213,7 @@ func (r *redispatcherImpl) redispatchTasks(notification redispatchNotification) queueSize := r.sizeLocked() r.metricsScope.RecordTimer(metrics.TaskRedispatchQueuePendingTasksTimer, time.Duration(queueSize)) + r.metricsScope.IntExponentialHistogram(metrics.TaskRedispatchQueuePendingTasksHistogram, queueSize) // add some buffer here as new tasks may be added targetRedispatched := queueSize + defaultBufferSize - notification.targetSize diff --git a/service/matching/handler/context.go b/service/matching/handler/context.go index 7fad39797b3..615c26026db 100644 --- a/service/matching/handler/context.go +++ b/service/matching/handler/context.go @@ -24,6 +24,7 @@ import ( "context" "errors" "sync" + "time" "github.com/uber/cadence/common" cadence_errors "github.com/uber/cadence/common/errors" @@ -55,11 +56,12 @@ func newHandlerContext( } // startProfiling initiates recording of request metrics -func (reqCtx *handlerContext) startProfiling(wg *sync.WaitGroup) metrics.Stopwatch { +func (reqCtx *handlerContext) startProfiling(wg *sync.WaitGroup) (metrics.Stopwatch, time.Time) { wg.Wait() + start := time.Now() sw := reqCtx.scope.StartTimer(metrics.CadenceLatencyPerTaskList) reqCtx.scope.IncCounter(metrics.CadenceRequestsPerTaskList) - return sw + return sw, start } func (reqCtx *handlerContext) handleErr(err error) error { diff --git a/service/matching/handler/engine.go b/service/matching/handler/engine.go index 17806eb5863..6ad3b582075 100644 --- a/service/matching/handler/engine.go +++ b/service/matching/handler/engine.go @@ -452,6 +452,7 @@ func (e *matchingEngineImpl) AddDecisionTask( } if syncMatched { hCtx.scope.RecordTimer(metrics.SyncMatchLatencyPerTaskList, time.Since(startT)) + hCtx.scope.RecordHistogramDuration(metrics.SyncMatchLatencyPerTaskListHistogram, time.Since(startT)) } return &types.AddDecisionTaskResponse{ PartitionConfig: tlMgr.TaskListPartitionConfig(), @@ -528,6 +529,7 @@ func (e *matchingEngineImpl) AddActivityTask( } if syncMatched { hCtx.scope.RecordTimer(metrics.SyncMatchLatencyPerTaskList, time.Since(startT)) + hCtx.scope.RecordHistogramDuration(metrics.SyncMatchLatencyPerTaskListHistogram, time.Since(startT)) } return &types.AddActivityTaskResponse{ PartitionConfig: tlMgr.TaskListPartitionConfig(), @@ -1296,6 +1298,7 @@ func (e *matchingEngineImpl) createPollForDecisionTaskResponse( token, _ = e.tokenSerializer.Serialize(taskToken) if task.ResponseC == nil { scope.RecordTimer(metrics.AsyncMatchLatencyPerTaskList, time.Since(task.Event.CreatedTime)) + scope.RecordHistogramDuration(metrics.AsyncMatchLatencyPerTaskListHistogram, time.Since(task.Event.CreatedTime)) } } @@ -1329,6 +1332,7 @@ func (e *matchingEngineImpl) createPollForActivityTaskResponse( } if task.ResponseC == nil { scope.RecordTimer(metrics.AsyncMatchLatencyPerTaskList, time.Since(task.Event.CreatedTime)) + scope.RecordHistogramDuration(metrics.AsyncMatchLatencyPerTaskListHistogram, time.Since(task.Event.CreatedTime)) } response := &types.MatchingPollForActivityTaskResponse{} diff --git a/service/matching/handler/handler.go 
b/service/matching/handler/handler.go index ce174392462..a6969806275 100644 --- a/service/matching/handler/handler.go +++ b/service/matching/handler/handler.go @@ -23,6 +23,7 @@ package handler import ( "context" "sync" + "time" "github.com/uber/cadence/common" "github.com/uber/cadence/common/cache" @@ -139,8 +140,11 @@ func (h *handlerImpl) AddActivityTask( metrics.MatchingAddActivityTaskScope, ) - sw := hCtx.startProfiling(&h.startWG) - defer sw.Stop() + sw, swStart := hCtx.startProfiling(&h.startWG) + defer func() { + sw.Stop() + hCtx.scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swStart)) + }() if request.GetForwardedFrom() != "" { hCtx.scope.IncCounter(metrics.ForwardedPerTaskListCounter) @@ -169,8 +173,11 @@ func (h *handlerImpl) AddDecisionTask( metrics.MatchingAddDecisionTaskScope, ) - sw := hCtx.startProfiling(&h.startWG) - defer sw.Stop() + sw, swStart := hCtx.startProfiling(&h.startWG) + defer func() { + sw.Stop() + hCtx.scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swStart)) + }() if request.GetForwardedFrom() != "" { hCtx.scope.IncCounter(metrics.ForwardedPerTaskListCounter) @@ -199,8 +206,11 @@ func (h *handlerImpl) PollForActivityTask( metrics.MatchingPollForActivityTaskScope, ) - sw := hCtx.startProfiling(&h.startWG) - defer sw.Stop() + sw, swStart := hCtx.startProfiling(&h.startWG) + defer func() { + sw.Stop() + hCtx.scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swStart)) + }() if request.GetForwardedFrom() != "" { hCtx.scope.IncCounter(metrics.ForwardedPerTaskListCounter) @@ -236,8 +246,11 @@ func (h *handlerImpl) PollForDecisionTask( metrics.MatchingPollForDecisionTaskScope, ) - sw := hCtx.startProfiling(&h.startWG) - defer sw.Stop() + sw, swStart := hCtx.startProfiling(&h.startWG) + defer func() { + sw.Stop() + hCtx.scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swStart)) + }() if request.GetForwardedFrom() != "" { hCtx.scope.IncCounter(metrics.ForwardedPerTaskListCounter) @@ -274,8 +287,11 @@ func (h *handlerImpl) QueryWorkflow( metrics.MatchingQueryWorkflowScope, ) - sw := hCtx.startProfiling(&h.startWG) - defer sw.Stop() + sw, swStart := hCtx.startProfiling(&h.startWG) + defer func() { + sw.Stop() + hCtx.scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swStart)) + }() if request.GetForwardedFrom() != "" { hCtx.scope.IncCounter(metrics.ForwardedPerTaskListCounter) @@ -304,8 +320,11 @@ func (h *handlerImpl) RespondQueryTaskCompleted( metrics.MatchingRespondQueryTaskCompletedScope, ) - sw := hCtx.startProfiling(&h.startWG) - defer sw.Stop() + sw, swStart := hCtx.startProfiling(&h.startWG) + defer func() { + sw.Stop() + hCtx.scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swStart)) + }() // Count the request in the RPS, but we still accept it even if RPS is exceeded h.workerRateLimiter.Allow(quotas.Info{Domain: domainName}) @@ -327,8 +346,11 @@ func (h *handlerImpl) CancelOutstandingPoll(ctx context.Context, metrics.MatchingCancelOutstandingPollScope, ) - sw := hCtx.startProfiling(&h.startWG) - defer sw.Stop() + sw, swStart := hCtx.startProfiling(&h.startWG) + defer func() { + sw.Stop() + hCtx.scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swStart)) + }() // Count the request in the RPS, but we still accept it even if RPS is exceeded h.workerRateLimiter.Allow(quotas.Info{Domain: domainName}) @@ -354,8 +376,11 
@@ func (h *handlerImpl) DescribeTaskList( metrics.MatchingDescribeTaskListScope, ) - sw := hCtx.startProfiling(&h.startWG) - defer sw.Stop() + sw, swStart := hCtx.startProfiling(&h.startWG) + defer func() { + sw.Stop() + hCtx.scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swStart)) + }() if ok := h.userRateLimiter.Allow(quotas.Info{Domain: domainName}); !ok { return nil, hCtx.handleErr(errMatchingHostThrottle) @@ -381,8 +406,11 @@ func (h *handlerImpl) ListTaskListPartitions( h.logger, ) - sw := hCtx.startProfiling(&h.startWG) - defer sw.Stop() + sw, swStart := hCtx.startProfiling(&h.startWG) + defer func() { + sw.Stop() + hCtx.scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swStart)) + }() if ok := h.userRateLimiter.Allow(quotas.Info{Domain: request.GetDomain()}); !ok { return nil, hCtx.handleErr(errMatchingHostThrottle) @@ -408,8 +436,11 @@ func (h *handlerImpl) GetTaskListsByDomain( h.logger, ) - sw := hCtx.startProfiling(&h.startWG) - defer sw.Stop() + sw, swStart := hCtx.startProfiling(&h.startWG) + defer func() { + sw.Stop() + hCtx.scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swStart)) + }() if ok := h.userRateLimiter.Allow(quotas.Info{Domain: request.GetDomain()}); !ok { return nil, hCtx.handleErr(errMatchingHostThrottle) @@ -433,8 +464,11 @@ func (h *handlerImpl) UpdateTaskListPartitionConfig( metrics.MatchingUpdateTaskListPartitionConfigScope, ) - sw := hCtx.startProfiling(&h.startWG) - defer sw.Stop() + sw, swStart := hCtx.startProfiling(&h.startWG) + defer func() { + sw.Stop() + hCtx.scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swStart)) + }() if ok := h.userRateLimiter.Allow(quotas.Info{Domain: domainName}); !ok { return nil, hCtx.handleErr(errMatchingHostThrottle) @@ -458,8 +492,11 @@ func (h *handlerImpl) RefreshTaskListPartitionConfig( metrics.MatchingRefreshTaskListPartitionConfigScope, ) - sw := hCtx.startProfiling(&h.startWG) - defer sw.Stop() + sw, swStart := hCtx.startProfiling(&h.startWG) + defer func() { + sw.Stop() + hCtx.scope.RecordHistogramDuration(metrics.CadenceLatencyPerTaskListHistogram, time.Since(swStart)) + }() // Count the request in the RPS, but we still accept it even if RPS is exceeded h.userRateLimiter.Allow(quotas.Info{Domain: domainName}) diff --git a/service/matching/tasklist/matcher.go b/service/matching/tasklist/matcher.go index 2e4542400c0..e6ccff609fb 100644 --- a/service/matching/tasklist/matcher.go +++ b/service/matching/tasklist/matcher.go @@ -346,6 +346,7 @@ forLoop: event.Log(e) tm.scope.IncCounter(metrics.AsyncMatchLocalPollCounterPerTaskList) tm.scope.RecordTimer(metrics.AsyncMatchLocalPollAttemptPerTaskList, time.Duration(attempt)) + tm.scope.IntExponentialHistogram(metrics.AsyncMatchLocalPollAttemptPerTaskListHistogram, attempt) tm.scope.RecordTimer(metrics.AsyncMatchLocalPollLatencyPerTaskList, time.Since(startT)) return nil case token := <-tm.fwdrAddReqTokenC(): @@ -377,6 +378,7 @@ forLoop: cancel() tm.scope.IncCounter(metrics.AsyncMatchLocalPollAfterForwardFailedCounterPerTaskList) tm.scope.RecordTimer(metrics.AsyncMatchLocalPollAfterForwardFailedAttemptPerTaskList, time.Duration(attempt)) + tm.scope.IntExponentialHistogram(metrics.AsyncMatchLocalPollAfterForwardFailedAttemptPerTaskListHistogram, attempt) tm.scope.RecordTimer(metrics.AsyncMatchLocalPollAfterForwardFailedLatencyPerTaskList, time.Since(startT)) return nil case <-childCtx.Done(): @@ -391,6 +393,7 @@ forLoop: 
event.Log(e) tm.scope.IncCounter(metrics.AsyncMatchForwardPollCounterPerTaskList) tm.scope.RecordTimer(metrics.AsyncMatchForwardPollAttemptPerTaskList, time.Duration(attempt)) + tm.scope.IntExponentialHistogram(metrics.AsyncMatchForwardPollAttemptPerTaskListHistogram, attempt) tm.scope.RecordTimer(metrics.AsyncMatchForwardPollLatencyPerTaskList, time.Since(startT)) // at this point, we forwarded the task to a parent partition which @@ -438,6 +441,7 @@ func (tm *taskMatcherImpl) Poll(ctx context.Context, isolationGroup string) (*In // try local match first without blocking until context timeout if task, err = tm.pollNonBlocking(ctxWithCancelPropagation, isolatedTaskC, tm.taskC, tm.queryTaskC); err == nil { tm.scope.RecordTimer(metrics.PollLocalMatchLatencyPerTaskList, time.Since(startT)) + tm.scope.RecordHistogramDuration(metrics.PollLocalMatchLatencyPerTaskListHistogram, time.Since(startT)) return task, nil } // there is no local poller available to pickup this task. Now block waiting @@ -466,6 +470,7 @@ func (tm *taskMatcherImpl) PollForQuery(ctx context.Context) (*InternalTask, err // try local match first without blocking until context timeout if task, err := tm.pollNonBlocking(ctx, nil, nil, tm.queryTaskC); err == nil { tm.scope.RecordTimer(metrics.PollLocalMatchLatencyPerTaskList, time.Since(startT)) + tm.scope.RecordHistogramDuration(metrics.PollLocalMatchLatencyPerTaskListHistogram, time.Since(startT)) return task, nil } @@ -498,6 +503,7 @@ func (tm *taskMatcherImpl) pollOrForward( tm.scope.IncCounter(metrics.PollSuccessWithSyncPerTaskListCounter) } tm.scope.RecordTimer(metrics.PollLocalMatchLatencyPerTaskList, time.Since(startT)) + tm.scope.RecordHistogramDuration(metrics.PollLocalMatchLatencyPerTaskListHistogram, time.Since(startT)) tm.scope.IncCounter(metrics.PollSuccessPerTaskListCounter) event.Log(event.E{ TaskListName: tm.tasklist.GetName(), @@ -518,6 +524,7 @@ func (tm *taskMatcherImpl) pollOrForward( tm.scope.IncCounter(metrics.PollSuccessWithSyncPerTaskListCounter) } tm.scope.RecordTimer(metrics.PollLocalMatchLatencyPerTaskList, time.Since(startT)) + tm.scope.RecordHistogramDuration(metrics.PollLocalMatchLatencyPerTaskListHistogram, time.Since(startT)) tm.scope.IncCounter(metrics.PollSuccessPerTaskListCounter) event.Log(event.E{ TaskListName: tm.tasklist.GetName(), @@ -559,6 +566,7 @@ func (tm *taskMatcherImpl) pollOrForward( if task, err := tm.fwdr.ForwardPoll(ctx); err == nil { token.release() tm.scope.RecordTimer(metrics.PollForwardMatchLatencyPerTaskList, time.Since(startT)) + tm.scope.RecordHistogramDuration(metrics.PollForwardMatchLatencyPerTaskListHistogram, time.Since(startT)) event.Log(event.E{ TaskListName: tm.tasklist.GetName(), TaskListType: tm.tasklist.GetType(), @@ -585,6 +593,7 @@ func (tm *taskMatcherImpl) poll( tm.scope.IncCounter(metrics.PollSuccessWithSyncPerTaskListCounter) } tm.scope.RecordTimer(metrics.PollLocalMatchAfterForwardFailedLatencyPerTaskList, time.Since(startT)) + tm.scope.RecordHistogramDuration(metrics.PollLocalMatchAfterForwardFailedLatencyPerTaskListHistogram, time.Since(startT)) tm.scope.IncCounter(metrics.PollSuccessPerTaskListCounter) event.Log(event.E{ TaskListName: tm.tasklist.GetName(), @@ -605,6 +614,7 @@ func (tm *taskMatcherImpl) poll( tm.scope.IncCounter(metrics.PollSuccessWithSyncPerTaskListCounter) } tm.scope.RecordTimer(metrics.PollLocalMatchAfterForwardFailedLatencyPerTaskList, time.Since(startT)) + tm.scope.RecordHistogramDuration(metrics.PollLocalMatchAfterForwardFailedLatencyPerTaskListHistogram, time.Since(startT)) 
tm.scope.IncCounter(metrics.PollSuccessPerTaskListCounter) event.Log(event.E{ TaskListName: tm.tasklist.GetName(), diff --git a/service/matching/tasklist/matcher_test.go b/service/matching/tasklist/matcher_test.go index bd7323f0580..a2d332c388e 100644 --- a/service/matching/tasklist/matcher_test.go +++ b/service/matching/tasklist/matcher_test.go @@ -483,6 +483,8 @@ func (t *MatcherTestSuite) TestMustOfferRemoteRateLimit() { scope := mocks.Scope{} scope.On("IncCounter", metrics.AsyncMatchForwardTaskThrottleErrorPerTasklist) scope.On("RecordTimer", mock.Anything, mock.Anything) + scope.On("IntExponentialHistogram", mock.Anything, mock.AnythingOfType("int")).Return().Maybe() + scope.On("RecordHistogramDuration", mock.Anything, mock.AnythingOfType("time.Duration")).Return().Maybe() t.matcher.scope = &scope completionFunc := func(*persistence.TaskInfo, error) {} for i := 0; i < 5; i++ { diff --git a/service/matching/tasklist/task_reader.go b/service/matching/tasklist/task_reader.go index b5b7fb99ac7..8f81f91a3dc 100644 --- a/service/matching/tasklist/task_reader.go +++ b/service/matching/tasklist/task_reader.go @@ -479,9 +479,11 @@ func (tr *taskReader) dispatchSingleTaskFromBuffer(taskInfo *persistence.TaskInf } task := newInternalTask(taskInfo, tr.completeTask, types.TaskSourceDbBacklog, "", false, nil, isolationGroup) dispatchCtx, cancel := tr.newDispatchContext(isolationGroup, isolationDuration) + asyncMatchStart := time.Now() timerScope := tr.scope.StartTimer(metrics.AsyncMatchLatencyPerTaskList) err := tr.dispatchTask(dispatchCtx, task) timerScope.Stop() + tr.scope.RecordHistogramDuration(metrics.AsyncMatchLatencyPerTaskListHistogram, time.Since(asyncMatchStart)) cancel() if err == nil { diff --git a/service/worker/diagnostics/workflow.go b/service/worker/diagnostics/workflow.go index b38f90eaa17..5857412a6cb 100644 --- a/service/worker/diagnostics/workflow.go +++ b/service/worker/diagnostics/workflow.go @@ -110,8 +110,12 @@ type retryIssuesResult struct { func (w *dw) DiagnosticsWorkflow(ctx workflow.Context, params DiagnosticsWorkflowInput) (*DiagnosticsWorkflowResult, error) { scope := w.metricsClient.Scope(metrics.DiagnosticsWorkflowScope, metrics.DomainTag(params.Domain)) scope.IncCounter(metrics.DiagnosticsWorkflowStartedCount) + diagStart := workflow.Now(ctx) sw := scope.StartTimer(metrics.DiagnosticsWorkflowExecutionLatency) - defer sw.Stop() + defer func() { + sw.Stop() + scope.RecordHistogramDuration(metrics.DiagnosticsWorkflowExecutionLatencyHistogram, workflow.Now(ctx).Sub(diagStart)) + }() var timeoutsResult *timeoutDiagnostics var failureResult *failureDiagnostics diff --git a/service/worker/indexer/esProcessor.go b/service/worker/indexer/esProcessor.go index 172ba2096a5..1e86092edb9 100644 --- a/service/worker/indexer/esProcessor.go +++ b/service/worker/indexer/esProcessor.go @@ -60,6 +60,8 @@ type ( kafkaMessageWithMetrics struct { // value of ESProcessorImpl.mapToKafkaMsg message messaging.Message swFromAddToAck *metrics.Stopwatch // metric from message add to process, to message ack/nack + processStart time.Time + scope metrics.Scope } ) @@ -113,8 +115,9 @@ func (p *ESProcessorImpl) Add(request *bulk.GenericBulkableAddRequest, key strin actionWhenFoundDuplicates := func(key interface{}, value interface{}) error { return kafkaMsg.Ack() } + esProcessStart := time.Now() sw := p.scope.StartTimer(metrics.ESProcessorProcessMsgLatency) - mapVal := newKafkaMessageWithMetrics(kafkaMsg, &sw) + mapVal := newKafkaMessageWithMetrics(kafkaMsg, &sw, esProcessStart, p.scope) _, isDup, _ 
:= p.mapToKafkaMsg.PutOrDo(key, mapVal, actionWhenFoundDuplicates) if isDup { return @@ -360,10 +363,12 @@ func getErrorMsgFromESResp(resp *bulk.GenericBulkResponseItem) string { return errMsg } -func newKafkaMessageWithMetrics(kafkaMsg messaging.Message, stopwatch *metrics.Stopwatch) *kafkaMessageWithMetrics { +func newKafkaMessageWithMetrics(kafkaMsg messaging.Message, stopwatch *metrics.Stopwatch, processStart time.Time, scope metrics.Scope) *kafkaMessageWithMetrics { return &kafkaMessageWithMetrics{ message: kafkaMsg, swFromAddToAck: stopwatch, + processStart: processStart, + scope: scope, } } @@ -371,6 +376,7 @@ func (km *kafkaMessageWithMetrics) Ack() { km.message.Ack() // nolint:errcheck if km.swFromAddToAck != nil { km.swFromAddToAck.Stop() + km.scope.RecordHistogramDuration(metrics.ESProcessorProcessMsgLatencyHistogram, time.Since(km.processStart)) } } @@ -378,5 +384,6 @@ func (km *kafkaMessageWithMetrics) Nack() { km.message.Nack() //nolint:errcheck if km.swFromAddToAck != nil { km.swFromAddToAck.Stop() + km.scope.RecordHistogramDuration(metrics.ESProcessorProcessMsgLatencyHistogram, time.Since(km.processStart)) } } diff --git a/service/worker/indexer/esProcessor_test.go b/service/worker/indexer/esProcessor_test.go index e05db5eeb9a..c26f1425697 100644 --- a/service/worker/indexer/esProcessor_test.go +++ b/service/worker/indexer/esProcessor_test.go @@ -196,9 +196,10 @@ func (s *esProcessorSuite) TestBulkAfterActionX() { } mockKafkaMsg := &msgMocks.Message{} - mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch) + mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch, time.Now(), s.esProcessor.scope) s.esProcessor.mapToKafkaMsg.Put(testKey, mapVal) mockKafkaMsg.On("Ack").Return(nil).Once() + s.mockScope.On("RecordHistogramDuration", metrics.ESProcessorProcessMsgLatencyHistogram, mock.AnythingOfType("time.Duration")).Once() s.esProcessor.bulkAfterAction(0, requests, response, nil) mockKafkaMsg.AssertExpectations(s.T()) } @@ -232,10 +233,11 @@ func (s *esProcessorSuite) TestBulkAfterAction_Nack() { payload := s.getEncodedMsg(wid, rid, domainID) mockKafkaMsg := &msgMocks.Message{} - mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch) + mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch, time.Now(), s.esProcessor.scope) s.esProcessor.mapToKafkaMsg.Put(testKey, mapVal) mockKafkaMsg.On("Nack").Return(nil).Once() mockKafkaMsg.On("Value").Return(payload).Once() + s.mockScope.On("RecordHistogramDuration", metrics.ESProcessorProcessMsgLatencyHistogram, mock.AnythingOfType("time.Duration")).Once() // s.mockBulkProcessor.On("RetrieveKafkaKey", request, mock.Anything, mock.Anything).Return(testKey) s.esProcessor.bulkAfterAction(0, requests, response, nil) mockKafkaMsg.AssertExpectations(s.T()) @@ -270,11 +272,12 @@ func (s *esProcessorSuite) TestBulkAfterAction_Error() { payload := s.getEncodedMsg(wid, rid, domainID) mockKafkaMsg := &msgMocks.Message{} - mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch) + mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch, time.Now(), s.esProcessor.scope) s.esProcessor.mapToKafkaMsg.Put(testKey, mapVal) mockKafkaMsg.On("Nack").Return(nil).Once() mockKafkaMsg.On("Value").Return(payload).Once() s.mockScope.On("IncCounter", metrics.ESProcessorFailures).Once() + s.mockScope.On("RecordHistogramDuration", metrics.ESProcessorProcessMsgLatencyHistogram, mock.AnythingOfType("time.Duration")).Once() s.esProcessor.bulkAfterAction(0, requests, response, &bulk.GenericError{Details: 
fmt.Errorf("some error")}) } @@ -307,12 +310,13 @@ func (s *esProcessorSuite) TestBulkAfterAction_Error_Nack() { payload := s.getEncodedMsg(wid, rid, domainID) mockKafkaMsg := &msgMocks.Message{} - mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch) + mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch, time.Now(), s.esProcessor.scope) s.esProcessor.mapToKafkaMsg.Put(testKey, mapVal) mockKafkaMsg.On("Nack").Return(nil).Once() mockKafkaMsg.On("Ack").Return(nil).Once() // Expect Ack to be called mockKafkaMsg.On("Value").Return(payload).Once() s.mockScope.On("IncCounter", metrics.ESProcessorFailures).Once() + s.mockScope.On("RecordHistogramDuration", metrics.ESProcessorProcessMsgLatencyHistogram, mock.AnythingOfType("time.Duration")).Once() s.esProcessor.bulkAfterAction(0, requests, response, &bulk.GenericError{Status: 404, Details: fmt.Errorf("some error")}) } @@ -329,6 +333,7 @@ func (s *esProcessorSuite) TestAckKafkaMsg() { s.Equal(1, s.esProcessor.mapToKafkaMsg.Len()) mockKafkaMsg.On("Ack").Return(nil).Once() + s.mockScope.On("RecordHistogramDuration", metrics.ESProcessorProcessMsgLatencyHistogram, mock.AnythingOfType("time.Duration")).Once() s.esProcessor.ackKafkaMsg(key) mockKafkaMsg.AssertExpectations(s.T()) s.Equal(0, s.esProcessor.mapToKafkaMsg.Len()) @@ -347,6 +352,7 @@ func (s *esProcessorSuite) TestNackKafkaMsg() { s.Equal(1, s.esProcessor.mapToKafkaMsg.Len()) mockKafkaMsg.On("Nack").Return(nil).Once() + s.mockScope.On("RecordHistogramDuration", metrics.ESProcessorProcessMsgLatencyHistogram, mock.AnythingOfType("time.Duration")).Once() s.esProcessor.nackKafkaMsg(key) mockKafkaMsg.AssertExpectations(s.T()) s.Equal(0, s.esProcessor.mapToKafkaMsg.Len()) @@ -377,7 +383,7 @@ func (s *esProcessorSuite) TestGetMsgWithInfo() { mockKafkaMsg := &msgMocks.Message{} mockKafkaMsg.On("Value").Return(payload).Once() - mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch) + mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch, time.Now(), s.esProcessor.scope) s.esProcessor.mapToKafkaMsg.Put(testKey, mapVal) wid, rid, domainID := s.esProcessor.getMsgWithInfo(testKey) s.Equal(testWid, wid) @@ -389,7 +395,7 @@ func (s *esProcessorSuite) TestGetMsgInfo_Error() { testKey := "test-key" mockKafkaMsg := &msgMocks.Message{} mockKafkaMsg.On("Value").Return([]byte{}).Once() - mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch) + mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch, time.Now(), s.esProcessor.scope) s.esProcessor.mapToKafkaMsg.Put(testKey, mapVal) wid, rid, domainID := s.esProcessor.getMsgWithInfo(testKey) s.Equal("", wid) @@ -523,13 +529,14 @@ func (s *esProcessorSuite) TestBulkAfterAction_Nack_Shadow_WithError() { payload := s.getEncodedMsg(wid, rid, domainID) mockKafkaMsg := &msgMocks.Message{} - mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch) + mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch, time.Now(), s.esProcessor.scope) s.esProcessor.mapToKafkaMsg.Put(testKey, mapVal) // Mock Kafka message Nack and Value mockKafkaMsg.On("Nack").Return(nil).Once() mockKafkaMsg.On("Value").Return(payload).Once() s.mockScope.On("IncCounter", mock.AnythingOfType("metrics.MetricIdx")).Return() + s.mockScope.On("RecordHistogramDuration", metrics.ESProcessorProcessMsgLatencyHistogram, mock.AnythingOfType("time.Duration")).Once() // Execute bulkAfterAction for primary processor with error s.esProcessor.bulkAfterAction(0, requests, response, mockErr) } @@ -563,13 +570,14 @@ func (s 
*esProcessorSuite) TestBulkAfterAction_Shadow_Fail_WithoutError() { payload := s.getEncodedMsg(wid, rid, domainID) mockKafkaMsg := &msgMocks.Message{} - mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch) + mapVal := newKafkaMessageWithMetrics(mockKafkaMsg, &testStopWatch, time.Now(), s.esProcessor.scope) s.esProcessor.mapToKafkaMsg.Put(testKey, mapVal) // Mock Kafka message Nack and Value mockKafkaMsg.On("Nack").Return(nil).Once() mockKafkaMsg.On("Value").Return(payload).Once() s.mockScope.On("IncCounter", mock.AnythingOfType("int")).Return() + s.mockScope.On("RecordHistogramDuration", metrics.ESProcessorProcessMsgLatencyHistogram, mock.AnythingOfType("time.Duration")).Once() // Execute bulkAfterAction for primary processor with error s.esProcessor.bulkAfterAction(0, requests, response, nil) } diff --git a/service/worker/indexer/indexer.go b/service/worker/indexer/indexer.go index 72b1f37fe2d..153980327fc 100644 --- a/service/worker/indexer/indexer.go +++ b/service/worker/indexer/indexer.go @@ -195,9 +195,11 @@ func (i *Indexer) messageProcessLoop(workerWG *sync.WaitGroup) { defer workerWG.Done() for msg := range i.consumer.Messages() { + indexProcessStart := time.Now() sw := i.scope.StartTimer(metrics.IndexProcessorProcessMsgLatency) err := i.process(msg) sw.Stop() + i.scope.RecordHistogramDuration(metrics.IndexProcessorProcessMsgLatencyHistogram, time.Since(indexProcessStart)) if err != nil { msg.Nack() //nolint:errcheck }
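
Every hunk in this change applies one mechanical pattern: capture time.Now() alongside the existing StartTimer call, then wrap the old `defer sw.Stop()` in a closure that also feeds the same interval to RecordHistogramDuration under a parallel *Histogram metric name; integer-valued metrics (queue depth, match attempts) go through IntExponentialHistogram instead. The sketch below is a minimal, self-contained illustration of that pattern, not code from this change: Scope and Stopwatch are reduced stand-ins for the real metrics interfaces, and the recordLatency helper is hypothetical — the diff inlines the closure at every call site, since many of these files are code-generated wrappers.

package main

import (
	"fmt"
	"time"
)

// Stopwatch mirrors the Stop-only surface the wrappers use.
type Stopwatch interface{ Stop() }

// Scope is a reduced stand-in for the subset of the metrics scope used here.
type Scope interface {
	StartTimer(metric string) Stopwatch
	RecordHistogramDuration(metric string, d time.Duration)
}

// printScope is a toy Scope that prints what it would emit.
type printScope struct{}

type printStopwatch struct {
	metric string
	start  time.Time
}

func (s printStopwatch) Stop() {
	fmt.Printf("timer     %s = %v\n", s.metric, time.Since(s.start))
}

func (printScope) StartTimer(metric string) Stopwatch {
	return printStopwatch{metric: metric, start: time.Now()}
}

func (printScope) RecordHistogramDuration(metric string, d time.Duration) {
	fmt.Printf("histogram %s = %v\n", metric, d)
}

// recordLatency is a hypothetical helper capturing the repeated pattern:
// start the legacy timer, and return a func (meant for defer) that stops it
// and records the identical interval into the histogram counterpart.
func recordLatency(scope Scope, timerMetric, histogramMetric string) func() {
	start := time.Now()
	sw := scope.StartTimer(timerMetric)
	return func() {
		sw.Stop()
		scope.RecordHistogramDuration(histogramMetric, time.Since(start))
	}
}

func handleRequest(scope Scope) {
	// Equivalent to the inlined form used throughout the diff:
	//   swStart := time.Now()
	//   sw := scope.StartTimer(metrics.CadenceLatency)
	//   defer func() { sw.Stop(); scope.RecordHistogramDuration(metrics.CadenceLatencyHistogram, time.Since(swStart)) }()
	defer recordLatency(scope, "cadence_latency", "cadence_latency_histogram")()

	time.Sleep(5 * time.Millisecond) // stand-in for the wrapped handler call
}

func main() { handleRequest(printScope{}) }

Note the trailing () in the defer statement: recordLatency runs immediately at function entry, starting both the stopwatch and the histogram clock, and only the returned closure is deferred. Dropping those parentheses would defer the start itself and record a near-zero latency.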