
Commit e39f14d

[tailsamplingprocessor] Replace misleading policy latency metric (open-telemetry#43510)
#### Description

This PR removes the metric `otelcol_processor_tail_sampling_sampling_decision_latency` and adds a pair of replacement metrics, `processor_tail_sampling_sampling_policy_execution_time_sum` and `processor_tail_sampling_sampling_policy_execution_count`. It implements the feedback received in open-telemetry#42620.

As originally reported in open-telemetry#38502, the removed metric does not measure the latency of a particular policy. Instead, it measures the time elapsed since policy evaluation began, which is mostly not a useful signal. To make matters worse, profiling shows that recording this metric accounts for >20% of the CPU time spent evaluating policies. Since the tailsamplingprocessor is bottlenecked on its single-threaded decision loop, that 20% is much better spent making decisions than recording a misleading metric.

As a replacement, I've added a metric that tracks the total time spent executing each policy, along with a count of total executions. Slow policies can still be identified by their total or average execution time, without the CPU, GC-pressure, and synchronization cost of recording a histogram in the inner loop.

#### Link to tracking issue

Fixes open-telemetry#38502 - closed by accident, and I am not otel enough to reopen it
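To make the trade-off concrete, here is a minimal, self-contained Go sketch of the accumulate-then-flush pattern described above. `policyStats` and `evaluateBatch` are illustrative stand-ins, not the processor's real types; the point is that the hot loop only touches local memory, and telemetry is emitted once per tick:

```go
package main

import (
	"fmt"
	"time"
)

// policyStats accumulates per-policy totals locally, with no locks or
// metric-SDK calls inside the evaluation loop.
type policyStats struct {
	execTime  time.Duration // cumulative time spent in this policy
	execCount int64         // number of evaluations
}

func evaluateBatch(numTraces int, policies []func(), stats []policyStats) {
	// Hot path: for every trace, run every policy; only plain field updates here.
	for t := 0; t < numTraces; t++ {
		for i, evaluate := range policies {
			start := time.Now()
			evaluate() // stand-in for evaluator.Evaluate
			stats[i].execTime += time.Since(start)
			stats[i].execCount++
		}
	}
	// Flush once per tick: one update per policy instead of one histogram
	// record per policy per trace.
	for i, s := range stats {
		fmt.Printf("policy %d: total=%dµs executions=%d\n",
			i, s.execTime.Microseconds(), s.execCount)
	}
}

func main() {
	policies := []func(){
		func() {},                                    // cheap policy
		func() { time.Sleep(50 * time.Microsecond) }, // expensive policy
	}
	evaluateBatch(1000, policies, make([]policyStats, len(policies)))
}
```

With P policies and T traces per tick, the flush step issues P counter updates instead of the P×T histogram records the old code performed.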
1 parent 29092fa commit e39f14d

9 files changed, +219 -69 lines changed
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: breaking
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: processor/tail_sampling
+
+# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Replace policy latency metric with total time spent executing specific sampling policy.
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [42620]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext: The existing latency metric was misleading and expensive to compute. The new cpu time metric can be used to find expensive policies instead.
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: []

processor/tailsamplingprocessor/documentation.md

Lines changed: 28 additions & 8 deletions
@@ -75,14 +75,6 @@ Counts the arrival of new traces [Development]
 | ---- | ----------- | ---------- | --------- | --------- |
 | {traces} | Sum | Int | true | Development |
 
-### otelcol_processor_tail_sampling_sampling_decision_latency
-
-Latency (in microseconds) of a given sampling policy [Development]
-
-| Unit | Metric Type | Value Type | Stability |
-| ---- | ----------- | ---------- | --------- |
-| µs | Histogram | Int | Development |
-
 ### otelcol_processor_tail_sampling_sampling_decision_timer_latency
 
 Latency (in milliseconds) of each run of the sampling decision timer [Development]
@@ -107,6 +99,34 @@ Count of sampling policy evaluation errors [Development]
 | ---- | ----------- | ---------- | --------- | --------- |
 | {errors} | Sum | Int | true | Development |
 
+### otelcol_processor_tail_sampling_sampling_policy_execution_count
+
+Total number of executions of a specific sampling policy [Development]
+
+| Unit | Metric Type | Value Type | Monotonic | Stability |
+| ---- | ----------- | ---------- | --------- | --------- |
+| {executions} | Sum | Int | true | Development |
+
+#### Attributes
+
+| Name | Description | Values |
+| ---- | ----------- | ------ |
+| policy | Name of the policy | Any Str |
+
+### otelcol_processor_tail_sampling_sampling_policy_execution_time_sum
+
+Total time spent (in microseconds) executing a specific sampling policy [Development]
+
+| Unit | Metric Type | Value Type | Monotonic | Stability |
+| ---- | ----------- | ---------- | --------- | --------- |
+| µs | Sum | Int | true | Development |
+
+#### Attributes
+
+| Name | Description | Values |
+| ---- | ----------- | ------ |
+| policy | Name of the policy | Any Str |
+
 ### otelcol_processor_tail_sampling_sampling_trace_dropped_too_early
 
 Count of traces that needed to be dropped before the configured wait time [Development]
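Dividing the first new metric by the second recovers the signal the removed histogram was meant to provide: average time per evaluation, per policy. A hypothetical helper (not part of this change) showing the arithmetic on deltas between two scrapes:

```go
package main

import "fmt"

// avgMicros returns the mean execution time per run of one policy, given the
// increase of ..._policy_execution_time_sum (µs) and
// ..._policy_execution_count between two scrapes.
func avgMicros(timeSumDelta, countDelta int64) float64 {
	if countDelta == 0 {
		return 0 // policy did not run in this window
	}
	return float64(timeSumDelta) / float64(countDelta)
}

func main() {
	// Example: a policy accumulated 1500 µs over 300 executions in the window.
	fmt.Printf("avg: %.2f µs per execution\n", avgMicros(1500, 300)) // 5.00
}
```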

processor/tailsamplingprocessor/internal/metadata/generated_telemetry.go

Lines changed: 14 additions & 8 deletions
Some generated files are not rendered by default.

processor/tailsamplingprocessor/internal/metadatatest/generated_telemetrytest.go

Lines changed: 32 additions & 15 deletions
Some generated files are not rendered by default.

processor/tailsamplingprocessor/internal/metadatatest/generated_telemetrytest_test.go

Lines changed: 8 additions & 4 deletions
Some generated files are not rendered by default.

processor/tailsamplingprocessor/metadata.yaml

Lines changed: 22 additions & 10 deletions
@@ -83,16 +83,6 @@ telemetry:
       value_type: int
       monotonic: true
 
-  processor_tail_sampling_sampling_decision_latency:
-    description: Latency (in microseconds) of a given sampling policy
-    stability:
-      level: development
-    unit: µs
-    enabled: true
-    histogram:
-      value_type: int
-      bucket_boundaries: [1, 2, 5, 10, 25, 50, 75, 100, 150, 200, 300, 400, 500, 750, 1000, 2000, 3000, 4000, 5000, 10000, 20000, 30000, 50000]
-
   processor_tail_sampling_sampling_decision_timer_latency:
     description: Latency (in milliseconds) of each run of the sampling decision timer
     stability:
@@ -102,6 +92,7 @@ telemetry:
     histogram:
       value_type: int
       bucket_boundaries: [1, 2, 5, 10, 25, 50, 75, 100, 150, 200, 300, 400, 500, 750, 1000, 2000, 3000, 4000, 5000, 10000, 20000, 30000, 50000]
+
   processor_tail_sampling_sampling_late_span_age:
     description: Time (in seconds) from the sampling decision was taken and the arrival of a late span
     stability:
@@ -121,6 +112,27 @@ telemetry:
       value_type: int
       monotonic: true
 
+  processor_tail_sampling_sampling_policy_execution_count:
+    description: Total number of executions of a specific sampling policy
+    stability:
+      level: development
+    unit: "{executions}"
+    enabled: true
+    sum:
+      value_type: int
+      monotonic: true
+    attributes: [policy]
+
+  processor_tail_sampling_sampling_policy_execution_time_sum:
+    description: Total time spent (in microseconds) executing a specific sampling policy
+    stability:
+      level: development
+    unit: µs
+    enabled: true
+    sum:
+      value_type: int
+      monotonic: true
+    attributes: [policy]
 
   processor_tail_sampling_sampling_trace_dropped_too_early:
     description: Count of traces that needed to be dropped before the configured wait time
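The regenerated instrument code isn't rendered in this view. As a rough mental model (an assumption about the mdatagen output, not this component's actual generated file), each new entry amounts to a monotonic int64 counter with a `policy` attribute, roughly equivalent to this plain OpenTelemetry-Go setup:

```go
package main

import (
	"context"

	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/metric"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
)

func main() {
	meter := sdkmetric.NewMeterProvider().Meter("tailsampling-sketch")

	// Monotonic int64 sums, mirroring the metadata.yaml entries above.
	execTime, _ := meter.Int64Counter(
		"processor_tail_sampling_sampling_policy_execution_time_sum",
		metric.WithUnit("µs"),
		metric.WithDescription("Total time spent (in microseconds) executing a specific sampling policy"),
	)
	execCount, _ := meter.Int64Counter(
		"processor_tail_sampling_sampling_policy_execution_count",
		metric.WithUnit("{executions}"),
		metric.WithDescription("Total number of executions of a specific sampling policy"),
	)

	// One update per policy per tick, tagged with the policy name.
	policyAttr := metric.WithAttributes(attribute.String("policy", "example-policy"))
	execTime.Add(context.Background(), 1500, policyAttr)
	execCount.Add(context.Background(), 300, policyAttr)
}
```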

processor/tailsamplingprocessor/processor.go

Lines changed: 34 additions & 12 deletions
@@ -269,28 +269,45 @@ type policyDecisionMetrics struct {
 	spansSampled int64
 }
 
-type policyMetrics struct {
+type policyTickMetrics struct {
 	idNotFoundOnMapCount, evaluateErrorCount, decisionSampled, decisionNotSampled, decisionDropped int64
 	tracesSampledByPolicyDecision []map[samplingpolicy.Decision]policyDecisionMetrics
+	cumulativeExecutionTime       []perPolicyExecutionTime
 }
 
-func newPolicyMetrics(numPolicies int) *policyMetrics {
+// perPolicyExecutionTime is a struct for holding the cumulative execution time
+// and number of executions of a policy. This is an optimization to avoid
+// instrumentation overhead in the decision making loop.
+type perPolicyExecutionTime struct {
+	executionTime  time.Duration
+	executionCount int64
+}
+
+func newPolicyTickMetrics(numPolicies int) *policyTickMetrics {
 	tracesSampledByPolicyDecision := make([]map[samplingpolicy.Decision]policyDecisionMetrics, numPolicies)
 	for i := range tracesSampledByPolicyDecision {
 		tracesSampledByPolicyDecision[i] = make(map[samplingpolicy.Decision]policyDecisionMetrics)
 	}
-	return &policyMetrics{
+	return &policyTickMetrics{
 		tracesSampledByPolicyDecision: tracesSampledByPolicyDecision,
+		cumulativeExecutionTime:       make([]perPolicyExecutionTime, numPolicies),
 	}
 }
 
-func (m *policyMetrics) addDecision(policyIndex int, decision samplingpolicy.Decision, spansSampled int64) {
+func (m *policyTickMetrics) addDecision(policyIndex int, decision samplingpolicy.Decision, spansSampled int64) {
 	stats := m.tracesSampledByPolicyDecision[policyIndex][decision]
 	stats.tracesSampled++
 	stats.spansSampled += spansSampled
 	m.tracesSampledByPolicyDecision[policyIndex][decision] = stats
 }
 
+func (m *policyTickMetrics) addDecisionTime(policyIndex int, decisionTime time.Duration) {
+	perPolicyExecutionTime := m.cumulativeExecutionTime[policyIndex]
+	perPolicyExecutionTime.executionTime += decisionTime
+	perPolicyExecutionTime.executionCount++
+	m.cumulativeExecutionTime[policyIndex] = perPolicyExecutionTime
+}
+
 func (tsp *tailSamplingSpanProcessor) loadSamplingPolicy(cfgs []PolicyCfg) error {
 	telemetrySettings := tsp.set.TelemetrySettings
 	componentID := tsp.set.ID.Name()
@@ -380,8 +397,9 @@ func (tsp *tailSamplingSpanProcessor) samplingPolicyOnTick() {
 	}
 
 	ctx := context.Background()
-	metrics := newPolicyMetrics(len(tsp.policies))
+	metrics := newPolicyTickMetrics(len(tsp.policies))
 	startTime := time.Now()
+	globalTracesSampledByDecision := make(map[samplingpolicy.Decision]int64)
 
 	batch, _ := tsp.decisionBatcher.CloseCurrentAndTakeFirstBatch()
 	batchLen := len(batch)
@@ -396,8 +414,7 @@ func (tsp *tailSamplingSpanProcessor) samplingPolicyOnTick() {
 		trace.DecisionTime = time.Now()
 
 		decision := tsp.makeDecision(id, trace, metrics)
-
-		tsp.telemetry.ProcessorTailSamplingGlobalCountTracesSampled.Add(tsp.ctx, 1, decisionToAttributes[decision])
+		globalTracesSampledByDecision[decision]++
 
 		// Sampled or not, remove the batches
 		trace.Lock()
@@ -413,18 +430,24 @@ func (tsp *tailSamplingSpanProcessor) samplingPolicyOnTick() {
 		}
 	}
 
-	tsp.telemetry.ProcessorTailSamplingSamplingDecisionTimerLatency.Record(tsp.ctx, int64(time.Since(startTime)/time.Millisecond))
+	tsp.telemetry.ProcessorTailSamplingSamplingDecisionTimerLatency.Record(tsp.ctx, time.Since(startTime).Milliseconds())
 	tsp.telemetry.ProcessorTailSamplingSamplingTracesOnMemory.Record(tsp.ctx, int64(tsp.numTracesOnMap.Load()))
 	tsp.telemetry.ProcessorTailSamplingSamplingTraceDroppedTooEarly.Add(tsp.ctx, metrics.idNotFoundOnMapCount)
 	tsp.telemetry.ProcessorTailSamplingSamplingPolicyEvaluationError.Add(tsp.ctx, metrics.evaluateErrorCount)
 
+	for decision, count := range globalTracesSampledByDecision {
+		tsp.telemetry.ProcessorTailSamplingGlobalCountTracesSampled.Add(tsp.ctx, count, decisionToAttributes[decision])
+	}
+
 	for i, p := range tsp.policies {
 		for decision, stats := range metrics.tracesSampledByPolicyDecision[i] {
 			tsp.telemetry.ProcessorTailSamplingCountTracesSampled.Add(tsp.ctx, int64(stats.tracesSampled), p.attribute, decisionToAttributes[decision])
 			if telemetry.IsMetricStatCountSpansSampledEnabled() {
 				tsp.telemetry.ProcessorTailSamplingCountSpansSampled.Add(tsp.ctx, stats.spansSampled, p.attribute, decisionToAttributes[decision])
 			}
 		}
+		tsp.telemetry.ProcessorTailSamplingSamplingPolicyExecutionTimeSum.Add(tsp.ctx, metrics.cumulativeExecutionTime[i].executionTime.Microseconds(), p.attribute)
+		tsp.telemetry.ProcessorTailSamplingSamplingPolicyExecutionCount.Add(tsp.ctx, metrics.cumulativeExecutionTime[i].executionCount, p.attribute)
 	}
 
 	tsp.logger.Debug("Sampling policy evaluation completed",
@@ -437,7 +460,7 @@ func (tsp *tailSamplingSpanProcessor) samplingPolicyOnTick() {
 	)
 }
 
-func (tsp *tailSamplingSpanProcessor) makeDecision(id pcommon.TraceID, trace *samplingpolicy.TraceData, metrics *policyMetrics) samplingpolicy.Decision {
+func (tsp *tailSamplingSpanProcessor) makeDecision(id pcommon.TraceID, trace *samplingpolicy.TraceData, metrics *policyTickMetrics) samplingpolicy.Decision {
 	finalDecision := samplingpolicy.NotSampled
 	samplingDecisions := map[samplingpolicy.Decision]*policy{
 		samplingpolicy.Error: nil,
@@ -449,13 +472,12 @@ func (tsp *tailSamplingSpanProcessor) makeDecision(id pcommon.TraceID, trace *sa
 	}
 
 	ctx := context.Background()
-	startTime := time.Now()
 
 	// Check all policies before making a final decision.
 	for i, p := range tsp.policies {
+		startTime := time.Now()
 		decision, err := p.evaluator.Evaluate(ctx, id, trace)
-		latency := time.Since(startTime)
-		tsp.telemetry.ProcessorTailSamplingSamplingDecisionLatency.Record(ctx, int64(latency/time.Microsecond), p.attribute)
+		metrics.addDecisionTime(i, time.Since(startTime))
 
 		if err != nil {
 			if samplingDecisions[samplingpolicy.Error] == nil {
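The last hunk is the heart of the fix. In the old code, `startTime` was captured once, before the policy loop, so the value recorded for policy i included the time spent in policies 0 through i-1. A standalone repro of that skew (illustrative stand-ins, not the processor's code):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Three policies that each cost the same ~1ms to evaluate.
	policies := []time.Duration{time.Millisecond, time.Millisecond, time.Millisecond}

	start := time.Now() // old code: captured once, outside the loop
	for i, cost := range policies {
		time.Sleep(cost) // stand-in for evaluator.Evaluate
		// The removed metric recorded time.Since(start) here, attributing
		// roughly 1ms, 2ms, and 3ms to three identical policies.
		fmt.Printf("policy %d recorded %v\n", i, time.Since(start).Round(time.Millisecond))
	}
}
```

Moving `startTime` inside the loop, as this commit does, makes each measurement cover only its own policy before it is accumulated via `addDecisionTime`.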

0 commit comments
