-
Notifications
You must be signed in to change notification settings - Fork 155
Expand file tree
/
Copy pathDataStreamsManager.cs
More file actions
360 lines (317 loc) · 14.8 KB
/
DataStreamsManager.cs
File metadata and controls
360 lines (317 loc) · 14.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
// <copyright file="DataStreamsManager.cs" company="Datadog">
// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License.
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2017 Datadog, Inc.
// </copyright>
#nullable enable
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using Datadog.Trace.Agent.DiscoveryService;
using Datadog.Trace.Configuration;
using Datadog.Trace.ContinuousProfiler;
using Datadog.Trace.DataStreamsMonitoring.Aggregation;
using Datadog.Trace.DataStreamsMonitoring.Hashes;
using Datadog.Trace.DataStreamsMonitoring.TransactionTracking;
using Datadog.Trace.Headers;
using Datadog.Trace.Logging;
using Datadog.Trace.PlatformHelpers;
using Datadog.Trace.Vendors.Serilog.Events;
namespace Datadog.Trace.DataStreamsMonitoring;
/// <summary>
/// Manages all the data streams monitoring behaviour
/// </summary>
internal sealed class DataStreamsManager
{
// Logger shared by every instance of this type
private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor<DataStreamsManager>();
// Written only by SetCheckpoint for Consume checkpoints; read back on Produce checkpoints that have no explicit parent
private static readonly AsyncLocal<PathwayContext?> LastConsumePathway = new(); // saves the context on consume checkpointing only
// Taken by both update callbacks so that the node hash and the "previous" values below change together
private readonly object _nodeHashUpdateLock = new();
// One rate limiter per operation name, created lazily by ShouldExtractSchema
private readonly ConcurrentDictionary<string, RateLimiter> _schemaRateLimiters = new();
// Kept so the container-tags subscription can be removed in DisposeAsync
private readonly IDiscoveryService _discoveryService;
// Built from TracerSettings.DataStreamsTransactionExtractors in the constructor
private readonly DataStreamsExtractorRegistry _registry;
// Subscription to settings changes; disposed in DisposeAsync
private readonly IDisposable _updateSubscription;
// Forwarded to the propagator when injecting a pathway context into binary headers
private readonly bool _isLegacyDsmHeadersEnabled;
private long _nodeHashBase; // note that this actually represents a `ulong` that we have done an unsafe cast for
// Last settings used to compute the node hash (reused when only the container tags hash changes)
private MutableSettings _previousMutableSettings;
// Last container tags hash used to compute the node hash (reused when only the settings change)
private string? _previousContainerTagsHash;
// Accessed through Volatile.Read/Write; set to false on dispose and when SetCheckpoint fails
private bool _isEnabled;
// Copied from TracerSettings.IsDataStreamsMonitoringInDefaultState in the constructor
private bool _isInDefaultState;
// Null when no writer was supplied, and after DisposeAsync has detached it
private IDataStreamsWriter? _writer;
/// <summary>
/// Initializes a new instance of the <see cref="DataStreamsManager"/> class.
/// </summary>
/// <param name="tracerSettings">Settings providing the legacy-header flag, the default-state flag, the transaction extractors and the settings manager</param>
/// <param name="writer">The writer that receives data streams points. When <c>null</c>, the manager starts disabled</param>
/// <param name="discoveryService">Service used to subscribe to agent configuration updates</param>
public DataStreamsManager(
    TracerSettings tracerSettings,
    IDataStreamsWriter? writer,
    IDiscoveryService discoveryService)
{
    _isEnabled = writer is not null;
    _isLegacyDsmHeadersEnabled = tracerSettings.IsDataStreamsLegacyHeadersEnabled;
    _writer = writer;
    _discoveryService = discoveryService;
    _isInDefaultState = tracerSettings.IsDataStreamsMonitoringInDefaultState;
    _registry = new DataStreamsExtractorRegistry(tracerSettings.DataStreamsTransactionExtractors);

    // Rendering the registry as JSON is only needed for the debug output,
    // so don't pay for it unless debug logging is actually enabled
    if (Log.IsEnabled(LogEventLevel.Debug))
    {
        Log.Debug(@"### Initializing DataStreamsManager with extractors (raw) {DataStreamsTransactionExtractors}", tracerSettings.DataStreamsTransactionExtractors);
        Log.Debug(@"### Extractors loaded (parsed): {AsJson}", _registry.AsJson());
    }

    _previousMutableSettings = tracerSettings.Manager.InitialMutableSettings;

    // even though the value will probably get updated by a callback when subscriptions happen just after,
    // we still need to initialize it to a value from initial settings in case no callback fires
    UpdateNodeHash(_previousMutableSettings, containerTagsHash: null);

    // subscribing to changes calls the callback immediately if a value is present
    discoveryService.SubscribeToChanges(UpdateHashWithContainerTags);
    _updateSubscription = tracerSettings.Manager.SubscribeToChanges(UpdateHashWithNewSettings);
}
/// <summary>
/// Gets a value indicating whether data streams monitoring is currently enabled (volatile read of the backing flag)
/// </summary>
public bool IsEnabled => Volatile.Read(ref _isEnabled);
/// <summary>
/// Gets a value indicating whether data streams monitoring is in its default state,
/// as reported by the tracer settings at construction time (volatile read of the backing flag)
/// </summary>
public bool IsInDefaultState => Volatile.Read(ref _isInDefaultState);
/// <summary> Callback for AgentConfiguration updates: refreshes the node hash when the container tags hash changes </summary>
private void UpdateHashWithContainerTags(AgentConfiguration conf)
{
    lock (_nodeHashUpdateLock)
    {
        var incomingHash = conf.ContainerTagsHash;

        // Only recompute when the container tags hash differs from the one we last used
        if (incomingHash != _previousContainerTagsHash)
        {
            UpdateNodeHash(_previousMutableSettings, incomingHash);
            _previousContainerTagsHash = incomingHash;
        }
    }
}
/// <summary> Callback for MutableSettings updates: refreshes the node hash when new mutable settings are provided </summary>
private void UpdateHashWithNewSettings(TracerSettings.SettingsManager.SettingChanges updates)
{
    // Nothing to do when this change notification carries no new mutable settings
    if (updates.UpdatedMutable is not { } newSettings)
    {
        return;
    }

    lock (_nodeHashUpdateLock)
    {
        UpdateNodeHash(newSettings, _previousContainerTagsHash);
        _previousMutableSettings = newSettings;
    }
}
/// <summary>
/// Recomputes the node hash base from the given settings and container tags hash, and atomically stores it
/// </summary>
/// <param name="settings">Settings providing the service name, environment and process tags</param>
/// <param name="containerTagsHash">The container tags hash to include, if any</param>
private void UpdateNodeHash(MutableSettings settings, string? containerTagsHash)
{
    // Primary tag is not supported in .NET yet, hence the explicit null
    var hashBase = HashHelper.CalculateNodeHashBase(
        settings.DefaultServiceName,
        settings.Environment,
        primaryTag: null,
        settings.ProcessTags?.SerializedTags,
        containerTagsHash);

    // Interlocked.Exchange can't be used with the struct itself, nor with a ulong in < .NET 5,
    // so the underlying value is reinterpreted as a long before being stored
    var reinterpreted = unchecked((long)hashBase.Value);
    Interlocked.Exchange(ref _nodeHashBase, reinterpreted);
}
/// <summary>
/// Builds a <see cref="DataStreamsManager"/>, creating a writer only when data streams monitoring is enabled in <paramref name="settings"/>
/// </summary>
/// <param name="settings">The tracer settings</param>
/// <param name="profilerSettings">The profiler settings, forwarded to the writer factory</param>
/// <param name="discoveryService">The discovery service, forwarded to both the writer factory and the manager</param>
/// <returns>A new manager; it has no writer when data streams monitoring is not enabled</returns>
public static DataStreamsManager Create(
    TracerSettings settings,
    ProfilerSettings profilerSettings,
    IDiscoveryService discoveryService)
{
    IDataStreamsWriter? writer = null;
    if (settings.IsDataStreamsMonitoringEnabled)
    {
        writer = DataStreamsWriter.Create(settings, profilerSettings, discoveryService);
    }

    return new DataStreamsManager(settings, writer, discoveryService);
}
/// <summary>
/// Removes the settings and discovery subscriptions, marks the manager as disabled,
/// and disposes the writer if one is still attached
/// </summary>
/// <returns>A task that completes once the writer (if any) has been disposed</returns>
public async Task DisposeAsync()
{
    _updateSubscription.Dispose();
    _discoveryService.RemoveSubscription(UpdateHashWithContainerTags);
    Volatile.Write(ref _isEnabled, false);

    // Detach the writer atomically so that it is disposed by at most one caller
    if (Interlocked.Exchange(ref _writer, null) is { } detachedWriter)
    {
        await detachedWriter.DisposeAsync().ConfigureAwait(false);
    }
}
/// <summary>
/// Flushes the writer. Does nothing when data streams monitoring is disabled or no writer is attached
/// </summary>
/// <returns>A task that completes once the writer (if any) has flushed</returns>
public async Task FlushAsync()
{
    if (IsEnabled && Volatile.Read(ref _writer) is { } activeWriter)
    {
        await activeWriter.FlushAsync().ConfigureAwait(false);
    }
}
/// <summary>
/// Tries to extract a <see cref="PathwayContext"/> from the provided binary <paramref name="headers"/>.
/// Returns <c>null</c> if data streams is disabled, or if no pathway is present.
/// </summary>
/// <param name="headers">The header collection to read the pathway from</param>
/// <returns>The extracted pathway, or <c>null</c></returns>
public PathwayContext? ExtractPathwayContext<TCarrier>(TCarrier headers)
    where TCarrier : IBinaryHeadersCollection
{
    if (!IsEnabled)
    {
        return null;
    }

    return DataStreamsContextPropagator.Instance.Extract(headers);
}
/// <summary>
/// Looks up the transaction extractors registered for <paramref name="extractorType"/>
/// </summary>
/// <param name="extractorType">The kind of extractor to look up</param>
/// <returns>Whatever the extractor registry returns for that type, which may be <c>null</c></returns>
public List<DataStreamsTransactionExtractor>? GetExtractorsByType(DataStreamsTransactionExtractor.Type extractorType)
    => _registry.GetExtractorsByType(extractorType);
/// <summary>
/// Writes a <see cref="PathwayContext"/> into binary headers.
/// Does nothing when data streams is disabled or <paramref name="context"/> is <c>null</c>.
/// </summary>
/// <param name="context">The pathway context to inject</param>
/// <param name="headers">The header collection to inject the headers into</param>
public void InjectPathwayContext<TCarrier>(PathwayContext? context, TCarrier headers)
    where TCarrier : IBinaryHeadersCollection
{
    if (IsEnabled && context is { } pathway)
    {
        DataStreamsContextPropagator.Instance.Inject(pathway, headers, _isLegacyDsmHeadersEnabled);
    }
}
/// <summary>
/// Records that the transaction identified by <paramref name="transactionId"/> reached
/// <paramref name="checkpointName"/>, timestamped with the current UTC time.
/// Does nothing when data streams monitoring is disabled or no writer is attached.
/// </summary>
/// <param name="transactionId">The identifier of the transaction</param>
/// <param name="checkpointName">The name of the checkpoint the transaction reached</param>
public void TrackTransaction(string transactionId, string checkpointName)
{
    if (!IsEnabled)
    {
        return;
    }

    var writer = Volatile.Read(ref _writer);
    if (writer is null)
    {
        return;
    }

    // Log only once we know the transaction will really be handed to the writer,
    // otherwise the debug output reports tracking that never happened
    Log.Debug(@"### Tracking transaction {TransactionId} at checkpoint {CheckpointName}", transactionId, checkpointName);
    writer.AddTransaction(new DataStreamsTransactionInfo(
        transactionId,
        DateTimeOffset.UtcNow.ToUnixTimeNanoseconds(),
        checkpointName));
}
/// <summary>
/// Reports a backlog measurement to the writer, timestamped with the current UTC time.
/// Does nothing when data streams monitoring is disabled.
/// </summary>
/// <param name="tags">The tags describing the backlog</param>
/// <param name="value">The backlog value</param>
public void TrackBacklog(string tags, long value)
{
    if (IsEnabled)
    {
        var writer = Volatile.Read(ref _writer);
        var timestampNs = DateTimeOffset.UtcNow.ToUnixTimeNanoseconds();
        var backlog = new BacklogPoint(tags, value, timestampNs);
        if (writer is not null)
        {
            writer.AddBacklog(backlog);
        }
    }
}
/// <summary>
/// Tries to extract a <see cref="PathwayContext"/> from the provided <paramref name="headers"/>,
/// using the propagator's base64 string extraction.
/// Returns <c>null</c> if data streams is disabled, or if no pathway is present.
/// </summary>
/// <param name="headers">The header collection to read the pathway from</param>
/// <returns>The extracted pathway, or <c>null</c></returns>
public PathwayContext? ExtractPathwayContextAsBase64String<TCarrier>(TCarrier headers)
    where TCarrier : IHeadersCollection
{
    if (!IsEnabled)
    {
        return null;
    }

    return DataStreamsContextPropagator.Instance.ExtractAsBase64String(headers);
}
/// <summary>
/// Writes a <see cref="PathwayContext"/> into headers using the propagator's base64 string injection.
/// Does nothing when data streams is disabled; a <c>null</c> context is only logged at debug level.
/// </summary>
/// <param name="context">The pathway context to inject</param>
/// <param name="headers">The header collection to inject the headers into</param>
public void InjectPathwayContextAsBase64String<TCarrier>(PathwayContext? context, TCarrier headers)
    where TCarrier : IHeadersCollection
{
    if (!IsEnabled)
    {
        return;
    }

    if (context is not { } pathway)
    {
        // Not expected in normal use, because SetCheckpoint should be called before injecting.
        // It can still happen if SetCheckpoint ran while data streams was disabled
        // and data streams was enabled before this call.
        Log.Debug("Attempted to inject null pathway context");
        return;
    }

    DataStreamsContextPropagator.Instance.InjectAsBase64String(pathway, headers);
}
/// <summary>
/// Records a checkpoint on top of the provided <see cref="PathwayContext"/> and returns the resulting pathway.
/// NOTE: <paramref name="edgeTags"/> must be in correct sort order
/// </summary>
/// <param name="parentPathway">The pathway from upstream, if known</param>
/// <param name="checkpointKind">Is this a Produce or Consume operation?</param>
/// <param name="edgeTags">Edge tags to set for the new pathway. MUST be sorted in alphabetical order</param>
/// <param name="payloadSizeBytes">Payload size in bytes</param>
/// <param name="timeInQueueMs">Time the message spent in the queue, in milliseconds. Used only if this is the start of the pathway</param>
/// <returns><c>null</c> if disabled or if an error occurred. Otherwise a new <see cref="PathwayContext"/></returns>
public PathwayContext? SetCheckpoint(
    in PathwayContext? parentPathway,
    CheckpointKind checkpointKind,
    string[] edgeTags,
    long payloadSizeBytes,
    long timeInQueueMs)
{
    if (!IsEnabled)
    {
        return null;
    }

    try
    {
        // An explicit parent always wins. Without one, a produce falls back to the pathway of the last consume
        // (if any). A consume never does: only the parent read from the inbound message matters there.
        var upstream = parentPathway;
        if (upstream is null && checkpointKind == CheckpointKind.Produce)
        {
            upstream = LastConsumePathway.Value;
        }

        var nowNs = DateTimeOffset.UtcNow.ToUnixTimeNanoseconds();

        // At the beginning of a pathway, the time spent in the queue shifts the edge / pathway start back in time.
        // This gives latency for pipelines starting with a queue (no producer instrumented upstream)
        // by relying on the message timestamp.
        var edgeStartNs = nowNs;
        if (upstream is null && timeInQueueMs > 0)
        {
            edgeStartNs = nowNs - (timeInQueueMs * 1_000_000);
        }

        var pathwayStartNs = upstream?.PathwayStart ?? edgeStartNs;

        // The field holds the bits of a ulong stored as a long, because Volatile.Read can't take a ulong in .NET FX
        var rawNodeHashBase = Volatile.Read(ref _nodeHashBase);
        var nodeHashBase = new NodeHashBase(unchecked((ulong)rawNodeHashBase));
        var nodeHash = HashHelper.CalculateNodeHash(nodeHashBase, edgeTags);
        var parentHash = upstream?.Hash ?? default;
        var pathwayHash = HashHelper.CalculatePathwayHash(nodeHash, parentHash);

        var writer = Volatile.Read(ref _writer);
        if (writer is not null)
        {
            writer.Add(
                new StatsPoint(
                    edgeTags: edgeTags,
                    hash: pathwayHash,
                    parentHash: parentHash,
                    timestampNs: nowNs,
                    pathwayLatencyNs: nowNs - pathwayStartNs,
                    edgeLatencyNs: nowNs - (upstream?.EdgeStart ?? edgeStartNs),
                    payloadSizeBytes));
        }

        var checkpoint = new PathwayContext(
            hash: pathwayHash,
            pathwayStartNs: pathwayStartNs,
            edgeStartNs: edgeStartNs);

        if (Log.IsEnabled(LogEventLevel.Debug))
        {
            Log.Debug(
                "SetCheckpoint with {PathwayHash}, {PathwayStart}, {EdgeStart}",
                checkpoint.Hash,
                checkpoint.PathwayStart,
                checkpoint.EdgeStart);
        }

        // remember the latest consume checkpoint so a later produce without a parent can chain onto it
        if (checkpointKind == CheckpointKind.Consume)
        {
            LastConsumePathway.Value = checkpoint;
        }

        return checkpoint;
    }
    catch (Exception ex)
    {
        Log.Error(ex, "Error setting a data streams checkpoint. Disabling data streams monitoring");

        // Disabling is deliberately conservative;
        // this may be relaxed in the future if intermittent errors show up.
        Volatile.Write(ref _isEnabled, false);
        return null;
    }
}
/// <summary>
/// Decides whether the schema should be extracted for this span, so that extraction (a costly operation)
/// only happens on select occasions
/// </summary>
/// <param name="span">The span whose sampling decision is consulted</param>
/// <param name="operation">The operation name, used as the key of the rate limiter</param>
/// <param name="weight">The weight returned by the rate limiter, or 0 when the schema should not be extracted</param>
/// <returns><c>true</c> if the schema should be extracted</returns>
public bool ShouldExtractSchema(Span span, string operation, out int weight)
{
    weight = 0;

    var limiter = _schemaRateLimiters.GetOrAdd(operation, _ => new RateLimiter());

    // don't even make the sampling decision if the rate limiter would not select us anyway
    if (!limiter.PeekDecision())
    {
        return false;
    }

    // only "consume" a rate limiter decision for a span that is going to be kept
    var samplingDecision = span.Context.GetOrMakeSamplingDecision();
    if (samplingDecision == null || !SamplingPriorityValues.IsKeep(samplingDecision.Value))
    {
        return false;
    }

    return limiter.GetDecision(out weight);
}
}