Skip to content

Commit 0ff2e6c

Browse files
committed
NetFX-Stack-Capture handl multi app domain by creating canary thread in each domain that is instrumented. Track life if canary thread and promote next available canary if current one is destroyed
1 parent 54ce58d commit 0ff2e6c

File tree

5 files changed

+63
-67
lines changed

5 files changed

+63
-67
lines changed

src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler.cpp

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1535,23 +1535,9 @@ HRESULT STDMETHODCALLTYPE CorProfiler::JITCompilationStartedOnNetFramework(Funct
15351535
auto valid_loader_callsite = true;
15361536
if (is_desktop_iis)
15371537
{
1538-
// For IIS: prefer injection into BuildManager.InvokePreStartInitMethods (application AppDomain)
1539-
// but also allow injection into default AppDomain for continuous profiling support
1540-
bool is_build_manager_injection = module_metadata->assemblyName == WStr("System.Web") &&
1541-
caller.type.name == WStr("System.Web.Compilation.BuildManager") &&
1542-
caller.name == WStr("InvokePreStartInitMethods");
1543-
1544-
bool is_default_appdomain = corlib_module_loaded && module_metadata->app_domain_id == corlib_app_domain_id;
1545-
1546-
valid_loader_callsite = is_build_manager_injection || is_default_appdomain;
1547-
1548-
if (is_default_appdomain && !is_build_manager_injection)
1549-
{
1550-
Logger::Info("JITCompilationStarted: Allowing loader injection into default AppDomain for IIS continuous "
1551-
"profiling. ",
1552-
"AppDomain: ", module_metadata->app_domain_id, ", Assembly: ", module_metadata->assemblyName,
1553-
", Method: ", caller.type.name, ".", caller.name);
1554-
}
1538+
valid_loader_callsite = module_metadata->assemblyName == WStr("System.Web") &&
1539+
caller.type.name == WStr("System.Web.Compilation.BuildManager") &&
1540+
caller.name == WStr("InvokePreStartInitMethods");
15551541
}
15561542
else if (module_metadata->assemblyName == WStr("System") ||
15571543
module_metadata->assemblyName == WStr("System.Net.Http"))

src/OpenTelemetry.AutoInstrumentation.Native/profiler_stack_capture.cpp

Lines changed: 45 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,22 @@ HRESULT StackCaptureEngine::ThreadDestroyed(ThreadID threadId)
394394
trace::Logger::Info("[StackCapture] Canary thread destroyed - ManagedID=", threadId,
395395
", NativeID=", canaryThread_.nativeId);
396396
canaryThread_.reset();
397+
// threadNames_ has map of thread ID and names - find another canary if possible
398+
for (const auto& [managedId, name] : threadNames_)
399+
{
400+
if (options_.IsCanaryThread(name))
401+
{
402+
auto osThreadIt = activeThreads_.find(managedId);
403+
if (osThreadIt != activeThreads_.end())
404+
{
405+
canaryThread_ = CanaryThreadInfo{managedId, osThreadIt->second};
406+
trace::Logger::Info("[StackCapture] New canary thread designated after destruction - ManagedID=",
407+
managedId, ", NativeID=", osThreadIt->second, ", Name=", name);
408+
captureCondVar_.notify_all();
409+
break;
410+
}
411+
}
412+
}
397413
}
398414

399415
return S_OK;
@@ -412,20 +428,15 @@ HRESULT StackCaptureEngine::ThreadAssignedToOSThread(ThreadID managedThreadId, D
412428
auto nameIt = threadNames_.find(managedThreadId);
413429
if (nameIt != threadNames_.end())
414430
{
415-
if (nameIt->second == options_.canaryThreadName)
431+
if (options_.IsCanaryThread(nameIt->second))
416432
{
417433
canaryThread_ = CanaryThreadInfo{managedThreadId, osThreadId};
418434
trace::Logger::Info("[StackCapture] Canary thread designated via ThreadAssignedToOSThread - ManagedID=",
419435
managedThreadId, ", NativeID=", osThreadId, ", Name=", nameIt->second);
436+
captureCondVar_.notify_all();
420437
}
421438
}
422439

423-
// Notify waiting capture thread if canary was found
424-
if (canaryThread_.isValid())
425-
{
426-
captureCondVar_.notify_one();
427-
}
428-
429440
return S_OK;
430441
}
431442

@@ -435,20 +446,17 @@ HRESULT StackCaptureEngine::ThreadNameChanged(ThreadID threadId, ULONG cchName,
435446
return S_OK;
436447

437448
std::lock_guard<std::mutex> lock(threadListMutex_);
438-
if (canaryThread_.isValid())
439-
{
440-
return S_OK;
441-
}
442449

443450
std::wstring threadName(name, cchName);
444451
threadNames_[threadId] = threadName;
445452
trace::Logger::Debug("[StackCapture] ThreadNameChanged - ManagedID=", threadId, ", Name=", threadName);
446-
if (threadName == options_.canaryThreadName)
453+
if (options_.IsCanaryThread(threadName))
447454
{
448455
auto osThreadIt = activeThreads_.find(threadId);
449-
if (osThreadIt != activeThreads_.end())
456+
if (osThreadIt != activeThreads_.end() && !canaryThread_.isValid())
450457
{
451458
canaryThread_ = CanaryThreadInfo{threadId, osThreadIt->second};
459+
captureCondVar_.notify_all();
452460
trace::Logger::Info("[StackCapture] Canary thread designated via ThreadNameChanged - ManagedID=", threadId,
453461
", NativeID=", osThreadIt->second, ", Name=", threadName);
454462
}
@@ -460,16 +468,10 @@ HRESULT StackCaptureEngine::ThreadNameChanged(ThreadID threadId, ULONG cchName,
460468
}
461469
}
462470

463-
// Notify waiting capture thread if canary was found
464-
if (canaryThread_.isValid())
465-
{
466-
captureCondVar_.notify_one();
467-
}
468-
469471
return S_OK;
470472
}
471473

472-
bool StackCaptureEngine::SafetyProbe()
474+
bool StackCaptureEngine::SafetyProbe(const CanaryThreadInfo& canaryInfo)
473475
{
474476
if (!invocationQueue_)
475477
return true;
@@ -479,7 +481,7 @@ bool StackCaptureEngine::SafetyProbe()
479481

480482
try
481483
{
482-
ScopedThreadSuspend canaryThread(canaryThread_.nativeId);
484+
ScopedThreadSuspend canaryThread(canaryInfo.nativeId);
483485

484486
CONTEXT canaryCtx = {};
485487
canaryCtx.ContextFlags = CONTEXT_FULL;
@@ -493,7 +495,7 @@ bool StackCaptureEngine::SafetyProbe()
493495
return false;
494496
}
495497

496-
auto canaryManagedId = canaryThread_.managedId;
498+
auto canaryManagedId = canaryInfo.managedId;
497499

498500
status = invocationQueue_->Invoke(
499501
[this, canaryManagedId, canaryCtx, &snapshotHr]()
@@ -607,34 +609,43 @@ HRESULT StackCaptureEngine::CaptureStackSeeded(ThreadID managedThrea
607609
return hr;
608610
}
609611

610-
bool StackCaptureEngine::WaitForCanaryThread(std::chrono::milliseconds timeout)
612+
CanaryThreadInfo StackCaptureEngine::WaitForCanaryThread(std::chrono::milliseconds timeout)
611613
{
612614
trace::Logger::Debug("[StackCapture] Waiting for canary thread (timeout=", timeout.count(), "ms)");
613-
std::unique_lock<std::mutex> lock(threadListMutex_);
614-
bool result =
615-
captureCondVar_.wait_for(lock, timeout, [this]() { return stopRequested_.load() || canaryThread_.isValid(); });
616-
617-
if (!result)
615+
CanaryThreadInfo canary;
618616
{
619-
trace::Logger::Warn("[StackCapture] Canary thread wait timed out after ", timeout.count(), "ms");
617+
std::unique_lock<std::mutex> lock(threadListMutex_);
618+
bool result = captureCondVar_.wait_for(lock, timeout,
619+
[this]() { return stopRequested_.load() || canaryThread_.isValid(); });
620+
621+
if (!result)
622+
{
623+
trace::Logger::Warn("[StackCapture] Canary thread wait timed out after ", timeout.count(), "ms");
624+
}
625+
else
626+
{
627+
canary = canaryThread_;
628+
trace::Logger::Debug("[StackCapture] Canary thread ready - ManagedID=", canary.managedId,
629+
", NativeID=", canary.nativeId);
630+
}
620631
}
621632

622-
return result;
633+
return canary;
623634
}
624635

625636
HRESULT StackCaptureEngine::CaptureStacks(std::unordered_set<ThreadID> const& threads,
626637
continuous_profiler::StackSnapshotCallbackContext* clientData)
627638
{
628-
bool canaryReady = WaitForCanaryThread();
639+
auto canary = WaitForCanaryThread();
629640

630-
if (!canaryReady)
641+
if (!canary.isValid())
631642
return E_FAIL;
632643

633644
for (const auto& managedId : threads)
634645
{
635646
if (stopRequested_)
636647
break;
637-
if (managedId == canaryThread_.managedId)
648+
if (managedId == canary.managedId)
638649
continue;
639650
DWORD nativeId = 0;
640651
{
@@ -649,7 +660,7 @@ HRESULT StackCaptureEngine::CaptureStacks(std::unordered_set<ThreadID> const&
649660
try
650661
{
651662
ScopedThreadSuspend targetThread(nativeId);
652-
if (!SafetyProbe())
663+
if (!SafetyProbe(canary))
653664
{
654665
trace::Logger::Debug(
655666
"[StackCapture] CaptureStacks - Skipping thread due to safety probe failure. ManagedID=", managedId,

src/OpenTelemetry.AutoInstrumentation.Native/profiler_stack_capture.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ namespace ProfilerStackCapture {
3636
struct CaptureOptions {
3737
std::chrono::milliseconds probeTimeout = std::chrono::milliseconds(250);
3838
const wchar_t* canaryThreadName = L"OpenTelemetry Profiler Canary Thread";
39+
bool IsCanaryThread(const std::wstring& threadName) const {
40+
return threadName.find(canaryThreadName) == 0;
41+
}
3942
};
4043

4144
class IProfilerApi {
@@ -134,7 +137,7 @@ namespace ProfilerStackCapture {
134137
public:
135138
explicit StackCaptureEngine(std::unique_ptr<IProfilerApi> profilerApi, const CaptureOptions& options = {});
136139
~StackCaptureEngine();
137-
bool WaitForCanaryThread(std::chrono::milliseconds timeout = std::chrono::milliseconds(0));
140+
CanaryThreadInfo WaitForCanaryThread(std::chrono::milliseconds timeout = std::chrono::milliseconds(0));
138141
HRESULT CaptureStacks(const std::unordered_set<ThreadID> &threads, continuous_profiler::StackSnapshotCallbackContext* clientData);
139142
void Stop();
140143

@@ -145,7 +148,7 @@ namespace ProfilerStackCapture {
145148

146149
private:
147150
HRESULT CaptureStackSeeded(ThreadID managedThreadId, HANDLE threadHandle, StackCaptureContext* stackCaptureContext);
148-
bool SafetyProbe();
151+
bool SafetyProbe(const CanaryThreadInfo& canaryInfo);
149152

150153
std::unique_ptr<IProfilerApi> profilerApi_;
151154
CaptureOptions options_;

src/OpenTelemetry.AutoInstrumentation/ContinuousProfiler/CanaryThreadManager.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ public bool Start(TimeSpan timeout)
4343

4444
_canaryThread = new Thread(CanaryThreadProc)
4545
{
46-
Name = CanaryThreadName,
46+
Name = $"{CanaryThreadName} - {AppDomain.CurrentDomain.Id})",
4747
IsBackground = true,
4848
};
4949

src/OpenTelemetry.AutoInstrumentation/Instrumentation.cs

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -285,21 +285,17 @@ private static void InitializeSampling()
285285

286286
NativeMethods.ConfigureNativeContinuousProfiler(threadSamplingEnabled, threadSamplingInterval, allocationSamplingEnabled, maxMemorySamplesPerMinute, selectiveSamplingInterval);
287287
#if NETFRAMEWORK
288-
if (AppDomain.CurrentDomain.IsDefaultAppDomain())
288+
// On .NET Framework, we need a dedicated canary thread for seeded stack walking
289+
_canaryThreadManager = new CanaryThreadManager();
290+
if (!_canaryThreadManager.Start(TimeSpan.FromSeconds(5)))
289291
{
290-
// On .NET Framework, we need a dedicated canary thread for seeded stack walking
291-
// host the canary thread manager in the default app domain only to avoid multiple canary threads
292-
_canaryThreadManager = new CanaryThreadManager();
293-
if (!_canaryThreadManager.Start(TimeSpan.FromSeconds(5)))
294-
{
295-
Logger.Error("Failed to start canary thread. Continuous profiling will not be enabled.");
296-
_canaryThreadManager.Dispose();
297-
_canaryThreadManager = null;
298-
return;
299-
}
300-
301-
Logger.Information("Canary thread started successfully for .NET Framework profiling.");
292+
Logger.Error("Failed to start canary thread. Continuous profiling will not be enabled.");
293+
_canaryThreadManager.Dispose();
294+
_canaryThreadManager = null;
295+
return;
302296
}
297+
298+
Logger.Information("Canary thread started successfully for .NET Framework profiling.");
303299
#endif
304300
_sampleExporter = _sampleExporterBuilder?.Build();
305301
}

0 commit comments

Comments
 (0)