Skip to content

Commit f6839f6

Browse files
Merge pull request #1098 from erikdarlingdata/feature/1096-exclude-cdc-long-running
Exclude CDC capture jobs from long-running query alerts (Dashboard + Lite) (#1096)
2 parents 5c5e557 + fd68524 commit f6839f6

13 files changed

Lines changed: 115 additions & 10 deletions

Dashboard/MainWindow.xaml.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1472,7 +1472,7 @@ private async Task CheckAllServerAlertsAsync()
14721472
var connectionString = server.GetConnectionString(_credentialService);
14731473
var databaseService = new DatabaseService(connectionString);
14741474
var connStatus = _serverManager.GetConnectionStatus(server.Id);
1475-
var health = await databaseService.GetAlertHealthAsync(connStatus.SqlEngineEdition, prefs.LongRunningQueryThresholdMinutes, prefs.LongRunningJobMultiplier, prefs.LongRunningQueryMaxResults, prefs.LongRunningQueryExcludeSpServerDiagnostics, prefs.LongRunningQueryExcludeWaitFor, prefs.LongRunningQueryExcludeBackups, prefs.LongRunningQueryExcludeMiscWaits, prefs.AlertExcludedDatabases);
1475+
var health = await databaseService.GetAlertHealthAsync(connStatus.SqlEngineEdition, prefs.LongRunningQueryThresholdMinutes, prefs.LongRunningJobMultiplier, prefs.LongRunningQueryMaxResults, prefs.LongRunningQueryExcludeSpServerDiagnostics, prefs.LongRunningQueryExcludeWaitFor, prefs.LongRunningQueryExcludeBackups, prefs.LongRunningQueryExcludeMiscWaits, prefs.LongRunningQueryExcludeCdc, prefs.AlertExcludedDatabases);
14761476

14771477
if (health.IsOnline)
14781478
{

Dashboard/Models/UserPreferences.cs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,7 @@ public class UserPreferences
102102
public bool LongRunningQueryExcludeWaitFor { get; set; } = true;
103103
public bool LongRunningQueryExcludeBackups { get; set; } = true;
104104
public bool LongRunningQueryExcludeMiscWaits { get; set; } = true;
105+
public bool LongRunningQueryExcludeCdc { get; set; } = true; // Exclude CDC capture jobs (sp_MScdc_capture_job / sp_cdc_scan)
105106
public bool NotifyOnTempDbSpace { get; set; } = true;
106107
public int TempDbSpaceThresholdPercent { get; set; } = 80; // Alert when TempDB used > X%
107108
public bool NotifyOnLongRunningJobs { get; set; } = true;

Dashboard/Services/DatabaseService.NocHealth.cs

Lines changed: 42 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -131,6 +131,7 @@ public async Task<AlertHealthResult> GetAlertHealthAsync(
131131
bool excludeWaitFor = true,
132132
bool excludeBackups = true,
133133
bool excludeMiscWaits = true,
134+
bool excludeCdc = true,
134135
IReadOnlyList<string>? excludedDatabases = null)
135136
{
136137
var result = new AlertHealthResult();
@@ -149,7 +150,7 @@ public async Task<AlertHealthResult> GetAlertHealthAsync(
149150
? GetFilteredDeadlockCountAsync(connection, excludedDatabases)
150151
: null;
151152
var poisonWaitTask = GetPoisonWaitDeltasAsync(connection);
152-
var longRunningTask = GetLongRunningQueriesAsync(connection, longRunningQueryThresholdMinutes, longRunningQueryMaxResults, excludeSpServerDiagnostics, excludeWaitFor, excludeBackups, excludeMiscWaits);
153+
var longRunningTask = GetLongRunningQueriesAsync(connection, longRunningQueryThresholdMinutes, longRunningQueryMaxResults, excludeSpServerDiagnostics, excludeWaitFor, excludeBackups, excludeMiscWaits, excludeCdc);
153154
var tempDbTask = GetTempDbSpaceAsync(connection);
154155
var anomalousJobTask = GetAnomalousJobsAsync(connection, longRunningJobMultiplier);
155156
var missingCaptureTask = GetMissingCaptureSessionsAsync(connection);
@@ -751,7 +752,8 @@ private async Task<List<LongRunningQueryInfo>> GetLongRunningQueriesAsync(
751752
bool excludeSpServerDiagnostics = true,
752753
bool excludeWaitFor = true,
753754
bool excludeBackups = true,
754-
bool excludeMiscWaits = true)
755+
bool excludeMiscWaits = true,
756+
bool excludeCdc = true)
755757
{
756758
maxResults = Math.Clamp(maxResults, 1, 1000);
757759

@@ -763,9 +765,45 @@ private async Task<List<LongRunningQueryInfo>> GetLongRunningQueriesAsync(
763765
? "AND r.wait_type NOT IN (N'BACKUPTHREAD', N'BACKUPIO')" : "";
764766
string miscWaitsFilter = excludeMiscWaits
765767
? "AND r.wait_type NOT IN (N'XE_LIVE_TARGET_TVF')" : "";
768+
// CDC capture runs continuously as a SQL Agent job (EXEC sys.sp_MScdc_capture_job -> sys.sp_cdc_scan),
769+
// so it permanently exceeds the duration threshold and none of the wait_type filters above catch it.
770+
//
771+
// Primary signal: resolve the capture job_id(s) from msdb.dbo.cdc_jobs and match the running session via
772+
// its SQL Agent program_name ('SQLAgent - TSQL JobStep (Job 0x<job_id> : Step N)'). This is CDC-specific
773+
// and never hides unrelated Agent jobs. The msdb reference is deferred through sp_executesql inside
774+
// TRY/CATCH so a login without msdb access gets a *catchable* error (not an uncatchable cross-db 916) and
775+
// cleanly falls back to a text match on the whole batch/object text.
776+
string cdcSetup = excludeCdc ? @"
777+
DECLARE @cdc_capture_jobs TABLE (job_id uniqueidentifier PRIMARY KEY);
778+
DECLARE @cdc_readable bit = 0;
779+
BEGIN TRY
780+
INSERT @cdc_capture_jobs (job_id)
781+
EXEC sys.sp_executesql N'SELECT cj.job_id FROM msdb.dbo.cdc_jobs AS cj WHERE cj.job_type = N''capture'';';
782+
SET @cdc_readable = 1;
783+
END TRY
784+
BEGIN CATCH
785+
SET @cdc_readable = 0;
786+
END CATCH;
787+
" : "";
788+
string cdcFilter = excludeCdc ? @"
789+
AND NOT
790+
(
791+
(
792+
@cdc_readable = 1
793+
AND s.program_name LIKE N'SQLAgent - TSQL JobStep (Job 0x%'
794+
AND TRY_CONVERT(uniqueidentifier, TRY_CONVERT(binary(16), SUBSTRING(s.program_name, 32, 32), 2))
795+
IN (SELECT j.job_id FROM @cdc_capture_jobs AS j)
796+
)
797+
OR
798+
(
799+
@cdc_readable = 0
800+
AND t.text IS NOT NULL
801+
AND (t.text LIKE N'%sp_MScdc_capture_job%' OR t.text LIKE N'%sp_cdc_scan%')
802+
)
803+
)" : "";
766804

767805
string query = @$"SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
768-
806+
{cdcSetup}
769807
SELECT TOP(@maxResults)
770808
r.session_id,
771809
DB_NAME(r.database_id) AS database_name,
@@ -787,6 +825,7 @@ CROSS APPLY sys.dm_exec_sql_text(r.sql_handle) AS t
787825
{waitForFilter}
788826
{backupsFilter}
789827
{miscWaitsFilter}
828+
{cdcFilter}
790829
ORDER BY r.total_elapsed_time DESC
791830
OPTION(MAXDOP 1, RECOMPILE);";
792831

Dashboard/SettingsWindow.xaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -246,6 +246,9 @@
246246
Margin="0,0,0,4"/>
247247
<CheckBox x:Name="LrqExcludeMiscWaitsCheckBox"
248248
Content="Exclude miscellaneous waits (XE_LIVE_TARGET_TVF)"
249+
Margin="0,0,0,4"/>
250+
<CheckBox x:Name="LrqExcludeCdcCheckBox"
251+
Content="Exclude CDC capture jobs (sp_MScdc_capture_job, sp_cdc_scan)"
249252
Margin="0,0,0,0"/>
250253
<StackPanel Orientation="Horizontal" Margin="0,8,0,0">
251254
<CheckBox x:Name="NotifyOnTempDbSpaceCheckBox"

Dashboard/SettingsWindow.xaml.cs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -186,6 +186,7 @@ private void LoadSettings()
186186
LrqExcludeWaitForCheckBox.IsChecked = prefs.LongRunningQueryExcludeWaitFor;
187187
LrqExcludeBackupsCheckBox.IsChecked = prefs.LongRunningQueryExcludeBackups;
188188
LrqExcludeMiscWaitsCheckBox.IsChecked = prefs.LongRunningQueryExcludeMiscWaits;
189+
LrqExcludeCdcCheckBox.IsChecked = prefs.LongRunningQueryExcludeCdc;
189190
AlertExcludedDatabasesTextBox.Text = string.Join(", ", prefs.AlertExcludedDatabases);
190191
NotifyOnTempDbSpaceCheckBox.IsChecked = prefs.NotifyOnTempDbSpace;
191192
TempDbSpaceThresholdTextBox.Text = prefs.TempDbSpaceThresholdPercent.ToString(CultureInfo.InvariantCulture);
@@ -683,6 +684,7 @@ private async void OkButton_Click(object sender, RoutedEventArgs e)
683684
prefs.LongRunningQueryExcludeWaitFor = LrqExcludeWaitForCheckBox.IsChecked == true;
684685
prefs.LongRunningQueryExcludeBackups = LrqExcludeBackupsCheckBox.IsChecked == true;
685686
prefs.LongRunningQueryExcludeMiscWaits = LrqExcludeMiscWaitsCheckBox.IsChecked == true;
687+
prefs.LongRunningQueryExcludeCdc = LrqExcludeCdcCheckBox.IsChecked == true;
686688
prefs.AlertExcludedDatabases = AlertExcludedDatabasesTextBox.Text
687689
.Split(',')
688690
.Select(s => s.Trim())

Lite/App.xaml.cs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -101,6 +101,7 @@ public partial class App : Application
101101
public static bool AlertLongRunningQueryExcludeWaitFor { get; set; } = true;
102102
public static bool AlertLongRunningQueryExcludeBackups { get; set; } = true;
103103
public static bool AlertLongRunningQueryExcludeMiscWaits { get; set; } = true;
104+
public static bool AlertLongRunningQueryExcludeCdc { get; set; } = true;
104105
public static List<string> AlertExcludedDatabases { get; set; } = new();
105106
public static bool AlertTempDbSpaceEnabled { get; set; } = true;
106107
public static int AlertTempDbSpaceThresholdPercent { get; set; } = 80;
@@ -472,6 +473,7 @@ public static void LoadAlertSettings()
472473
if (root.TryGetProperty("alert_long_running_query_exclude_waitfor", out v)) AlertLongRunningQueryExcludeWaitFor = v.GetBoolean();
473474
if (root.TryGetProperty("alert_long_running_query_exclude_backups", out v)) AlertLongRunningQueryExcludeBackups = v.GetBoolean();
474475
if (root.TryGetProperty("alert_long_running_query_exclude_misc_waits", out v)) AlertLongRunningQueryExcludeMiscWaits = v.GetBoolean();
476+
if (root.TryGetProperty("alert_long_running_query_exclude_cdc", out v)) AlertLongRunningQueryExcludeCdc = v.GetBoolean();
475477
if (root.TryGetProperty("alert_excluded_databases", out v) && v.ValueKind == System.Text.Json.JsonValueKind.Array)
476478
{
477479
AlertExcludedDatabases = new List<string>();

Lite/Database/DuckDbInitializer.cs

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -97,7 +97,7 @@ public void Dispose()
9797
/// <summary>
9898
/// Current schema version. Increment this when schema changes require table rebuilds.
9999
/// </summary>
100-
internal const int CurrentSchemaVersion = 27;
100+
internal const int CurrentSchemaVersion = 28;
101101

102102
private readonly string _archivePath;
103103

@@ -732,6 +732,23 @@ New tables only — no existing table changes needed. Tables created by
732732
throw;
733733
}
734734
}
735+
736+
if (fromVersion < 28)
737+
{
738+
/* v28: Added is_cdc_capture flag to query_snapshots so the long-running query
739+
alert can exclude CDC capture sessions. The collector computes the flag
740+
server-side (program_name -> job_id via msdb.dbo.cdc_jobs, text fallback).
741+
Appended at the end to match the DuckDB appender's positional order. */
742+
_logger?.LogInformation("Running migration to v28: adding is_cdc_capture column to query_snapshots");
743+
try
744+
{
745+
await ExecuteNonQueryAsync(connection, "ALTER TABLE query_snapshots ADD COLUMN IF NOT EXISTS is_cdc_capture BOOLEAN DEFAULT false");
746+
}
747+
catch (Exception ex)
748+
{
749+
_logger?.LogWarning("Migration to v28 encountered an error (non-fatal): {Error}", ex.Message);
750+
}
751+
}
735752
}
736753

737754
/// <summary>

Lite/Database/Schema.cs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -346,7 +346,8 @@ granted_query_memory_gb DECIMAL(18,2),
346346
host_name VARCHAR,
347347
program_name VARCHAR,
348348
open_transaction_count INTEGER,
349-
percent_complete DECIMAL(5,2)
349+
percent_complete DECIMAL(5,2),
350+
is_cdc_capture BOOLEAN DEFAULT false
350351
)";
351352

352353
public const string CreateTempdbStatsTable = @"

Lite/MainWindow.xaml.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1745,7 +1745,7 @@ await _emailAlertService.TrySendAlertEmailAsync(
17451745
{
17461746
try
17471747
{
1748-
var longRunning = await _dataService.GetLongRunningQueriesAsync(summary.ServerId, App.AlertLongRunningQueryThresholdMinutes, App.AlertLongRunningQueryMaxResults, App.AlertLongRunningQueryExcludeSpServerDiagnostics, App.AlertLongRunningQueryExcludeWaitFor, App.AlertLongRunningQueryExcludeBackups, App.AlertLongRunningQueryExcludeMiscWaits);
1748+
var longRunning = await _dataService.GetLongRunningQueriesAsync(summary.ServerId, App.AlertLongRunningQueryThresholdMinutes, App.AlertLongRunningQueryMaxResults, App.AlertLongRunningQueryExcludeSpServerDiagnostics, App.AlertLongRunningQueryExcludeWaitFor, App.AlertLongRunningQueryExcludeBackups, App.AlertLongRunningQueryExcludeMiscWaits, App.AlertLongRunningQueryExcludeCdc);
17491749

17501750
if (App.AlertExcludedDatabases.Count > 0)
17511751
{

Lite/Services/LocalDataService.WaitStats.cs

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -436,7 +436,8 @@ public async Task<List<LongRunningQueryInfo>> GetLongRunningQueriesAsync(
436436
bool excludeSpServerDiagnostics = true,
437437
bool excludeWaitFor = true,
438438
bool excludeBackups = true,
439-
bool excludeMiscWaits = true)
439+
bool excludeMiscWaits = true,
440+
bool excludeCdc = true)
440441
{
441442
using var connection = await OpenConnectionAsync();
442443
using var command = connection.CreateCommand();
@@ -451,6 +452,10 @@ public async Task<List<LongRunningQueryInfo>> GetLongRunningQueriesAsync(
451452
? "AND r.wait_type NOT IN (N'BACKUPTHREAD', N'BACKUPIO')" : "";
452453
string miscWaitsFilter = excludeMiscWaits
453454
? "AND r.wait_type NOT IN (N'XE_LIVE_TARGET_TVF')" : "";
455+
// CDC capture sessions are flagged server-side by the collector (is_cdc_capture). COALESCE
456+
// guards pre-migration / archived rows where the column is NULL.
457+
string cdcFilter = excludeCdc
458+
? "AND COALESCE(r.is_cdc_capture, FALSE) = FALSE" : "";
454459
maxResults = Math.Clamp(maxResults, 1, 1000);
455460

456461
command.CommandText = @$"
@@ -473,6 +478,7 @@ AND r.session_id > 50
473478
{waitForFilter}
474479
{backupsFilter}
475480
{miscWaitsFilter}
481+
{cdcFilter}
476482
AND r.total_elapsed_time_ms >= $2
477483
ORDER BY r.total_elapsed_time_ms DESC
478484
LIMIT $3;";

0 commit comments

Comments
 (0)