Skip to content

Commit ae96766

Browse files
committed
Fix merge error resulting in loss of LocalMemoryDevice's inlineCompletion option, which fixes the LTM.RawStringOps BDN without the need for pinning, which in turn allows it to be enabled in the CI
1 parent 537d7f9 commit ae96766

5 files changed

Lines changed: 149 additions & 105 deletions

File tree

.github/workflows/ci-bdnbenchmark.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ jobs:
5050
- framework: net8.0
5151
gh-pages-branch: continuousbenchmark_net80
5252
configuration: [ 'Release' ]
53-
test: [ 'Operations.BasicOperations', 'Operations.ObjectOperations', 'Operations.HashObjectOperations', 'Operations.SortedSetOperations', 'Cluster.ClusterMigrate', 'Cluster.ClusterOperations', 'Lua.LuaScripts', 'Lua.LuaScriptCacheOperations','Lua.LuaRunnerOperations','Operations.CustomOperations', 'Operations.RawStringOperations', 'Operations.ScriptOperations', 'Operations.ModuleOperations', 'Operations.JsonOperations', 'Operations.PubSubOperations', 'Operations.SetOperations', 'Operations.TxnOperations', 'Network.BasicOperations', 'Network.RawStringOperations' ]
53+
test: [ 'Operations.BasicOperations', 'Operations.ObjectOperations', 'Operations.HashObjectOperations', 'Operations.SortedSetOperations', 'Cluster.ClusterMigrate', 'Cluster.ClusterOperations', 'Lua.LuaScripts', 'Lua.LuaScriptCacheOperations','Lua.LuaRunnerOperations','Operations.CustomOperations', 'Operations.RawStringOperations', 'Operations.LTM.RawStringOperations', 'Operations.ScriptOperations', 'Operations.ModuleOperations', 'Operations.JsonOperations', 'Operations.PubSubOperations', 'Operations.SetOperations', 'Operations.TxnOperations', 'Network.BasicOperations', 'Network.RawStringOperations' ]
5454
steps:
5555
- name: Check out code
5656
uses: actions/checkout@v6

benchmark/BDN.benchmark/Operations/LTM/RawStringOperations.cs

Lines changed: 8 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -74,14 +74,18 @@ protected override void ConfigureServerOptions(GarnetServerOptions opts)
7474
opts.LogMemorySize = $"{MemoryPages * PageSizeBytes}";
7575
// Note: SegmentSize is left at its default; for LocalMemory the device segment size matches the log segment
7676
// size, and the populated working set (~PopulatePageCount pages) occupies a single segment.
77+
78+
// Use inline IO completion: DeviceCompletionThreads = 0 flows through to
79+
// Devices.CreateLogDevice(numCompletionThreads: 0) -> LocalMemoryDevice(parallelism: 0), so the copy +
80+
// completion callback run synchronously on the submitting (run) thread — there is no dedicated device
81+
// completion thread and no SPSC-ring handoff. That eliminates the cross-thread, cross-socket handoff that
82+
// otherwise produces bimodal run-to-run variance on NUMA hosts, so no process/thread pinning is needed.
83+
// (Inline completion requires latencyUs == 0, which the LocalMemoryDevice uses by default.)
84+
opts.DeviceCompletionThreads = 0;
7785
}
7886

7987
public override void GlobalSetup()
8088
{
81-
// Pin the process to a single NUMA socket BEFORE base.GlobalSetup() creates the server (and the
82-
// device's completion thread), so the run-thread and that completion thread stay co-located.
83-
PinProcessToSingleSocket();
84-
8589
base.GlobalSetup();
8690

8791
// Fixed-width keys must be sized before populating (Populate uses Key(id)). Use an upper bound on the key
@@ -110,54 +114,6 @@ public override void GlobalSetup()
110114
SetupBatch(ref decrby, KeyPrefix, id => Resp("DECRBY", Key(id), "1234567890"));
111115
}
112116

113-
/// <summary>
114-
/// Pin the whole benchmark process to a single NUMA socket so the run-thread and the
115-
/// <see cref="DeviceType.LocalMemory"/> device's completion ("processor") thread stay co-located on
116-
/// cores that share an L3 cache.
117-
/// <para>
118-
/// Why: each pending IO is handed off run-thread -&gt; SPSC ring -&gt; device completion thread -&gt;
119-
/// readyResponses drain (back on the run-thread). If the OS places those two threads on different
120-
/// sockets, every op's handoff bounces cache lines across the socket interconnect (~1.4-1.9x slower),
121-
/// which shows up as bimodal, sticky-per-launch run-to-run variance in these LTM benchmarks.
122-
/// </para>
123-
/// <para>
124-
/// Setting <em>process</em> affinity (rather than just the run-thread) is required: the device creates
125-
/// its completion thread separately, and a new thread inherits the process affinity mask, not the
126-
/// creating thread's. Setting the process mask here (before the server/device is created) constrains
127-
/// the current run-thread and every thread created afterwards (the completion thread, GC threads) to
128-
/// the one socket. Windows-only (the affinity API is unsupported elsewhere); a no-op on other OSes.
129-
/// </para>
130-
/// <para>
131-
/// TODO: Investigate Option B — pin individual threads to specific cores instead of the whole process,
132-
/// via <see cref="Native32.AffinitizeThreadShardedNuma"/>: call it on the run-thread here, and add a
133-
/// LocalMemoryDevice option to affinitize its ProcessorLoop thread to a different core on the same
134-
/// socket. That avoids constraining unrelated threads (e.g. GC), but needs a device change to expose
135-
/// completion-thread affinity.
136-
/// </para>
137-
/// <para>
138-
/// TODO: Because this pin is Windows-only, the LTM benchmark has been removed from the BDN CI perf gate
139-
/// (ci-bdnbenchmark.yml test matrix + BDN_Benchmark_Config.json) — on the Linux CI it would run unpinned
140-
/// and the bimodal cross-socket variance would flake the gate. Restore it to CI once cross-platform
141-
/// pinning (Option B and/or Linux affinity support) keeps the LTM numbers stable on the CI runners.
142-
/// </para>
143-
/// </summary>
144-
private static void PinProcessToSingleSocket()
145-
{
146-
if (!OperatingSystem.IsWindows())
147-
return;
148-
149-
// Assume two NUMA sockets with contiguous logical-processor enumeration (procs [0, N/2) = socket 0),
150-
// matching Tsavorite's Native32.AffinitizeThreadShardedNuma(_, 2) convention. Assumes a single
151-
// processor group (<= 64 logical processors), so a single IntPtr mask suffices.
152-
var socketProcs = Environment.ProcessorCount / 2;
153-
if (socketProcs is < 1 or > 63)
154-
return;
155-
156-
var mask = (1L << socketProcs) - 1;
157-
using var process = Process.GetCurrentProcess();
158-
process.ProcessorAffinity = (nint)mask;
159-
}
160-
161117
/// <summary>
162118
/// Populate fresh keys (each set to "0") until <see cref="PopulatePageCount"/> pages have been appended to the log.
163119
/// Returns the number of keys populated.

benchmark/BDN.benchmark/Operations/OperationsBase.cs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,13 @@ public virtual void GlobalSetup()
101101
{
102102
// Nothing to create here: the device is built downstream by GarnetServerOptions.GetSettings()
103103
// (called from GarnetServer.CreateStore). Its EnableStorageTier branch calls GetInitializedDeviceFactory()
104-
// -> LocalStorageNamedDeviceFactory.Get() -> Devices.CreateLogDevice(deviceType: LocalMemory), which builds a
105-
// LocalMemoryDevice with latencyUs:0 (a pure in-memory device, so we measure the Tsavorite pending codepaths
106-
// rather than real device IO), and assigns it to kvSettings.LogDevice before TsavoriteKV is constructed.
104+
// -> LocalStorageNamedDeviceFactory.Get() -> Devices.CreateLogDevice(deviceType: LocalMemory,
105+
// numCompletionThreads: opts.DeviceCompletionThreads), which builds a LocalMemoryDevice with latencyUs:0
106+
// (a pure in-memory device, so we measure the Tsavorite pending codepaths rather than real device IO),
107+
// and assigns it to kvSettings.LogDevice before TsavoriteKV is constructed. A derived benchmark that sets
108+
// DeviceCompletionThreads = 0 selects parallelism:0 (inline completion: copy + callback on the submitting
109+
// thread, no completion thread or ring handoff) — see LTM.RawStringOperations, which uses that to avoid the
110+
// cross-thread/cross-socket handoff variance without any process/thread pinning.
107111
//
108112
// The only requirement on our side is that the precondition for that branch holds, so fail loudly on a
109113
// misconfiguration that would otherwise be silently downgraded to a NullDevice (the non-tiered fallback).

0 commit comments

Comments
 (0)