@@ -74,14 +74,18 @@ protected override void ConfigureServerOptions(GarnetServerOptions opts)
7474 opts . LogMemorySize = $ "{ MemoryPages * PageSizeBytes } ";
7575 // Note: SegmentSize is left at its default; for LocalMemory the device segment size matches the log segment
7676 // size, and the populated working set (~PopulatePageCount pages) occupies a single segment.
77+
78+ // Use inline IO completion: DeviceCompletionThreads = 0 flows through to
79+ // Devices.CreateLogDevice(numCompletionThreads: 0) -> LocalMemoryDevice(parallelism: 0), so the copy +
80+ // completion callback run synchronously on the submitting (run) thread — there is no dedicated device
81+ // completion thread and no SPSC-ring handoff. That eliminates the cross-thread, cross-socket handoff that
82+ // otherwise produces bimodal run-to-run variance on NUMA hosts, so no process/thread pinning is needed.
83+ // (Inline completion requires latencyUs == 0, which the LocalMemoryDevice uses by default.)
84+ opts . DeviceCompletionThreads = 0 ;
7785 }
7886
7987 public override void GlobalSetup ( )
8088 {
81- // Pin the process to a single NUMA socket BEFORE base.GlobalSetup() creates the server (and the
82- // device's completion thread), so the run-thread and that completion thread stay co-located.
83- PinProcessToSingleSocket ( ) ;
84-
8589 base . GlobalSetup ( ) ;
8690
8791 // Fixed-width keys must be sized before populating (Populate uses Key(id)). Use an upper bound on the key
@@ -110,54 +114,6 @@ public override void GlobalSetup()
110114 SetupBatch ( ref decrby , KeyPrefix , id => Resp ( "DECRBY" , Key ( id ) , "1234567890" ) ) ;
111115 }
112116
113- /// <summary>
114- /// Pin the whole benchmark process to a single NUMA socket so the run-thread and the
115- /// <see cref="DeviceType.LocalMemory"/> device's completion ("processor") thread stay co-located on
116- /// cores that share an L3 cache.
117- /// <para>
118- /// Why: each pending IO is handed off run-thread -> SPSC ring -> device completion thread ->
119- /// readyResponses drain (back on the run-thread). If the OS places those two threads on different
120- /// sockets, every op's handoff bounces cache lines across the socket interconnect (~1.4-1.9x slower),
121- /// which shows up as bimodal, sticky-per-launch run-to-run variance in these LTM benchmarks.
122- /// </para>
123- /// <para>
124- /// Setting <em>process</em> affinity (rather than just the run-thread) is required: the device creates
125- /// its completion thread separately, and a new thread inherits the process affinity mask, not the
126- /// creating thread's. Setting the process mask here (before the server/device is created) constrains
127- /// the current run-thread and every thread created afterwards (the completion thread, GC threads) to
128- /// the one socket. Windows-only (the affinity API is unsupported elsewhere); a no-op on other OSes.
129- /// </para>
130- /// <para>
131- /// TODO: Investigate Option B — pin individual threads to specific cores instead of the whole process,
132- /// via <see cref="Native32.AffinitizeThreadShardedNuma"/>: call it on the run-thread here, and add a
133- /// LocalMemoryDevice option to affinitize its ProcessorLoop thread to a different core on the same
134- /// socket. That avoids constraining unrelated threads (e.g. GC), but needs a device change to expose
135- /// completion-thread affinity.
136- /// </para>
137- /// <para>
138- /// TODO: Because this pin is Windows-only, the LTM benchmark has been removed from the BDN CI perf gate
139- /// (ci-bdnbenchmark.yml test matrix + BDN_Benchmark_Config.json) — on the Linux CI it would run unpinned
140- /// and the bimodal cross-socket variance would flake the gate. Restore it to CI once cross-platform
141- /// pinning (Option B and/or Linux affinity support) keeps the LTM numbers stable on the CI runners.
142- /// </para>
143- /// </summary>
144- private static void PinProcessToSingleSocket ( )
145- {
146- if ( ! OperatingSystem . IsWindows ( ) )
147- return ;
148-
149- // Assume two NUMA sockets with contiguous logical-processor enumeration (procs [0, N/2) = socket 0),
150- // matching Tsavorite's Native32.AffinitizeThreadShardedNuma(_, 2) convention. Assumes a single
151- // processor group (<= 64 logical processors), so a single IntPtr mask suffices.
152- var socketProcs = Environment . ProcessorCount / 2 ;
153- if ( socketProcs is < 1 or > 63 )
154- return ;
155-
156- var mask = ( 1L << socketProcs ) - 1 ;
157- using var process = Process . GetCurrentProcess ( ) ;
158- process . ProcessorAffinity = ( nint ) mask ;
159- }
160-
161117 /// <summary>
162118 /// Populate fresh keys (each set to "0") until <see cref="PopulatePageCount"/> pages have been appended to the log.
163119 /// Returns the number of keys populated.
0 commit comments