microsoft
diff --git a/.github/workflows/ci-bdnbenchmark.yml b/.github/workflows/ci-bdnbenchmark.yml
Lines changed: 1 addition & 1 deletion
diff --git a/benchmark/BDN.benchmark/Operations/LTM/RawStringOperations.cs b/benchmark/BDN.benchmark/Operations/LTM/RawStringOperations.cs
Lines changed: 8 additions & 52 deletions
diff --git a/benchmark/BDN.benchmark/Operations/OperationsBase.cs b/benchmark/BDN.benchmark/Operations/OperationsBase.cs
Lines changed: 7 additions & 3 deletions
@@ -50,7 +50,7 @@ jobs:
           - framework: net8.0
             gh-pages-branch: continuousbenchmark_net80
         configuration: [ 'Release' ]
-        test: [ 'Operations.BasicOperations', 'Operations.ObjectOperations', 'Operations.HashObjectOperations', 'Operations.SortedSetOperations', 'Cluster.ClusterMigrate', 'Cluster.ClusterOperations', 'Lua.LuaScripts', 'Lua.LuaScriptCacheOperations','Lua.LuaRunnerOperations','Operations.CustomOperations', 'Operations.RawStringOperations', 'Operations.ScriptOperations', 'Operations.ModuleOperations', 'Operations.JsonOperations', 'Operations.PubSubOperations', 'Operations.SetOperations', 'Operations.TxnOperations', 'Network.BasicOperations', 'Network.RawStringOperations' ]
+        test: [ 'Operations.BasicOperations', 'Operations.ObjectOperations', 'Operations.HashObjectOperations', 'Operations.SortedSetOperations', 'Cluster.ClusterMigrate', 'Cluster.ClusterOperations', 'Lua.LuaScripts', 'Lua.LuaScriptCacheOperations','Lua.LuaRunnerOperations','Operations.CustomOperations', 'Operations.RawStringOperations', 'Operations.LTM.RawStringOperations', 'Operations.ScriptOperations', 'Operations.ModuleOperations', 'Operations.JsonOperations', 'Operations.PubSubOperations', 'Operations.SetOperations', 'Operations.TxnOperations', 'Network.BasicOperations', 'Network.RawStringOperations' ]
     steps:
       - name: Check out code
         uses: actions/checkout@v6
 
@@ -74,14 +74,18 @@ protected override void ConfigureServerOptions(GarnetServerOptions opts)
             opts.LogMemorySize = $"{MemoryPages * PageSizeBytes}";
             // Note: SegmentSize is left at its default; for LocalMemory the device segment size matches the log segment
             // size, and the populated working set (~PopulatePageCount pages) occupies a single segment.
+
+            // Use inline IO completion: DeviceCompletionThreads = 0 flows through to
+            // Devices.CreateLogDevice(numCompletionThreads: 0) -> LocalMemoryDevice(parallelism: 0), so the copy +
+            // completion callback run synchronously on the submitting (run) thread — there is no dedicated device
+            // completion thread and no SPSC-ring handoff. That eliminates the cross-thread, cross-socket handoff that
+            // otherwise produces bimodal run-to-run variance on NUMA hosts, so no process/thread pinning is needed.
+            // (Inline completion requires latencyUs == 0, which the LocalMemoryDevice uses by default.)
+            opts.DeviceCompletionThreads = 0;
         }
 
         public override void GlobalSetup()
         {
-            // Pin the process to a single NUMA socket BEFORE base.GlobalSetup() creates the server (and the
-            // device's completion thread), so the run-thread and that completion thread stay co-located.
-            PinProcessToSingleSocket();
-
             base.GlobalSetup();
 
             // Fixed-width keys must be sized before populating (Populate uses Key(id)). Use an upper bound on the key
@@ -110,54 +114,6 @@ public override void GlobalSetup()
             SetupBatch(ref decrby, KeyPrefix, id => Resp("DECRBY", Key(id), "1234567890"));
         }
 
-        /// <summary>
-        /// Pin the whole benchmark process to a single NUMA socket so the run-thread and the
-        /// <see cref="DeviceType.LocalMemory"/> device's completion ("processor") thread stay co-located on
-        /// cores that share an L3 cache.
-        /// <para>
-        /// Why: each pending IO is handed off run-thread -&gt; SPSC ring -&gt; device completion thread -&gt;
-        /// readyResponses drain (back on the run-thread). If the OS places those two threads on different
-        /// sockets, every op's handoff bounces cache lines across the socket interconnect (~1.4-1.9x slower),
-        /// which shows up as bimodal, sticky-per-launch run-to-run variance in these LTM benchmarks.
-        /// </para>
-        /// <para>
-        /// Setting <em>process</em> affinity (rather than just the run-thread) is required: the device creates
-        /// its completion thread separately, and a new thread inherits the process affinity mask, not the
-        /// creating thread's. Setting the process mask here (before the server/device is created) constrains
-        /// the current run-thread and every thread created afterwards (the completion thread, GC threads) to
-        /// the one socket. Windows-only (the affinity API is unsupported elsewhere); a no-op on other OSes.
-        /// </para>
-        /// <para>
-        /// TODO: Investigate Option B — pin individual threads to specific cores instead of the whole process,
-        /// via <see cref="Native32.AffinitizeThreadShardedNuma"/>: call it on the run-thread here, and add a
-        /// LocalMemoryDevice option to affinitize its ProcessorLoop thread to a different core on the same
-        /// socket. That avoids constraining unrelated threads (e.g. GC), but needs a device change to expose
-        /// completion-thread affinity.
-        /// </para>
-        /// <para>
-        /// TODO: Because this pin is Windows-only, the LTM benchmark has been removed from the BDN CI perf gate
-        /// (ci-bdnbenchmark.yml test matrix + BDN_Benchmark_Config.json) — on the Linux CI it would run unpinned
-        /// and the bimodal cross-socket variance would flake the gate. Restore it to CI once cross-platform
-        /// pinning (Option B and/or Linux affinity support) keeps the LTM numbers stable on the CI runners.
-        /// </para>
-        /// </summary>
-        private static void PinProcessToSingleSocket()
-        {
-            if (!OperatingSystem.IsWindows())
-                return;
-
-            // Assume two NUMA sockets with contiguous logical-processor enumeration (procs [0, N/2) = socket 0),
-            // matching Tsavorite's Native32.AffinitizeThreadShardedNuma(_, 2) convention. Assumes a single
-            // processor group (<= 64 logical processors), so a single IntPtr mask suffices.
-            var socketProcs = Environment.ProcessorCount / 2;
-            if (socketProcs is < 1 or > 63)
-                return;
-
-            var mask = (1L << socketProcs) - 1;
-            using var process = Process.GetCurrentProcess();
-            process.ProcessorAffinity = (nint)mask;
-        }
-
         /// <summary>
         /// Populate fresh keys (each set to "0") until <see cref="PopulatePageCount"/> pages have been appended to the log.
         /// Returns the number of keys populated.
 
@@ -101,9 +101,13 @@ public virtual void GlobalSetup()
             {
                 // Nothing to create here: the device is built downstream by GarnetServerOptions.GetSettings()
                 // (called from GarnetServer.CreateStore). Its EnableStorageTier branch calls GetInitializedDeviceFactory()
-                // -> LocalStorageNamedDeviceFactory.Get() -> Devices.CreateLogDevice(deviceType: LocalMemory), which builds a
-                // LocalMemoryDevice with latencyUs:0 (a pure in-memory device, so we measure the Tsavorite pending codepaths
-                // rather than real device IO), and assigns it to kvSettings.LogDevice before TsavoriteKV is constructed.
+                // -> LocalStorageNamedDeviceFactory.Get() -> Devices.CreateLogDevice(deviceType: LocalMemory,
+                // numCompletionThreads: opts.DeviceCompletionThreads), which builds a LocalMemoryDevice with latencyUs:0
+                // (a pure in-memory device, so we measure the Tsavorite pending codepaths rather than real device IO),
+                // and assigns it to kvSettings.LogDevice before TsavoriteKV is constructed. A derived benchmark that sets
+                // DeviceCompletionThreads = 0 selects parallelism:0 (inline completion: copy + callback on the submitting
+                // thread, no completion thread or ring handoff) — see LTM.RawStringOperations, which uses that to avoid the
+                // cross-thread/cross-socket handoff variance without any process/thread pinning.
                 //
                 // The only requirement on our side is that the precondition for that branch holds, so fail loudly on a
                 // misconfiguration that would otherwise be silently downgraded to a NullDevice (the non-tiered fallback).