Order multi-GPU test streams

shi-eric · shi-eric · commit c47be2de3b12 · 2026-05-20T16:11:54.000-07:00
* Order multi-GPU test streams

The unified-memory verification tests exercise real cross-GPU launches
after preparing source arrays on another CUDA device stream. The source
allocation and copy are stream-ordered work, but the peer launch stream
had no explicit dependency on that source stream.

Order the peer stream after the source stream before the checked launch.
For mempool-backed source arrays, keep normal peer access enabled in the
positive execution cases, while the companion rejection test disables
only mempool access. That keeps the real launch path covered and makes
mempool access the only verifier input that differs between the positive
and rejection tests, without running the peer kernel in a brittle
pool-access state where peer access is disabled.

Signed-off-by: Eric Shi <ershi@nvidia.com>

Approved-by: Eric Shi <ershi@nvidia.com>

See merge request omniverse/warp!2399
diff --git a/warp/tests/cuda/test_unified_memory.py b/warp/tests/cuda/test_unified_memory.py
@@ -482,7 +482,7 @@ def test_unified_memory_verify_uses_peer_access_for_default_cuda_allocations(sel
             self.assertTrue(wp.can_access(peer_device, src))
 
             wp.load_module(device=peer_device)
-            wp.synchronize_device(target_device)
+            peer_device.stream.wait_stream(target_device.stream)
             with launch_verification_mode(wp.LaunchVerificationMode.CHECKED):
                 wp.launch(read_cpu_write_gpu, dim=n, inputs=[src], outputs=[dst], device=peer_device)
 
@@ -518,7 +518,7 @@ def test_unified_memory_verify_uses_parent_allocator_for_default_cuda_slices(sel
             self.assertIs(src._ref, src_base)
 
             wp.load_module(device=peer_device)
-            wp.synchronize_device(target_device)
+            peer_device.stream.wait_stream(target_device.stream)
             with launch_verification_mode(wp.LaunchVerificationMode.CHECKED):
                 wp.launch(read_cpu_write_gpu, dim=n, inputs=[src], outputs=[dst], device=peer_device)
 
@@ -565,8 +565,11 @@ def test_unified_memory_verify_uses_mempool_access_for_cuda_mempool_allocations(
         """CUDA mempool allocations use mempool-access state for cross-GPU verification.
 
         An array allocated while the source device's mempool is enabled needs
-        the CUDA mempool access predicate. This test keeps peer access disabled
-        so acceptance can only come from the allocation-specific mempool rule.
+        the CUDA mempool access predicate. The companion rejection test keeps
+        peer access enabled while mempool access is disabled, so the pair
+        isolates the allocation-specific mempool rule without executing this
+        peer kernel in a recently changed pool-access state with peer access
+        disabled.
         """
 
         target_device, peer_device = get_cuda_device_pair_with_mempool_access_support()
@@ -575,7 +578,7 @@ def test_unified_memory_verify_uses_mempool_access_for_cuda_mempool_allocations(
         peer_access_saved = wp.is_peer_access_enabled(target_device, peer_device)
         mempool_access_saved = wp.is_mempool_access_enabled(target_device, peer_device)
         try:
-            wp.set_peer_access_enabled(target_device, peer_device, False)
+            wp.set_peer_access_enabled(target_device, peer_device, True)
             wp.set_mempool_access_enabled(target_device, peer_device, True)
 
             with wp.ScopedMempool(target_device, True):
@@ -586,7 +589,7 @@ def test_unified_memory_verify_uses_mempool_access_for_cuda_mempool_allocations(
             self.assertTrue(wp.can_access(peer_device, src))
 
             wp.load_module(device=peer_device)
-            wp.synchronize_device(target_device)
+            peer_device.stream.wait_stream(target_device.stream)
             with launch_verification_mode(wp.LaunchVerificationMode.CHECKED):
                 wp.launch(read_cpu_write_gpu, dim=n, inputs=[src], outputs=[dst], device=peer_device)
 
@@ -612,7 +615,7 @@ def test_unified_memory_verify_uses_parent_allocator_for_cuda_mempool_slices(sel
         peer_access_saved = wp.is_peer_access_enabled(target_device, peer_device)
         mempool_access_saved = wp.is_mempool_access_enabled(target_device, peer_device)
         try:
-            wp.set_peer_access_enabled(target_device, peer_device, False)
+            wp.set_peer_access_enabled(target_device, peer_device, True)
             wp.set_mempool_access_enabled(target_device, peer_device, True)
 
             with wp.ScopedMempool(target_device, True):
@@ -624,7 +627,7 @@ def test_unified_memory_verify_uses_parent_allocator_for_cuda_mempool_slices(sel
             self.assertIs(src._ref, src_base)
 
             wp.load_module(device=peer_device)
-            wp.synchronize_device(target_device)
+            peer_device.stream.wait_stream(target_device.stream)
             with launch_verification_mode(wp.LaunchVerificationMode.CHECKED):
                 wp.launch(read_cpu_write_gpu, dim=n, inputs=[src], outputs=[dst], device=peer_device)