Skip to content

Commit 10dc329

Browse files
Make the remaining cache tests device agnostic (#9528)
Currently these tests are written in a way that isn't device agnostic. To fix:

* update the key into `device_caches` to be `getattr(torch, device).current_device()`, like the rest of the file.
* add the pytest fixture `device` to `test_module_load_unload`.

# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [ ] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [x] This PR does not need a test because `it updates existing tests`.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent 15f9161 commit 10dc329

1 file changed

Lines changed: 12 additions & 7 deletions

File tree

python/test/unit/runtime/test_cache.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -693,7 +693,9 @@ def kernel(Y, fn: tl.constexpr, fn_args):
693693
kernel[(1, )](y[2], func3, (3, ))
694694
kernel[(1, )](y[3], func4, (3, 4))
695695
kernel[(1, )](y[4], func1, tuple())
696-
assert len(kernel.device_caches[0][0]) == 4
696+
697+
device = getattr(torch, device).current_device()
698+
assert len(kernel.device_caches[device][0]) == 4
697699
assert y.tolist() == [1, 2, 3, 7, 1]
698700

699701

@@ -747,19 +749,21 @@ def kernel(Y, a: tl.constexpr):
747749
kernel.warmup(b, 0, grid=(1, ))
748750
kernel.warmup(b, 1, grid=(1, ))
749751

752+
device = getattr(torch, device).current_device()
753+
750754
# Nothing has actually compiled yet
751-
assert len(kernel.device_caches[0][0]) == 4
755+
assert len(kernel.device_caches[device][0]) == 4
752756
assert len(pool.work_queue) == 4
753757

754758
# Duplicates are only submitted once
755759
kernel.warmup(a, 0, grid=(1, ))
756760
kernel.warmup(a, 1, grid=(1, ))
757-
assert len(kernel.device_caches[0][0]) == 4
761+
assert len(kernel.device_caches[device][0]) == 4
758762
assert len(pool.work_queue) == 4
759763

760764
pool.run_one()
761765
kernel[(1, )](a, 0)
762-
assert len(kernel.device_caches[0][0]) == 4
766+
assert len(kernel.device_caches[device][0]) == 4
763767
assert a[0, 0] == 0.0
764768

765769
pool.run_all()
@@ -782,7 +786,8 @@ def kernel(Y, a: tl.constexpr):
782786
kernel.warmup(b, 0, grid=(1, ))
783787
kernel.warmup(b, 1, grid=(1, ))
784788

785-
assert len(kernel.device_caches[0][0]) == 4
789+
device = getattr(torch, device).current_device()
790+
assert len(kernel.device_caches[device][0]) == 4
786791

787792
kernel[(1, )](b, 1)
788793
assert b[0, 0] == 1
@@ -894,7 +899,7 @@ def inc_counter(*args, **kwargs):
894899
assert output.item() == 31
895900

896901

897-
def test_module_load_unload(fresh_knobs):
902+
def test_module_load_unload(device, fresh_knobs):
898903

899904
@triton.jit
900905
def kernel(out_ptr, val) -> None:
@@ -912,7 +917,7 @@ def module_unload(*args, **kwargs):
912917
gc.disable()
913918
triton.knobs.runtime.module_unload_hook.add(module_unload)
914919

915-
out = torch.randn(1, dtype=torch.float32, device='cuda')
920+
out = torch.randn(1, dtype=torch.float32, device=device)
916921
pre_compile = kernel.warmup(out, 1, grid=(1, ))
917922
pre_compile._init_handles()
918923

0 commit comments

Comments (0)