enabling CI for MPI tests, cleanup

fschlimb · fschlimb · commit 3f41e3c0cd10 · 2026-02-09T08:42:20.000-08:00
diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml
@@ -33,3 +33,9 @@ jobs:
         run: |
           export FILECHECK=FileCheck-18 # Ubuntu's llvm-dev appends a version number.
           uv run lit examples --verbose  # Makes sure to substitute FileCheck for $FILECHECK
+
+      - name: Run lit-enabled examples which use mpi as tests
+        run: |
+          export FILECHECK=FileCheck-18 # Ubuntu's llvm-dev appends a version number.
+          uv sync --extra runtime_mpi
+          uv run lit examples/mlp-mpi --verbose  # lit.cfg.py replaces FileCheck with the value of $FILECHECK
diff --git a/examples/mlp-mpi/mlp-mpi.py b/examples/mlp-mpi/mlp-mpi.py
@@ -1,4 +1,5 @@
-# RUN: %PYTHON %s | FileCheck %s
+# REQUIRES: mpi4py
+# RUN: mpirun -n 4 %PYTHON %s | FileCheck %s
 # CHECK: PASSED
 """
 A single MLP that can run on multiple MPI ranks,
@@ -30,6 +31,17 @@
 from mpi4py import MPI
 
 
+if not MPI.Is_initialized():  # NOTE(review): mpi4py normally initializes MPI on import - confirm this guard is needed
+    MPI.Init()
+P = MPI.COMM_WORLD.Get_size()  # number of ranks in MPI.COMM_WORLD
+R = MPI.COMM_WORLD.Get_rank()  # this process's rank in MPI.COMM_WORLD
+
+
+def rprint(*args, **kwargs):  # print() wrapper: forwards all arguments, but only on rank 0
+    if R == 0:  # R is the module-level rank obtained from MPI.COMM_WORLD.Get_rank()
+        print(*args, **kwargs)
+
+
 def parse_cla():
     parser = argparse.ArgumentParser(
         description="MLP on MPI using MLIR",
@@ -40,7 +52,7 @@ def parse_cla():
         "-s",
         type=int,
         nargs=3,
-        default=[4096, 4096, 4096],
+        default=[64, 128, 32],
         help="M,N,K matrix sizes (Activations=MxK, WeightsIn=KxN, WeightsOut=MxN, Result=MxK).",
     )
     parser.add_argument(
@@ -97,7 +109,7 @@ def __init__(self, args, P: int, R: int):
         self.verbose = args.verbose
 
     def _alloc_inout(self, execution_engine: ExecutionEngine) -> list[ctypes.Structure]:
-        print(" * Allocating input/output arrays...")
+        rprint(" * Allocating input/output arrays...")
         memrefs = [
             make_nd_memref_descriptor(2, as_ctype(self.dtype))() for _ in range(4)
         ]
@@ -106,7 +118,7 @@ def _alloc_inout(self, execution_engine: ExecutionEngine) -> list[ctypes.Structu
         return memrefs
 
     def _init_inout(self, r: np.ndarray, a: np.ndarray, b: np.ndarray, c: np.ndarray):
-        print(" * Initializing input arrays...")
+        rprint(" * Initializing input arrays...")
         np.random.seed(self.R)
         # R = ranked_memref_to_numpy([r])
         A = ranked_memref_to_numpy([a])
@@ -128,7 +140,7 @@ def allocate_inputs(self, execution_engine: ExecutionEngine):
             pass
 
     def _reference_solution(self, execution_engine: ExecutionEngine) -> np.ndarray:
-        print(" * Gathering input data...")
+        rprint(" * Gathering input data...")
         gathered = []
         for i, v in enumerate(["act", "win", "wout"]):
             memref = make_nd_memref_descriptor(2, as_ctype(self.dtype))()
@@ -139,7 +151,7 @@ def _reference_solution(self, execution_engine: ExecutionEngine) -> np.ndarray:
             )
             gathered.append(ranked_memref_to_numpy([memref]))
 
-        print(" * Computing reference solution...")
+        rprint(" * Computing reference solution...")
 
         def sigmoid(z):
             return 1 / (1 + np.exp(-z))
@@ -153,15 +165,16 @@ def check_correctness(
         R = ranked_memref_to_numpy([self._input_arrays[0]])
         R_ref = self._reference_solution(execution_engine)
         if verbose > 1:
-            print("Reference solution:")
-            print(R_ref)
-            print("Computed solution:")
-            print(R)
+            rprint("Reference solution:")
+            rprint(R_ref)
+            rprint("Computed solution:")
+            rprint(R)
         success = np.allclose(R, R_ref)
+        success = MPI.COMM_WORLD.allreduce(success, op=MPI.LAND)
         if success:
-            print("PASSED")
+            rprint("PASSED")
         else:
-            print("FAILED Result mismatch!")
+            rprint("FAILED Result mismatch!")
         return success
 
     def shared_libs(self) -> list[str]:
@@ -182,7 +195,7 @@ def get_complexity(self) -> tuple[int, int, int]:
 
     def payload_module(self) -> ir.Module:
         if self.griddims == 1:
-            print(f"Using 1D grid of size {self.P}")
+            rprint(f"Using 1D grid of size {self.P}")
             grid = self.P
         elif self.griddims == 2:
             # find two factors of P that are as close as possible
@@ -193,14 +206,14 @@ def find_factors(n):
                 return (1, n)
 
             p1, p2 = find_factors(self.P)
-            print(f"Using 2D grid of size {p1}x{p2}")
+            rprint(f"Using 2D grid of size {p1}x{p2}")
             grid = f"{p1}x{p2}"
         else:
             raise ValueError(
                 f"Only 1D and 2D grids are supported (not {self.griddims}d).\n"
             )
 
-        fname = "mlp_weight_stationary.mlir"
+        fname = Path(__file__).parent / "mlp_weight_stationary.mlir"
         with open(fname, "r") as f:
             txt = f.read()
 
@@ -247,10 +260,10 @@ def find_factors(n):
         txt = txt.format_map(format_values)
 
         if self.verbose > 1:
-            print("Payload MLIR:")
+            rprint("Payload MLIR:")
             count = 1
             for line in txt.splitlines():
-                print(str(count) + "\t" + line)
+                rprint(str(count) + "\t" + line)
                 count += 1
 
         return ir.Module.parse(txt)
@@ -340,22 +353,22 @@ def schedule_module(
     with ir.Context(), ir.Location.unknown():
         wload = DistMLP(args, P, R)
 
-        print(" Execute".center(60, "-"))
+        rprint(" Execute".center(60, "-"))
         execute(wload, verbose=args.verbose)
 
-        # print(" Execute 2 ".center(60, "-"))
+        # rprint(" Execute 2 ".center(60, "-"))
         # execute(wload, verbose=1)
 
-        # print(" Benchmark ".center(60, "-"))
+        # rprint(" Benchmark ".center(60, "-"))
         # times = benchmark(wload)
         # times *= 1e6  # convert to microseconds
         # compute statistics
         # mean = np.mean(times)
         # min = np.min(times)
         # max = np.max(times)
         # std = np.std(times)
-        # print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}")
+        # rprint(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}")
         # flop_count = wload.get_complexity()[0]
         # gflops = flop_count / (mean * 1e-6) / 1e9
-        # print(f"Throughput: {gflops:.2f} GFLOPS")
+        # rprint(f"Throughput: {gflops:.2f} GFLOPS")
     MPI.Finalize()
diff --git a/examples/mlp-mpi/mlp_weight_stationary.mlir b/examples/mlp-mpi/mlp_weight_stationary.mlir
@@ -84,11 +84,6 @@ module attributes {{mpi.dlti = #dlti.map<"MPI:Implementation" = "MPICH", "MPI:co
     return %ret_a : tensor<{M}x{K}xf32>
   }}
 
-  // func.func @gather(%t:tensor<5x3xi32>) -> tensor<5x12xi32> attributes {{llvm.emit_c_interface}} {{
-  //   %r = shard.all_gather %t on @grid0 grid_axes = [0] gather_axis = 1 : tensor<5x3xi32> -> tensor<5x12xi32>
-  //   return %r : tensor<5x12xi32>
-  // }}
-
   func.func @gather_act(%arg0: tensor<{M}x{K}xf32>) -> tensor<{M}x{K}xf32> attributes {{llvm.emit_c_interface}} {{
     %sharding = shard.sharding @grid0 split_axes = {split_act} : !shard.sharding
     %sharding_g = shard.sharding @grid0 split_axes = [[]] : !shard.sharding
diff --git a/lit.cfg.py b/lit.cfg.py
@@ -20,8 +20,9 @@
 if filecheck_path := os.environ.get("FILECHECK"):
     config.substitutions.append(("FileCheck", filecheck_path))
 
-if importlib.util.find_spec("torch"):
-    config.available_features.add("torch")
+for pkg in ["torch", "mpi4py", "mpich", "openmpi", "impi-rt"]:  # each name found below is registered as a lit feature
+    if importlib.util.find_spec(pkg):  # NOTE(review): find_spec resolves module names; "impi-rt" is a distribution name and presumably never matches - confirm
+        config.available_features.add(pkg)
 
 torch_kernels_dir = project_root + "/third_party/KernelBench/KernelBench"
 if os.path.isdir(torch_kernels_dir):
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,7 +17,7 @@ dev = [
 
 [project.optional-dependencies]
 ingress_torch_mlir = [
-    "torch-mlir==20260125.703",
+    "torch-mlir==20260209.718",
     "ml_dtypes",
 ]
 # Additional "targets" which pull in optional dependencies -- use `uv sync --extra TARGET`
@@ -39,6 +39,10 @@ ingress_torch_xpu = [
     "pytorch_triton_xpu",  # Transitive dependency listed explicitly so that we can state which package repository it is supposed to come from
     "lighthouse[ingress_torch_mlir]"
 ]
+runtime_mpi = [
+    "mpi4py",
+    "impi-rt"
+]
 
 [tool.uv]
 # Declare that the following "targets" are mutually exclusive of one another