Skip to content

Commit 63a834d

Browse files
authored
bugfix: Ninja race condition fix (#2339)
<!-- .github/pull_request_template.md -->

## 📌 Description

**Fix race condition in JIT compilation for multi-GPU/multi-process environments**

This PR resolves a race condition in multi-process environments where concurrent Ninja builds contended for a shared `.ninja_log` file (`ninja: error: opening build log: No such file or directory`), causing intermittent failures when multiple processes compile different FlashInfer modules simultaneously.

Key changes:

1. Isolated workdirs for runtime JIT: Each module now builds in its own subdirectory (`cached_ops/<module>/`), isolating `.ninja_log` files between concurrent builds
2. Absolute output paths: Ninja build files use absolute paths for object and output files, ensuring correct output locations regardless of workdir
3. Preserved AOT parallelism: Batch builds retain the `subninja` approach, maintaining full parallel compilation

| Aspect | Before | After |
| -- | -- | -- |
| Runtime JIT workdir | Shared `cached_ops/` | Isolated `cached_ops/<module>/` |
| AOT batch workdir | `cached_ops/` | `cached_ops/` (unchanged) |
| `.ninja_log` conflict | Yes (race condition) | No (different workdir levels) |
| Ninja output paths | Relative (`$name/$name.so`) | Absolute (`/path/to/cached_ops/<module>/<module>.so`) |
| Output locations | `cached_ops/<module>/<module>.so` | `cached_ops/<module>/<module>.so` (unchanged) |
| AOT parallelism | Full (`subninja`) | Full (`subninja`) - preserved |

<!-- What does this PR do? Briefly describe the changes and why they’re needed. -->

## 🔍 Related Issues

<!-- Link any related issues here -->

#2338

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

<!-- Optional: anything you'd like reviewers to focus on, concerns, etc. -->

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

* **Refactor**
  * Build system now isolates per-spec build outputs into dedicated build directories and runs per-spec compilation with top-level artifact targets to avoid cross-spec interference and race conditions.
* **New Features**
  * Added a public per-spec `build_dir` property to configure where each specification’s build outputs are placed.

<sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub>

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
1 parent 0e9a89d commit 63a834d

2 files changed

Lines changed: 17 additions & 7 deletions

File tree

flashinfer/jit/core.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -227,6 +227,10 @@ class JitSpec:
227227
def ninja_path(self) -> Path:
228228
return jit_env.FLASHINFER_JIT_DIR / self.name / "build.ninja"
229229

230+
@property
231+
def build_dir(self) -> Path:
232+
return jit_env.FLASHINFER_JIT_DIR / self.name
233+
230234
@property
231235
def jit_library_path(self) -> Path:
232236
return jit_env.FLASHINFER_JIT_DIR / self.name / f"{self.name}.so"
@@ -238,7 +242,7 @@ def get_library_path(self) -> Path:
238242

239243
def get_object_paths(self) -> List[Path]:
240244
object_paths = []
241-
jit_dir = self.jit_library_path.parent
245+
jit_dir = self.build_dir
242246
for source in self.sources:
243247
is_cuda = source.suffix == ".cu"
244248
object_suffix = ".cuda.o" if is_cuda else ".o"
@@ -264,7 +268,7 @@ def lock_path(self) -> Path:
264268

265269
def write_ninja(self) -> None:
266270
ninja_path = self.ninja_path
267-
ninja_path.parent.mkdir(parents=True, exist_ok=True)
271+
self.build_dir.mkdir(parents=True, exist_ok=True)
268272
content = generate_ninja_build_for_op(
269273
name=self.name,
270274
sources=self.sources,
@@ -295,7 +299,7 @@ def build(self, verbose: bool, need_lock: bool = True) -> None:
295299
# Write ninja file if it doesn't exist (deferred case)
296300
if not self.is_ninja_generated:
297301
self.write_ninja()
298-
run_ninja(jit_env.FLASHINFER_JIT_DIR, self.ninja_path, verbose)
302+
run_ninja(self.build_dir, self.ninja_path, verbose)
299303

300304
def load(self, so_path: Path):
301305
return tvm_ffi.load_module(str(so_path))
@@ -362,7 +366,7 @@ def expand_flags(
362366
nvcc = os.environ.get("FLASHINFER_NVCC", f"{cuda_home}/bin/nvcc")
363367

364368
# Build directory
365-
build_dir = str(self.jit_library_path.parent.resolve())
369+
build_dir = str(self.build_dir.resolve())
366370

367371
# Generate entries for each source file
368372
compile_commands = []

flashinfer/jit/cpp_ext.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -276,20 +276,26 @@ def generate_ninja_build_for_op(
276276
]
277277
)
278278

279+
# Use absolute paths for outputs so ninja files work with any workdir
280+
# This enables isolated workdirs for runtime JIT (avoiding .ninja_log races)
281+
# while still supporting subninja for parallel AOT builds
282+
output_dir = jit_env.FLASHINFER_JIT_DIR / name
283+
279284
objects = []
280285
for source in sources:
281286
is_cuda = source.suffix == ".cu"
282287
object_suffix = ".cuda.o" if is_cuda else ".o"
283288
cmd = "cuda_compile" if is_cuda else "compile"
284289
obj_name = source.with_suffix(object_suffix).name
285-
obj = f"$name/{obj_name}"
290+
obj = str((output_dir / obj_name).resolve())
286291
objects.append(obj)
287292
lines.append(f"build {obj}: {cmd} {source.resolve()}")
288293

289294
lines.append("")
290295
link_rule = "nvcc_link" if needs_device_linking else "link"
291-
lines.append(f"build $name/$name.so: {link_rule} " + " ".join(objects))
292-
lines.append("default $name/$name.so")
296+
output_so = str((output_dir / f"{name}.so").resolve())
297+
lines.append(f"build {output_so}: {link_rule} " + " ".join(objects))
298+
lines.append(f"default {output_so}")
293299
lines.append("")
294300

295301
return "\n".join(lines)

0 commit comments

Comments
 (0)