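cleanup: rename output file to output pipe, check pipe/signature I/O errors, read the full result pipe, and validate benchmark output strictly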

ngc92 · ngc92 · commit 3032d236687b · 2026-03-04T02:29:13.000+01:00
diff --git a/csrc/manager.cpp b/csrc/manager.cpp
@@ -59,19 +59,32 @@ BenchmarkManager::BenchmarkManager(int result_fd, int signature_fd, std::uint64_
     CUDA_CHECK(cudaMalloc(&mDeviceDummyMemory, 2 * mL2CacheSize));
     // allocate a large arena (2MiB) to place the error counter in
     CUDA_CHECK(cudaMalloc(&mDeviceErrorBase, ArenaSize));
-    mOutputFile = fdopen(result_fd, "w");
+    mOutputPipe = fdopen(result_fd, "w");
+    if (!mOutputPipe) {
+        throw std::runtime_error("Could not open output pipe");
+    }
+
     mNVTXEnabled = nvtx;
     mDiscardCache = discard;
     mSeed = seed;
     char sig_buf[256];
     FILE* sig_file = fdopen(signature_fd, "r");
-    fgets(sig_buf, sizeof(sig_buf), sig_file);
+    if (!sig_file) {
+        throw std::runtime_error("Could not open signature pipe");
+    }
+    if (!fgets(sig_buf, sizeof(sig_buf), sig_file)) {
+        fclose(sig_file);
+        throw std::runtime_error("Could not read signature");
+    }
     fclose(sig_file);
     mSignature = std::string(sig_buf);
 }
 
 BenchmarkManager::~BenchmarkManager() {
-    fclose(mOutputFile);
+    if (mOutputPipe) {
+        fclose(mOutputPipe);
+        mOutputPipe = nullptr;
+    }
     cudaFree(mDeviceDummyMemory);
     cudaFree(mDeviceErrorBase);
     for (auto& event : mStartEvents) cudaEventDestroy(event);
@@ -315,7 +328,7 @@ void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std
     }
     std::sort(empty_event_times.begin(), empty_event_times.end());
     float median = empty_event_times.at(empty_event_times.size() / 2);
-    fprintf(mOutputFile, "event-overhead\t%f µs\n", median * 1000);
+    fprintf(mOutputPipe, "event-overhead\t%f µs\n", median * 1000);
 
     // create a randomized order for running the tests
     std::vector<int> test_order(actual_calls);
@@ -368,16 +381,16 @@ void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std
     error_count -= mErrorCountShift;
 
     if (error_count > 0) {
-        fprintf(mOutputFile, "error-count\t%u\n", error_count);
+        fprintf(mOutputPipe, "error-count\t%u\n", error_count);
     }
 
     for (int i = 0; i < actual_calls; i++) {
         float duration;
         CUDA_CHECK(cudaEventElapsedTime(&duration, mStartEvents.at(i), mEndEvents.at(i)));
-        fprintf(mOutputFile, "%d\t%f\n", test_order.at(i) - 1, duration * 1000);
+        fprintf(mOutputPipe, "%d\t%f\n", test_order.at(i) - 1, duration * 1000);
     }
-    fprintf(mOutputFile, "signature\t%s", mSignature.c_str());
-    fflush(mOutputFile);
+    fprintf(mOutputPipe, "signature\t%s\n", mSignature.c_str());
+    fflush(mOutputPipe);
 
     // cleanup events
     for (auto& event : mStartEvents) CUDA_CHECK(cudaEventDestroy(event));
diff --git a/csrc/manager.h b/csrc/manager.h
@@ -7,6 +7,7 @@
 
 #include <functional>
 #include <chrono>
+#include <cstdio>
 #include <fstream>
 #include <cuda_runtime.h>
 #include <optional>
@@ -66,7 +67,7 @@ class BenchmarkManager {
     std::uint64_t mSeed = -1;
     std::vector<Expected> mExpectedOutputs;
 
-    FILE* mOutputFile;
+    FILE* mOutputPipe = nullptr;
     std::string mSignature;
 
     static ShadowArgumentList make_shadow_args(const nb::tuple& args, cudaStream_t stream);
diff --git a/python/pygpubench/__init__.py b/python/pygpubench/__init__.py
@@ -1,17 +1,20 @@
 import dataclasses
 import math
-import multiprocessing
 import multiprocessing as mp
 import os
 import traceback
 import secrets
 
-from typing import Optional
+from typing import Optional, TYPE_CHECKING
 
 from . import _pygpubench
 from ._types import *
 from .utils import DeterministicContext
 
+if TYPE_CHECKING:
+    import multiprocessing.connection
+
+
 __all__ = [
     "do_bench_impl",
     "do_bench_isolated",
@@ -25,9 +28,9 @@
 ]
 
 
-def do_bench_impl(out_fd: "multiprocessing.Pipe", signature: "multiprocessing.Pipe", qualname: str, test_generator: TestGeneratorInterface,
+def do_bench_impl(out_fd: "multiprocessing.connection.Connection", signature: "multiprocessing.connection.Connection", qualname: str, test_generator: TestGeneratorInterface,
                   test_args: dict, repeats: int, seed: int, stream: int = None, discard: bool = True,
-                  nvtx: bool = False, tb_conn: "multiprocessing.Pipe" = None):
+                  nvtx: bool = False, tb_conn: "multiprocessing.connection.Connection" = None):
     """
     Benchmarks the kernel referred to by `qualname` against the test case returned by `test_generator`.
     :param out_fd: Writable file descriptor to which benchmark results are written.
@@ -119,6 +122,13 @@ def basic_stats(time_us: list[float]) -> BenchmarkStats:
     return BenchmarkStats(runs, len(time_us), fastest, slowest, median, mean, std, err)
 
 
+def read_all(fd: int) -> str:  # drain `fd` until EOF and return everything read, decoded as text
+    chunks = []
+    while chunk := os.read(fd, 65536):  # an empty read (EOF) is falsy and ends the loop
+        chunks.append(chunk)
+    return (b"".join(chunks)).decode()  # decode once after joining, so a character split across two reads still decodes
+
+
 def do_bench_isolated(
         qualname: str,
         test_generator: TestGeneratorInterface,
@@ -189,6 +199,7 @@ def do_bench_isolated(
     if process.is_alive():
         process.kill()
         process.join()
+        parent_tb_conn.close()
         result_parent.close()
         raise RuntimeError(
             f"Benchmark subprocess timed out after {timeout}s -- "
@@ -208,25 +219,37 @@ def do_bench_isolated(
         raise RuntimeError(msg)
 
     # Child has exited and closed its write-end, so this read is bounded.
-    raw = os.read(read_fd, _PIPE_CAPACITY)
+    response = read_all(read_fd)
     result_parent.close()
     parent_tb_conn.close()
 
     results = BenchmarkResult(None, [-1] * repeats, None, False)
     has_signature = False
-    for line in raw.decode().splitlines():
-        parts = line.strip().split('\t')
-        if len(parts) == 2 and parts[0].isdigit():
+    for line in response.splitlines():
+        line = line.strip()
+        if len(line) == 0:
+            continue
+        parts = line.split('\t')
+        if len(parts) != 2:
+            raise RuntimeError(f"Invalid benchmark output: {line}")
+        if has_signature:
+            raise RuntimeError(f"Unexpected output after signature: {line}")
+
+        if parts[0].isdigit():
             iteration = int(parts[0])
             time_us = float(parts[1])
+            if results.time_us[iteration] != -1:
+                raise RuntimeError(f"Duplicate iteration {iteration} in benchmark output")
             results.time_us[iteration] = time_us
         elif parts[0] == "event-overhead":
             results.event_overhead_us = float(parts[1].split()[0])
         elif parts[0] == "error-count":
+            if results.errors is not None:
+                raise RuntimeError(f"Duplicate error count in benchmark output")
             results.errors = int(parts[1])
         elif parts[0] == "signature":
             if signature != parts[1]:
-                raise AssertionError(f"Invalid signature")
+                raise RuntimeError("Benchmark subprocess output failed authentication: invalid signature")
             has_signature = True
     if not has_signature:
         raise RuntimeError(f"No signature found in output")