diff --git a/csrc/binding.cpp b/csrc/binding.cpp
index c68abaa..ce13972 100644
--- a/csrc/binding.cpp
+++ b/csrc/binding.cpp
@@ -15,7 +15,7 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
     auto config = read_benchmark_parameters(input_fd);
     BenchmarkManager mgr(result_fd, config.Signature, config.Seed, discard, nvtx, landlock);
     auto [args, expected] = mgr.setup_benchmark(nb::cast(test_generator), test_kwargs, config.Repeats);
-    mgr.do_bench_py(kernel_qualname, args, expected, reinterpret_cast(stream));
+    mgr.do_bench_py(kernel_qualname, args, std::move(expected), reinterpret_cast(stream));
 }
diff --git a/csrc/check.cu b/csrc/check.cu
index e11099f..a6b648b 100644
--- a/csrc/check.cu
+++ b/csrc/check.cu
@@ -41,9 +41,12 @@ __global__ void check_approx_match_kernel(unsigned* result, const Float* expecte
     cuda::atomic_ref res(*result);
     float a = static_cast(expected[idx]);
-    // Nan is expected is wildcard for arbitrary results
-    if (isnan(a))
+    // NaN in the expected output must not behave as a wildcard.
+    // Treat it as a mismatch so malformed expectations fail closed.
+    if (isnan(a)) {
+        ++res; return;
+    }
 #if __CUDA_ARCH__ >= 900
     cudaGridDependencySynchronize();
@@ -180,4 +183,4 @@ void canaries(void* data, size_t size, unsigned seed, cudaStream_t stream) {
     int grid_size = max_blocks * num_sms;
     void *pArgs[] = { &data, &size, &seed};
     CUDA_CHECK(cudaLaunchCooperativeKernel(&canaries_kernel, grid_size, block_size, pArgs, smem, stream));
-}
\ No newline at end of file
+}
diff --git a/csrc/manager.cpp b/csrc/manager.cpp
index 6540eb3..60982e0 100644
--- a/csrc/manager.cpp
+++ b/csrc/manager.cpp
@@ -240,7 +240,7 @@ BenchmarkManager::ShadowArgument& BenchmarkManager::ShadowArgument::operator=(Sh
     return *this;
 }
-void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std::vector& args, const std::vector& expected, cudaStream_t stream) {
+void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std::vector& args, std::vector expected, cudaStream_t stream) {
     if (args.size() < 5) {
         throw std::runtime_error("Not enough test cases to run benchmark");
     }
@@ -284,6 +284,11 @@ void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std
         }
     }
+    // The benchmark loop only needs the unmanaged output copies after this point.
+    // Release Python-held expected tuples before importing untrusted code.
+    expected.clear();
+    expected.shrink_to_fit();
+
     // clean up as much python state as we can
     trigger_gc();
@@ -295,10 +300,14 @@ void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std
     // after this, we cannot trust python anymore
     nb::callable kernel = kernel_from_qualname(kernel_qualname);
+    std::random_device warmup_rd;
+    std::mt19937 warmup_rng(warmup_rd());
+    std::uniform_int_distribution warmup_dist(0, static_cast(args.size()) - 1);
+
     // ok, first run for compilations etc
     nvtx_push("warmup");
     CUDA_CHECK(cudaDeviceSynchronize());
-    kernel(*args.at(0));
+    kernel(*args.at(warmup_dist(warmup_rng)));
     CUDA_CHECK(cudaDeviceSynchronize());
     nvtx_pop();
@@ -312,7 +321,7 @@ void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std
     // this is only potentially problematic for in-place kernels;
     CUDA_CHECK(cudaDeviceSynchronize());
     clear_cache(stream);
-    kernel(*args.at(0));
+    kernel(*args.at(warmup_dist(warmup_rng)));
     CUDA_CHECK(cudaDeviceSynchronize());
     std::chrono::high_resolution_clock::time_point cpu_end = std::chrono::high_resolution_clock::now();
     std::chrono::duration elapsed_seconds = cpu_end - cpu_start;
diff --git a/csrc/manager.h b/csrc/manager.h
index 1e087d2..d98aeb6 100644
--- a/csrc/manager.h
+++ b/csrc/manager.h
@@ -31,7 +31,7 @@ class BenchmarkManager {
     BenchmarkManager(int result_fd, std::string signature, std::uint64_t seed, bool discard, bool nvtx, bool landlock);
     ~BenchmarkManager();
     std::pair, std::vector> setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats);
-    void do_bench_py(const std::string& kernel_qualname, const std::vector& args, const std::vector& expected, cudaStream_t stream);
+    void do_bench_py(const std::string& kernel_qualname, const std::vector& args, std::vector expected, cudaStream_t stream);
   private:
     struct Expected {
         enum EMode {