Skip to content

Commit 908e6af

Browse files
Josu San Martin
authored and committed
Harden checker and warmup behavior
1 parent 283c9b6 commit 908e6af

File tree

4 files changed

+20
-8
lines changed

4 files changed

+20
-8
lines changed

csrc/binding.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
1515
auto config = read_benchmark_parameters(input_fd);
1616
BenchmarkManager mgr(result_fd, config.Signature, config.Seed, discard, nvtx, landlock);
1717
auto [args, expected] = mgr.setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
18-
mgr.do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
18+
mgr.do_bench_py(kernel_qualname, args, std::move(expected), reinterpret_cast<cudaStream_t>(stream));
1919
}
2020

2121

csrc/check.cu

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,12 @@ __global__ void check_approx_match_kernel(unsigned* result, const Float* expecte
4141
cuda::atomic_ref<unsigned, cuda::thread_scope_device> res(*result);
4242
float a = static_cast<float>(expected[idx]);
4343

44-
// Nan is expected is wildcard for arbitrary results
45-
if (isnan(a))
44+
// NaN in the expected output must not behave as a wildcard.
45+
// Treat it as a mismatch so malformed expectations fail closed.
46+
if (isnan(a)) {
47+
++res;
4648
return;
49+
}
4750

4851
#if __CUDA_ARCH__ >= 900
4952
cudaGridDependencySynchronize();
@@ -180,4 +183,4 @@ void canaries(void* data, size_t size, unsigned seed, cudaStream_t stream) {
180183
int grid_size = max_blocks * num_sms;
181184
void *pArgs[] = { &data, &size, &seed};
182185
CUDA_CHECK(cudaLaunchCooperativeKernel(&canaries_kernel, grid_size, block_size, pArgs, smem, stream));
183-
}
186+
}

csrc/manager.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ BenchmarkManager::ShadowArgument& BenchmarkManager::ShadowArgument::operator=(Sh
240240
return *this;
241241
}
242242

243-
void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream) {
243+
void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std::vector<nb::tuple>& args, std::vector<nb::tuple> expected, cudaStream_t stream) {
244244
if (args.size() < 5) {
245245
throw std::runtime_error("Not enough test cases to run benchmark");
246246
}
@@ -284,6 +284,11 @@ void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std
284284
}
285285
}
286286

287+
// The benchmark loop only needs the unmanaged output copies after this point.
288+
// Release Python-held expected tuples before importing untrusted code.
289+
expected.clear();
290+
expected.shrink_to_fit();
291+
287292
// clean up as much python state as we can
288293
trigger_gc();
289294

@@ -295,10 +300,14 @@ void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std
295300
// after this, we cannot trust python anymore
296301
nb::callable kernel = kernel_from_qualname(kernel_qualname);
297302

303+
std::random_device warmup_rd;
304+
std::mt19937 warmup_rng(warmup_rd());
305+
std::uniform_int_distribution<int> warmup_dist(0, static_cast<int>(args.size()) - 1);
306+
298307
// ok, first run for compilations etc
299308
nvtx_push("warmup");
300309
CUDA_CHECK(cudaDeviceSynchronize());
301-
kernel(*args.at(0));
310+
kernel(*args.at(warmup_dist(warmup_rng)));
302311
CUDA_CHECK(cudaDeviceSynchronize());
303312
nvtx_pop();
304313

@@ -312,7 +321,7 @@ void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std
312321
// this is only potentially problematic for in-place kernels;
313322
CUDA_CHECK(cudaDeviceSynchronize());
314323
clear_cache(stream);
315-
kernel(*args.at(0));
324+
kernel(*args.at(warmup_dist(warmup_rng)));
316325
CUDA_CHECK(cudaDeviceSynchronize());
317326
std::chrono::high_resolution_clock::time_point cpu_end = std::chrono::high_resolution_clock::now();
318327
std::chrono::duration<double> elapsed_seconds = cpu_end - cpu_start;

csrc/manager.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class BenchmarkManager {
3131
BenchmarkManager(int result_fd, std::string signature, std::uint64_t seed, bool discard, bool nvtx, bool landlock);
3232
~BenchmarkManager();
3333
std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats);
34-
void do_bench_py(const std::string& kernel_qualname, const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);
34+
void do_bench_py(const std::string& kernel_qualname, const std::vector<nb::tuple>& args, std::vector<nb::tuple> expected, cudaStream_t stream);
3535
private:
3636
struct Expected {
3737
enum EMode {

0 commit comments

Comments
 (0)