Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion csrc/binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
auto config = read_benchmark_parameters(input_fd);
BenchmarkManager mgr(result_fd, config.Signature, config.Seed, discard, nvtx, landlock);
auto [args, expected] = mgr.setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
mgr.do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
mgr.do_bench_py(kernel_qualname, args, std::move(expected), reinterpret_cast<cudaStream_t>(stream));
}


Expand Down
9 changes: 6 additions & 3 deletions csrc/check.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@ __global__ void check_approx_match_kernel(unsigned* result, const Float* expecte
cuda::atomic_ref<unsigned, cuda::thread_scope_device> res(*result);
float a = static_cast<float>(expected[idx]);

// Nan is expected is wildcard for arbitrary results
if (isnan(a))
// NaN in the expected output must not behave as a wildcard.
// Treat it as a mismatch so malformed expectations fail closed.
if (isnan(a)) {
++res;
return;
}

#if __CUDA_ARCH__ >= 900
cudaGridDependencySynchronize();
Expand Down Expand Up @@ -180,4 +183,4 @@ void canaries(void* data, size_t size, unsigned seed, cudaStream_t stream) {
int grid_size = max_blocks * num_sms;
void *pArgs[] = { &data, &size, &seed};
CUDA_CHECK(cudaLaunchCooperativeKernel(&canaries_kernel, grid_size, block_size, pArgs, smem, stream));
}
}
15 changes: 12 additions & 3 deletions csrc/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ BenchmarkManager::ShadowArgument& BenchmarkManager::ShadowArgument::operator=(Sh
return *this;
}

void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream) {
void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std::vector<nb::tuple>& args, std::vector<nb::tuple> expected, cudaStream_t stream) {
if (args.size() < 5) {
throw std::runtime_error("Not enough test cases to run benchmark");
}
Expand Down Expand Up @@ -284,6 +284,11 @@ void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std
}
}

// The benchmark loop only needs the unmanaged output copies after this point.
// Release Python-held expected tuples before importing untrusted code.
expected.clear();
expected.shrink_to_fit();

// clean up as much python state as we can
trigger_gc();

Expand All @@ -295,10 +300,14 @@ void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std
// after this, we cannot trust python anymore
nb::callable kernel = kernel_from_qualname(kernel_qualname);

std::random_device warmup_rd;
std::mt19937 warmup_rng(warmup_rd());
std::uniform_int_distribution<int> warmup_dist(0, static_cast<int>(args.size()) - 1);

// ok, first run for compilations etc
nvtx_push("warmup");
CUDA_CHECK(cudaDeviceSynchronize());
kernel(*args.at(0));
kernel(*args.at(warmup_dist(warmup_rng)));
CUDA_CHECK(cudaDeviceSynchronize());
nvtx_pop();

Expand All @@ -312,7 +321,7 @@ void BenchmarkManager::do_bench_py(const std::string& kernel_qualname, const std
// this is only potentially problematic for in-place kernels;
CUDA_CHECK(cudaDeviceSynchronize());
clear_cache(stream);
kernel(*args.at(0));
kernel(*args.at(warmup_dist(warmup_rng)));
CUDA_CHECK(cudaDeviceSynchronize());
std::chrono::high_resolution_clock::time_point cpu_end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_seconds = cpu_end - cpu_start;
Expand Down
2 changes: 1 addition & 1 deletion csrc/manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class BenchmarkManager {
BenchmarkManager(int result_fd, std::string signature, std::uint64_t seed, bool discard, bool nvtx, bool landlock);
~BenchmarkManager();
std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats);
void do_bench_py(const std::string& kernel_qualname, const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);
void do_bench_py(const std::string& kernel_qualname, const std::vector<nb::tuple>& args, std::vector<nb::tuple> expected, cudaStream_t stream);
private:
struct Expected {
enum EMode {
Expand Down