Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,32 +11,34 @@ try to exploit benchmarking flaws to receive higher scores.
To benchmark a kernel, two ingredients are needed:
1. A function that _generates_ the kernel. This function takes no arguments and returns a callable. It is important that
untrusted code, e.g., the user-supplied python module, is only imported inside this function.
2. A function that generates test/benchmark inputs. This function takes a tuple of configuration parameters, as well as an
integer to seed the rng, as arguments. It returns two tuples: The first contains the inputs for the kernel and will
be used to call the kernel function, and the second contains the expected output and the required absolute and relative tolerance.
2. A function that generates test/benchmark inputs. This function takes configuration parameters and
an integer seed for the RNG as arguments. It returns three tuples:
- inputs: kernel inputs
- outputs: output tensors passed to the kernel
- expected: one expected spec per output tensor

```python
import torch
import pygpubench

def generate_input(*args):
def generate_input(size, seed):
...

def reference_kernel(args):
...

def generate_test_case(args, seed):
x, y = generate_input(*args, seed)
def generate_test_case(size, seed):
x, y = generate_input(size, seed)
expected = torch.empty_like(y)
reference_kernel((expected, x))
return (y, x), (expected, 1e-6, 1e-6)
return (x,), (y,), ((expected, 1e-6, 1e-6),)


def kernel_generator():
import submission
return submission.kernel

res = pygpubench.do_bench_isolated(kernel_generator, generate_test_case, (1024,), 100, 5, discard=True)
res = pygpubench.do_bench_isolated(kernel_generator, generate_test_case, {"size": 1024}, 100, 5, discard=True)
print("❌" if res.errors else "✅", pygpubench.basic_stats(res.time_us))
```
For the full example see [grayscale.py](test/grayscale.py)
Expand Down
4 changes: 2 additions & 2 deletions csrc/binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ namespace nb = nanobind;

void do_bench(std::string target_file, const nb::object& kernel_generator, const nb::object& test_generator, const nb::dict& test_kwargs, int repeats, std::uint64_t seed, std::uintptr_t stream, bool discard, bool unlink, bool nvtx) {
BenchmarkManager mgr(std::move(target_file), seed, discard, unlink, nvtx);
auto [args, expected] = mgr.setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, repeats);
mgr.do_bench_py(nb::cast<nb::callable>(kernel_generator), args, expected, reinterpret_cast<cudaStream_t>(stream));
auto [args, outputs, expected] = mgr.setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, repeats);
mgr.do_bench_py(nb::cast<nb::callable>(kernel_generator), args, outputs, expected, reinterpret_cast<cudaStream_t>(stream));
}


Expand Down
127 changes: 93 additions & 34 deletions csrc/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,17 @@ BenchmarkManager::~BenchmarkManager() {
cudaFree(mDeviceErrorCounter);
for (auto& event : mStartEvents) cudaEventDestroy(event);
for (auto& event : mEndEvents) cudaEventDestroy(event);
for (auto& exp: mExpectedOutputs) cudaFree(exp.Value);
for (auto& expected_per_test : mExpectedOutputs) {
for (auto& exp : expected_per_test) cudaFree(exp.Value);
}
}

std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> BenchmarkManager::setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats) {
std::tuple<std::vector<nb::tuple>, std::vector<nb::tuple>, std::vector<nb::tuple>> BenchmarkManager::setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats) {
std::mt19937_64 rng(mSeed);
std::uniform_int_distribution<std::uint64_t> dist(0, std::numeric_limits<std::uint64_t>::max());
// generate one more input to handle warmup
std::vector<nb::tuple> kernel_args(repeats + 1);
std::vector<nb::tuple> call_args(repeats + 1);
std::vector<nb::tuple> outputs(repeats + 1);
std::vector<nb::tuple> expected(repeats + 1);
for (int i = 0; i < repeats + 1; i++) {
// create new copy of the kwargs dict
Expand All @@ -74,23 +77,41 @@ std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> BenchmarkManager::setu
call_kwargs["seed"] = dist(rng);

auto gen = nb::cast<nb::tuple>(generate_test_case(**call_kwargs));
kernel_args[i] = nb::cast<nb::tuple>(gen[0]);
expected[i] = nb::cast<nb::tuple>(gen[1]);
if (gen.size() != 3) {
throw std::runtime_error("generate_test_case must return a 3-tuple: (inputs, outputs, expected)");
}

nb::tuple inputs = nb::cast<nb::tuple>(gen[0]);
outputs[i] = nb::cast<nb::tuple>(gen[1]);
expected[i] = nb::cast<nb::tuple>(gen[2]);

if (outputs[i].size() == 0) {
throw std::runtime_error("outputs tuple must not be empty");
}
if (expected[i].size() != outputs[i].size()) {
throw std::runtime_error("expected tuple size must match outputs tuple size");
}

PyObject* combined = PySequence_Concat(outputs[i].ptr(), inputs.ptr());
if (combined == nullptr) {
throw nb::python_error();
}
call_args[i] = nb::steal<nb::tuple>(combined);
}
return std::make_pair(std::move(kernel_args), std::move(expected));
return std::make_tuple(std::move(call_args), std::move(outputs), std::move(expected));
}

bool can_convert_to_tensor(nb::handle obj) {
return nb::isinstance<nb_cuda_array>(obj);
}

auto BenchmarkManager::make_shadow_args(const nb::tuple& args, cudaStream_t stream) -> std::vector<std::optional<ShadowArgument>> {
auto BenchmarkManager::make_shadow_args(const nb::tuple& args, std::size_t first_input_idx, cudaStream_t stream) -> std::vector<std::optional<ShadowArgument>> {
std::vector<std::optional<ShadowArgument>> shadow_args(args.size());
int nargs = args.size();
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<unsigned> canary_seed_dist(0, 0xffffffff);
for (int i = 1; i < nargs; i++) {
for (std::size_t i = first_input_idx; i < static_cast<std::size_t>(nargs); i++) {
if (can_convert_to_tensor(args[i])) {
nb_cuda_array arr = nb::cast<nb_cuda_array>(args[i]);
void* shadow;
Expand Down Expand Up @@ -131,6 +152,39 @@ void BenchmarkManager::validate_result(Expected& expected, const nb_cuda_array&
}
}

// Parse one user-supplied "expected" spec into an Expected record.
// A spec is either a bare CUDA array (exact match) or a tuple:
//   (array,)            -> exact match
//   (array, rtol, atol) -> approximate match with the given tolerances
// Note the element order: index 1 is the relative tolerance, index 2 the absolute.
// Throws std::runtime_error on a malformed spec tuple.
BenchmarkManager::Expected BenchmarkManager::parse_expected_spec(const nb::handle& obj) {
nb_cuda_array expected_array;
// Defaults: exact comparison, zero tolerances.
auto mode = BenchmarkManager::Expected::ExactMatch;
float rtol = 0.f;
float atol = 0.f;

if (nb::isinstance<nb_cuda_array>(obj)) {
// Bare array: exact-match spec.
expected_array = nb::cast<nb_cuda_array>(obj);
} else {
// Otherwise the spec must be a tuple of size 1 or 3.
nb::tuple expected_tuple = nb::cast<nb::tuple>(obj);
if (expected_tuple.size() == 0) {
throw std::runtime_error("Expected spec tuple must not be empty");
}
if (expected_tuple.size() != 1 && expected_tuple.size() != 3) {
throw std::runtime_error("Expected spec tuple must have size 1 or 3");
}
expected_array = nb::cast<nb_cuda_array>(expected_tuple[0]);
if (expected_tuple.size() == 3) {
// Tolerances: tuple[1] = rtol, tuple[2] = atol.
rtol = nb::cast<float>(expected_tuple[1]);
atol = nb::cast<float>(expected_tuple[2]);
mode = BenchmarkManager::Expected::ApproxMatch;
}
}

// copy expected values into memory not owned by torch, then wipe original
// Zeroing the caller-visible buffer keeps the expected values out of reach
// of cheating submissions that would otherwise read them back at runtime.
void* copy_mem;
CUDA_CHECK(cudaMalloc(&copy_mem, expected_array.nbytes()));
CUDA_CHECK(cudaMemcpy(copy_mem, expected_array.data(), expected_array.nbytes(), cudaMemcpyDeviceToDevice));
CUDA_CHECK(cudaMemset(expected_array.data(), 0, expected_array.nbytes()));

// Ownership of copy_mem transfers to the returned Expected (stored in
// mExpectedOutputs); it is released via cudaFree in ~BenchmarkManager.
return {mode, copy_mem, expected_array.nbytes(), expected_array.dtype(), atol, rtol};
}

void BenchmarkManager::clear_cache(cudaStream_t stream) {
::clear_cache(mDeviceDummyMemory, 2 * mL2CacheSize, mDiscardCache, stream);
}
Expand All @@ -155,47 +209,50 @@ BenchmarkManager::ShadowArgument& BenchmarkManager::ShadowArgument::operator=(Sh
return *this;
}

void BenchmarkManager::do_bench_py(const nb::callable& kernel_generator, const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream) {
void BenchmarkManager::do_bench_py(
const nb::callable& kernel_generator,
const std::vector<nb::tuple>& args,
const std::vector<nb::tuple>& output_tuples,
const std::vector<nb::tuple>& expected,
cudaStream_t stream
) {
if (args.size() < 5) {
throw std::runtime_error("Not enough test cases to run benchmark");
}
if (expected.size() != args.size()) {
throw std::runtime_error("Expected results and test case list do not have the same length");
if (output_tuples.size() != args.size() || expected.size() != args.size()) {
throw std::runtime_error("Expected results, outputs, and test case lists do not have the same length");
}
int calls = args.size() - 1;

// extract relevant infos from args and expected
// by convention, the first arg is the output tensor.
// TODO handle multiple outputs
std::vector<nb_cuda_array> outputs(args.size());
// extract relevant infos from outputs and expected
std::vector<std::vector<nb_cuda_array>> outputs(args.size());
for (int i = 0; i < args.size(); i++) {
outputs.at(i) = nb::cast<nb_cuda_array>(args.at(i)[0]);
const nb::tuple& output_tuple = output_tuples.at(i);
outputs.at(i).reserve(output_tuple.size());
for (int j = 0; j < output_tuple.size(); j++) {
outputs.at(i).push_back(nb::cast<nb_cuda_array>(output_tuple[j]));
}
}

// Generate "shadow" copies of input arguments
std::vector<ShadowArgumentList> shadow_arguments;
for (const auto & arg : args) {
shadow_arguments.emplace_back(make_shadow_args(arg, stream));
for (int i = 0; i < args.size(); i++) {
shadow_arguments.emplace_back(make_shadow_args(args.at(i), outputs.at(i).size(), stream));
}

for (auto& expected_per_test : mExpectedOutputs) {
for (auto& exp : expected_per_test) cudaFree(exp.Value);
}
mExpectedOutputs.clear();
mExpectedOutputs.resize(args.size());
for (int i = 0; i < args.size(); i++) {
const nb::tuple& expected_tuple = expected.at(i);
nb_cuda_array expected_array = nb::cast<nb_cuda_array>(expected_tuple[0]);

// make a copy of the expected result and put it in memory not owned by torch; overwrite the original
// so it cannot be read by cheating solutions.
void* copy_mem;
CUDA_CHECK(cudaMalloc(&copy_mem, expected_array.nbytes()));
CUDA_CHECK(cudaMemcpy(copy_mem, expected_array.data(), expected_array.nbytes(), cudaMemcpyDeviceToDevice));
CUDA_CHECK(cudaMemset(expected_array.data(), 0, expected_array.nbytes()));

if (expected.at(i).size() == 1) {
mExpectedOutputs.at(i) = {Expected::ExactMatch, copy_mem, expected_array.nbytes(), expected_array.dtype(), 0.f, 0.f};
} else {
float rtol = nb::cast<float>(expected_tuple[1]);
float atol = nb::cast<float>(expected_tuple[2]);
mExpectedOutputs.at(i) = {Expected::ApproxMatch, copy_mem, expected_array.nbytes(), expected_array.dtype(), atol, rtol};
if (expected_tuple.size() != outputs.at(i).size()) {
throw std::runtime_error("Expected tuple size must match outputs tuple size");
}
mExpectedOutputs.at(i).reserve(expected_tuple.size());
for (int j = 0; j < expected_tuple.size(); j++) {
mExpectedOutputs.at(i).push_back(parse_expected_spec(expected_tuple[j]));
}
}

Expand Down Expand Up @@ -324,7 +381,9 @@ void BenchmarkManager::do_bench_py(const nb::callable& kernel_generator, const s
CUDA_CHECK(cudaEventRecord(mEndEvents.at(i), stream));
// immediately after the kernel, launch the checking code; if there is some unsynced work done on another stream,
// this increases the chance of detection.
validate_result(mExpectedOutputs.at(test_id), outputs.at(test_id), check_seed_generator(rng), stream);
for (std::size_t j = 0; j < outputs.at(test_id).size(); j++) {
validate_result(mExpectedOutputs.at(test_id).at(j), outputs.at(test_id).at(j), check_seed_generator(rng), stream);
}
}
nvtx_pop();

Expand Down
47 changes: 27 additions & 20 deletions csrc/manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <functional>
#include <chrono>
#include <fstream>
#include <tuple>
#include <cuda_runtime.h>
#include <optional>
#include <nanobind/nanobind.h>
Expand All @@ -21,9 +22,30 @@ class BenchmarkManager {
public:
BenchmarkManager(std::string result_file, std::uint64_t seed, bool discard, bool unlink, bool nvtx);
~BenchmarkManager();
std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats);
void do_bench_py(const nb::callable& kernel_generator, const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);
std::tuple<std::vector<nb::tuple>, std::vector<nb::tuple>, std::vector<nb::tuple>>
setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats);
void do_bench_py(
const nb::callable& kernel_generator,
const std::vector<nb::tuple>& args,
const std::vector<nb::tuple>& outputs,
const std::vector<nb::tuple>& expected,
cudaStream_t stream
);
private:
double mWarmupSeconds = 1.0;
double mBenchmarkSeconds = 1.0;

std::vector<cudaEvent_t> mStartEvents;
std::vector<cudaEvent_t> mEndEvents;

std::chrono::high_resolution_clock::time_point mCPUStart;

int* mDeviceDummyMemory = nullptr;
int mL2CacheSize;
unsigned* mDeviceErrorCounter = nullptr;
bool mNVTXEnabled = false;
bool mDiscardCache = true;
std::uint64_t mSeed = -1;
struct Expected {
enum EMode {
ExactMatch,
Expand All @@ -48,29 +70,14 @@ class BenchmarkManager {

using ShadowArgumentList = std::vector<std::optional<ShadowArgument>>;

double mWarmupSeconds = 1.0;
double mBenchmarkSeconds = 1.0;

std::vector<cudaEvent_t> mStartEvents;
std::vector<cudaEvent_t> mEndEvents;

std::chrono::high_resolution_clock::time_point mCPUStart;

int* mDeviceDummyMemory = nullptr;
int mL2CacheSize;
unsigned* mDeviceErrorCounter = nullptr;
bool mNVTXEnabled = false;
bool mDiscardCache = true;
std::uint64_t mSeed = -1;
std::vector<Expected> mExpectedOutputs;

std::vector<std::vector<Expected>> mExpectedOutputs;
std::ofstream mOutputFile;

static ShadowArgumentList make_shadow_args(const nb::tuple& args, cudaStream_t stream);
static ShadowArgumentList make_shadow_args(const nb::tuple& args, std::size_t first_input_idx, cudaStream_t stream);
static Expected parse_expected_spec(const nb::handle& obj);

void nvtx_push(const char* name);
void nvtx_pop();

void validate_result(Expected& expected, const nb_cuda_array& result, unsigned seed, cudaStream_t stream);
void clear_cache(cudaStream_t stream);
};
Expand Down
8 changes: 5 additions & 3 deletions exploits/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@ def generate_test_case(size, seed):
gen.manual_seed(seed)

x = torch.rand(size, size, 3, device="cuda", dtype=torch.float32, generator=gen).contiguous()
y = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()
y_gray = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()
y_red = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()

weights = torch.tensor([0.2989, 0.5870, 0.1140], device="cuda", dtype=torch.float32)
expected = torch.sum(x * weights, dim=-1).contiguous()
expected_gray = torch.sum(x * weights, dim=-1).contiguous()
expected_red = x[..., 0].contiguous()

return (y, x), (expected, 1e-6, 1e-6)
return (x,), (y_gray, y_red), ((expected_gray, 1e-6, 1e-6), expected_red)


def kernel_generator(module_name):
Expand Down
2 changes: 1 addition & 1 deletion exploits/submission_combined.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
_in_benchmark = False


def kernel(output, inp):
def kernel(output_gray, output_red, inp):
"""
During warmup: sleep to inflate time_estimate (fewer benchmark iterations).
During benchmark: no-op (correctness bypassed by NaN injection).
Expand Down
5 changes: 3 additions & 2 deletions exploits/submission_correct.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@

_weights = torch.tensor([0.2989, 0.5870, 0.1140], device="cuda", dtype=torch.float32)

def kernel(output, inp):
torch.sum(inp * _weights, dim=-1, out=output)
def kernel(output_gray, output_red, inp):
torch.sum(inp * _weights, dim=-1, out=output_gray)
output_red.copy_(inp[..., 0])
7 changes: 4 additions & 3 deletions exploits/submission_error_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,12 @@ def _zero_loop():
_cudart.cudaMemset(ctypes.c_void_p(_counter_ptr), 0, 4)


def kernel(output, inp):
"""Wrong kernel that zeros the output."""
def kernel(output_gray, output_red, inp):
"""Wrong kernel that zeros the output. Relies on error counter being zeroed."""
global _zero_thread, _keep_zeroing

output.zero_()
output_gray.zero_()
output_red.zero_()

# Start persistent zeroing thread on first kernel call
if _zero_thread is None or not _zero_thread.is_alive():
Expand Down
5 changes: 3 additions & 2 deletions exploits/submission_fd_overwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def _overwrite_results():
print("[FD Overwrite] Registered atexit handler", file=sys.stderr)


def kernel(output, inp):
def kernel(output_gray, output_red, inp):
"""Wrong kernel (zeros output). Results will be overwritten by atexit handler."""
output.zero_()
output_gray.zero_()
output_red.zero_()
Loading