diff --git a/README.md b/README.md index 9053da8..19aad35 100644 --- a/README.md +++ b/README.md @@ -11,32 +11,34 @@ try to exploit benchmarking flaws to receive higher scores. To benchmark a kernel, two ingredients are needed: 1. A function that _generates_ the kernel. This function takes no arguments and returns a callable. It is important that untrusted code, e.g., the user-supplied python module, is only imported inside this function. -2. A function that generates test/benchmark inputs. This function takes a tuple of configuration parameters, as well as an - integer to seed the rng, as arguments. It returns two tuples: The first contains the inputs for the kernel and will - be used to call the kernel function, and the second contains the expected output and the required absolute and relative tolerance. +2. A function that generates test/benchmark inputs. This function takes configuration parameters and + an integer seed for rng as arguments. It returns three tuples: + - inputs: kernel inputs + - outputs: output tensors passed to the kernel + - expected: one expected spec per output tensor ```python import torch import pygpubench -def generate_input(*args): +def generate_input(size, seed): ... def reference_kernel(args): ... -def generate_test_case(args, seed): - x, y = generate_input(*args, seed) +def generate_test_case(size, seed): + x, y = generate_input(size, seed) expected = torch.empty_like(y) reference_kernel((expected, x)) - return (y, x), (expected, 1e-6, 1e-6) + return (x,), (y,), ((expected, 1e-6, 1e-6),) def kernel_generator(): import submission return submission.kernel -res = pygpubench.do_bench_isolated(kernel_generator, generate_test_case, (1024,), 100, 5, discard=True) +res = pygpubench.do_bench_isolated(kernel_generator, generate_test_case, {"size": 1024}, 100, 5, discard=True) print("❌" if res.errors else "✅", pygpubench.basic_stats(res.time_us)) ``` For the full example see [grayscale.py](test/grayscale.py) diff --git a/csrc/binding.cpp b/csrc/binding.cpp index 9b0abde..a994a35 100644 --- a/csrc/binding.cpp +++ b/csrc/binding.cpp @@ -13,8 +13,8 @@ namespace nb = nanobind; void do_bench(std::string target_file, const nb::object& kernel_generator, const nb::object& test_generator, const nb::dict& test_kwargs, int repeats, std::uint64_t seed, std::uintptr_t stream, bool discard, bool unlink, bool nvtx) { BenchmarkManager mgr(std::move(target_file), seed, discard, unlink, nvtx); - auto [args, expected] = mgr.setup_benchmark(nb::cast(test_generator), test_kwargs, repeats); - mgr.do_bench_py(nb::cast(kernel_generator), args, expected, reinterpret_cast(stream)); + auto [args, outputs, expected] = mgr.setup_benchmark(nb::cast(test_generator), test_kwargs, repeats); + mgr.do_bench_py(nb::cast(kernel_generator), args, outputs, expected, reinterpret_cast(stream)); } diff --git a/csrc/manager.cpp b/csrc/manager.cpp index 326d77c..6d96676 100644 --- a/csrc/manager.cpp +++ b/csrc/manager.cpp @@ -52,14 +52,17 @@ BenchmarkManager::~BenchmarkManager() { cudaFree(mDeviceErrorCounter); for (auto& event : mStartEvents) cudaEventDestroy(event); for (auto& event : mEndEvents) cudaEventDestroy(event); - for (auto& exp: mExpectedOutputs) cudaFree(exp.Value); + for (auto& expected_per_test : mExpectedOutputs) { + for (auto& exp : expected_per_test) cudaFree(exp.Value); + } } -std::pair, std::vector> BenchmarkManager::setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats) { +std::tuple, std::vector, std::vector> BenchmarkManager::setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats) { std::mt19937_64 rng(mSeed); std::uniform_int_distribution dist(0, std::numeric_limits::max()); // generate one more input to handle warmup - std::vector kernel_args(repeats + 1); + std::vector call_args(repeats + 1); + std::vector outputs(repeats + 1); std::vector expected(repeats + 1); for (int i = 0; i < repeats + 1; i++) { // create new copy of the kwargs dict @@ -74,23 +77,41 @@ std::pair, std::vector> BenchmarkManager::setu call_kwargs["seed"] = dist(rng); auto gen = nb::cast(generate_test_case(**call_kwargs)); - kernel_args[i] = nb::cast(gen[0]); - expected[i] = nb::cast(gen[1]); + if (gen.size() != 3) { + throw std::runtime_error("generate_test_case must return a 3-tuple: (inputs, outputs, expected)"); + } + + nb::tuple inputs = nb::cast(gen[0]); + outputs[i] = nb::cast(gen[1]); + expected[i] = nb::cast(gen[2]); + + if (outputs[i].size() == 0) { + throw std::runtime_error("outputs tuple must not be empty"); + } + if (expected[i].size() != outputs[i].size()) { + throw std::runtime_error("expected tuple size must match outputs tuple size"); + } + + PyObject* combined = PySequence_Concat(outputs[i].ptr(), inputs.ptr()); + if (combined == nullptr) { + throw nb::python_error(); + } + call_args[i] = nb::steal(combined); } - return std::make_pair(std::move(kernel_args), std::move(expected)); + return std::make_tuple(std::move(call_args), std::move(outputs), std::move(expected)); } bool can_convert_to_tensor(nb::handle obj) { return nb::isinstance(obj); } -auto BenchmarkManager::make_shadow_args(const nb::tuple& args, cudaStream_t stream) -> std::vector> { +auto BenchmarkManager::make_shadow_args(const nb::tuple& args, std::size_t first_input_idx, cudaStream_t stream) -> std::vector> { std::vector> shadow_args(args.size()); int nargs = args.size(); std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution canary_seed_dist(0, 0xffffffff); - for (int i = 1; i < nargs; i++) { + for (std::size_t i = first_input_idx; i < static_cast(nargs); i++) { if (can_convert_to_tensor(args[i])) { nb_cuda_array arr = nb::cast(args[i]); void* shadow; @@ -131,6 +152,39 @@ void BenchmarkManager::validate_result(Expected& expected, const nb_cuda_array& } } +BenchmarkManager::Expected BenchmarkManager::parse_expected_spec(const nb::handle& obj) { + nb_cuda_array expected_array; + auto mode = BenchmarkManager::Expected::ExactMatch; + float rtol = 0.f; + float atol = 0.f; + + if (nb::isinstance(obj)) { + expected_array = nb::cast(obj); + } else { + nb::tuple expected_tuple = nb::cast(obj); + if (expected_tuple.size() == 0) { + throw std::runtime_error("Expected spec tuple must not be empty"); + } + if (expected_tuple.size() != 1 && expected_tuple.size() != 3) { + throw std::runtime_error("Expected spec tuple must have size 1 or 3"); + } + expected_array = nb::cast(expected_tuple[0]); + if (expected_tuple.size() == 3) { + rtol = nb::cast(expected_tuple[1]); + atol = nb::cast(expected_tuple[2]); + mode = BenchmarkManager::Expected::ApproxMatch; + } + } + + // copy expected values into memory not owned by torch, then wipe original + void* copy_mem; + CUDA_CHECK(cudaMalloc(©_mem, expected_array.nbytes())); + CUDA_CHECK(cudaMemcpy(copy_mem, expected_array.data(), expected_array.nbytes(), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemset(expected_array.data(), 0, expected_array.nbytes())); + + return {mode, copy_mem, expected_array.nbytes(), expected_array.dtype(), atol, rtol}; +} + void BenchmarkManager::clear_cache(cudaStream_t stream) { ::clear_cache(mDeviceDummyMemory, 2 * mL2CacheSize, mDiscardCache, stream); } @@ -155,47 +209,50 @@ BenchmarkManager::ShadowArgument& BenchmarkManager::ShadowArgument::operator=(Sh return *this; } -void BenchmarkManager::do_bench_py(const nb::callable& kernel_generator, const std::vector& args, const std::vector& expected, cudaStream_t stream) { +void BenchmarkManager::do_bench_py( + const nb::callable& kernel_generator, + const std::vector& args, + const std::vector& output_tuples, + const std::vector& expected, + cudaStream_t stream +) { if (args.size() < 5) { throw std::runtime_error("Not enough test cases to run benchmark"); } - if (expected.size() != args.size()) { - throw std::runtime_error("Expected results and test case list do not have the same length"); + if (output_tuples.size() != args.size() || expected.size() != args.size()) { + throw std::runtime_error("Expected results, outputs, and test case lists do not have the same length"); } int calls = args.size() - 1; - // extract relevant infos from args and expected - // by convention, the first arg is the output tensor. - // TODO handle multiple outputs - std::vector outputs(args.size()); + // extract relevant infos from outputs and expected + std::vector> outputs(args.size()); for (int i = 0; i < args.size(); i++) { - outputs.at(i) = nb::cast(args.at(i)[0]); + const nb::tuple& output_tuple = output_tuples.at(i); + outputs.at(i).reserve(output_tuple.size()); + for (int j = 0; j < output_tuple.size(); j++) { + outputs.at(i).push_back(nb::cast(output_tuple[j])); + } } // Generate "shadow" copies of input arguments std::vector shadow_arguments; - for (const auto & arg : args) { - shadow_arguments.emplace_back(make_shadow_args(arg, stream)); + for (int i = 0; i < args.size(); i++) { + shadow_arguments.emplace_back(make_shadow_args(args.at(i), outputs.at(i).size(), stream)); } + for (auto& expected_per_test : mExpectedOutputs) { + for (auto& exp : expected_per_test) cudaFree(exp.Value); + } + mExpectedOutputs.clear(); mExpectedOutputs.resize(args.size()); for (int i = 0; i < args.size(); i++) { const nb::tuple& expected_tuple = expected.at(i); - nb_cuda_array expected_array = nb::cast(expected_tuple[0]); - - // make a copy of the expected result and put it in memory not owned by torch; overwrite the original - // so it cannot be read by cheating solutions. - void* copy_mem; - CUDA_CHECK(cudaMalloc(©_mem, expected_array.nbytes())); - CUDA_CHECK(cudaMemcpy(copy_mem, expected_array.data(), expected_array.nbytes(), cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaMemset(expected_array.data(), 0, expected_array.nbytes())); - - if (expected.at(i).size() == 1) { - mExpectedOutputs.at(i) = {Expected::ExactMatch, copy_mem, expected_array.nbytes(), expected_array.dtype(), 0.f, 0.f}; - } else { - float rtol = nb::cast(expected_tuple[1]); - float atol = nb::cast(expected_tuple[2]); - mExpectedOutputs.at(i) = {Expected::ApproxMatch, copy_mem, expected_array.nbytes(), expected_array.dtype(), atol, rtol}; + if (expected_tuple.size() != outputs.at(i).size()) { + throw std::runtime_error("Expected tuple size must match outputs tuple size"); + } + mExpectedOutputs.at(i).reserve(expected_tuple.size()); + for (int j = 0; j < expected_tuple.size(); j++) { + mExpectedOutputs.at(i).push_back(parse_expected_spec(expected_tuple[j])); } } @@ -324,7 +381,9 @@ void BenchmarkManager::do_bench_py(const nb::callable& kernel_generator, const s CUDA_CHECK(cudaEventRecord(mEndEvents.at(i), stream)); // immediately after the kernel, launch the checking code; if there is some unsynced work done on another stream, // this increases the chance of detection. - validate_result(mExpectedOutputs.at(test_id), outputs.at(test_id), check_seed_generator(rng), stream); + for (std::size_t j = 0; j < outputs.at(test_id).size(); j++) { + validate_result(mExpectedOutputs.at(test_id).at(j), outputs.at(test_id).at(j), check_seed_generator(rng), stream); + } } nvtx_pop(); diff --git a/csrc/manager.h b/csrc/manager.h index 64400f5..88dc815 100644 --- a/csrc/manager.h +++ b/csrc/manager.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -21,9 +22,30 @@ class BenchmarkManager { public: BenchmarkManager(std::string result_file, std::uint64_t seed, bool discard, bool unlink, bool nvtx); ~BenchmarkManager(); - std::pair, std::vector> setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats); - void do_bench_py(const nb::callable& kernel_generator, const std::vector& args, const std::vector& expected, cudaStream_t stream); + std::tuple, std::vector, std::vector> + setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats); + void do_bench_py( + const nb::callable& kernel_generator, + const std::vector& args, + const std::vector& outputs, + const std::vector& expected, + cudaStream_t stream + ); private: + double mWarmupSeconds = 1.0; + double mBenchmarkSeconds = 1.0; + + std::vector mStartEvents; + std::vector mEndEvents; + + std::chrono::high_resolution_clock::time_point mCPUStart; + + int* mDeviceDummyMemory = nullptr; + int mL2CacheSize; + unsigned* mDeviceErrorCounter = nullptr; + bool mNVTXEnabled = false; + bool mDiscardCache = true; + std::uint64_t mSeed = -1; struct Expected { enum EMode { ExactMatch, @@ -48,29 +70,14 @@ class BenchmarkManager { using ShadowArgumentList = std::vector>; - double mWarmupSeconds = 1.0; - double mBenchmarkSeconds = 1.0; - - std::vector mStartEvents; - std::vector mEndEvents; - - std::chrono::high_resolution_clock::time_point mCPUStart; - - int* mDeviceDummyMemory = nullptr; - int mL2CacheSize; - unsigned* mDeviceErrorCounter = nullptr; - bool mNVTXEnabled = false; - bool mDiscardCache = true; - std::uint64_t mSeed = -1; - std::vector mExpectedOutputs; - + std::vector> mExpectedOutputs; std::ofstream mOutputFile; - static ShadowArgumentList make_shadow_args(const nb::tuple& args, cudaStream_t stream); + static ShadowArgumentList make_shadow_args(const nb::tuple& args, std::size_t first_input_idx, cudaStream_t stream); + static Expected parse_expected_spec(const nb::handle& obj); void nvtx_push(const char* name); void nvtx_pop(); - void validate_result(Expected& expected, const nb_cuda_array& result, unsigned seed, cudaStream_t stream); void clear_cache(cudaStream_t stream); }; diff --git a/exploits/benchmark.py b/exploits/benchmark.py index 1e16823..9cd0987 100644 --- a/exploits/benchmark.py +++ b/exploits/benchmark.py @@ -12,12 +12,14 @@ def generate_test_case(size, seed): gen.manual_seed(seed) x = torch.rand(size, size, 3, device="cuda", dtype=torch.float32, generator=gen).contiguous() - y = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous() + y_gray = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous() + y_red = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous() weights = torch.tensor([0.2989, 0.5870, 0.1140], device="cuda", dtype=torch.float32) - expected = torch.sum(x * weights, dim=-1).contiguous() + expected_gray = torch.sum(x * weights, dim=-1).contiguous() + expected_red = x[..., 0].contiguous() - return (y, x), (expected, 1e-6, 1e-6) + return (x,), (y_gray, y_red), ((expected_gray, 1e-6, 1e-6), expected_red) def kernel_generator(module_name): diff --git a/exploits/submission_combined.py b/exploits/submission_combined.py index 2a4a905..9115e62 100644 --- a/exploits/submission_combined.py +++ b/exploits/submission_combined.py @@ -41,7 +41,7 @@ _in_benchmark = False -def kernel(output, inp): +def kernel(output_gray, output_red, inp): """ During warmup: sleep to inflate time_estimate (fewer benchmark iterations). During benchmark: no-op (correctness bypassed by NaN injection). diff --git a/exploits/submission_correct.py b/exploits/submission_correct.py index 677e14c..24babde 100644 --- a/exploits/submission_correct.py +++ b/exploits/submission_correct.py @@ -5,5 +5,6 @@ _weights = torch.tensor([0.2989, 0.5870, 0.1140], device="cuda", dtype=torch.float32) -def kernel(output, inp): - torch.sum(inp * _weights, dim=-1, out=output) +def kernel(output_gray, output_red, inp): + torch.sum(inp * _weights, dim=-1, out=output_gray) + output_red.copy_(inp[..., 0]) diff --git a/exploits/submission_error_counter.py b/exploits/submission_error_counter.py index 53f06bf..932fd55 100644 --- a/exploits/submission_error_counter.py +++ b/exploits/submission_error_counter.py @@ -106,11 +106,12 @@ def _zero_loop(): _cudart.cudaMemset(ctypes.c_void_p(_counter_ptr), 0, 4) -def kernel(output, inp): - """Wrong kernel that zeros the output.""" +def kernel(output_gray, output_red, inp): + """Wrong kernel that zeros the output. Relies on error counter being zeroed.""" global _zero_thread, _keep_zeroing - output.zero_() + output_gray.zero_() + output_red.zero_() # Start persistent zeroing thread on first kernel call if _zero_thread is None or not _zero_thread.is_alive(): diff --git a/exploits/submission_fd_overwrite.py b/exploits/submission_fd_overwrite.py index 67558d1..51afc43 100644 --- a/exploits/submission_fd_overwrite.py +++ b/exploits/submission_fd_overwrite.py @@ -90,6 +90,7 @@ def _overwrite_results(): print("[FD Overwrite] Registered atexit handler", file=sys.stderr) -def kernel(output, inp): +def kernel(output_gray, output_red, inp): """Wrong kernel (zeros output). Results will be overwritten by atexit handler.""" - output.zero_() + output_gray.zero_() + output_red.zero_() diff --git a/exploits/submission_gc_copy.py b/exploits/submission_gc_copy.py index cc574cd..f893db9 100644 --- a/exploits/submission_gc_copy.py +++ b/exploits/submission_gc_copy.py @@ -4,7 +4,7 @@ Severity: CRITICAL Strategy: - At import time, use gc to find both kernel arg tuples (output, input) and + At import time, use gc to find both kernel arg tuples (output0, output1, input) and expected result tuples (expected_tensor, rtol, atol). Build a mapping from output tensor data_ptr -> expected tensor. In the kernel, just copy the expected answer to the output. @@ -28,33 +28,34 @@ expected_tuples = [] for obj in gc.get_objects(): - if isinstance(obj, tuple): - if len(obj) == 2: - try: - a, b = obj - if (isinstance(a, torch.Tensor) and a.is_cuda - and isinstance(b, torch.Tensor) and b.is_cuda - and a.ndim == 2 and b.ndim == 3): # output is 2D, input is 3D (grayscale) - kernel_arg_tuples.append(obj) - except Exception: - pass - elif len(obj) == 3: - try: - a, b, c = obj - if (isinstance(a, torch.Tensor) and a.is_cuda - and isinstance(b, float) and isinstance(c, float) - and 0 < b < 1 and 0 < c < 1): - expected_tuples.append(obj) - except Exception: - pass + if isinstance(obj, tuple) and len(obj) == 3: + try: + a, b, c = obj + if (isinstance(a, torch.Tensor) and a.is_cuda + and isinstance(b, torch.Tensor) and b.is_cuda + and isinstance(c, torch.Tensor) and c.is_cuda + and a.ndim == 2 and b.ndim == 2 and c.ndim == 3): + kernel_arg_tuples.append(obj) + except Exception: + pass + try: + a, b, c = obj + if (isinstance(a, torch.Tensor) and a.is_cuda + and isinstance(b, float) and isinstance(c, float) + and 0 < b < 1 and 0 < c < 1): + expected_tuples.append(obj) + except Exception: + pass print(f"[GC Copy] Found {len(kernel_arg_tuples)} kernel arg tuples, {len(expected_tuples)} expected tuples", file=sys.stderr) - # Match by allocation order (heuristic) - for i in range(min(len(kernel_arg_tuples), len(expected_tuples))): - output_tensor = kernel_arg_tuples[i][0] - expected_tensor = expected_tuples[i][0] - _answer_map[output_tensor.data_ptr()] = expected_tensor + # Match by allocation order (heuristic) and flatten both outputs. + output_tensors = [] + for output0, output1, _ in kernel_arg_tuples: + output_tensors.append(output0) + output_tensors.append(output1) + for i in range(min(len(output_tensors), len(expected_tuples))): + _answer_map[output_tensors[i].data_ptr()] = expected_tuples[i][0] print(f"[GC Copy] Built answer map with {len(_answer_map)} entries", file=sys.stderr) @@ -62,13 +63,17 @@ print(f"[GC Copy] Setup failed: {e}", file=sys.stderr) -def kernel(output, inp): - """Copy the expected answer directly to the output tensor.""" - expected = _answer_map.get(output.data_ptr()) - if expected is not None: - output.copy_(expected) +def kernel(output_gray, output_red, inp): + """Copy expected answers directly to output tensors.""" + expected_gray = _answer_map.get(output_gray.data_ptr()) + expected_red = _answer_map.get(output_red.data_ptr()) + if expected_gray is not None: + output_gray.copy_(expected_gray) else: - # Fallback: compute correctly (defeats the purpose but avoids crash) import torch - weights = torch.tensor([0.2989, 0.5870, 0.1140], device=output.device, dtype=output.dtype) - torch.sum(inp * weights, dim=-1, out=output) + weights = torch.tensor([0.2989, 0.5870, 0.1140], device=output_gray.device, dtype=output_gray.dtype) + torch.sum(inp * weights, dim=-1, out=output_gray) + if expected_red is not None: + output_red.copy_(expected_red) + else: + output_red.copy_(inp[..., 0]) diff --git a/exploits/submission_gc_nan.py b/exploits/submission_gc_nan.py index 5e392b8..7bce962 100644 --- a/exploits/submission_gc_nan.py +++ b/exploits/submission_gc_nan.py @@ -35,6 +35,6 @@ print(f"[GC NaN] Poisoned {_poisoned} expected output tensors with NaN", file=sys.stderr) -def kernel(output, inp): +def kernel(output_gray, output_red, inp): """No-op kernel. Correctness checking is bypassed by NaN wildcards.""" pass diff --git a/exploits/submission_thread_fd.py b/exploits/submission_thread_fd.py index c6f8717..a50e6b2 100644 --- a/exploits/submission_thread_fd.py +++ b/exploits/submission_thread_fd.py @@ -115,6 +115,7 @@ def _atexit_overwrite(): atexit.register(_atexit_overwrite) -def kernel(output, inp): +def kernel(output_gray, output_red, inp): """Wrong kernel. Results will be overwritten by thread or atexit.""" - output.zero_() + output_gray.zero_() + output_red.zero_() diff --git a/exploits/submission_warmup.py b/exploits/submission_warmup.py index 085df55..92d74e0 100644 --- a/exploits/submission_warmup.py +++ b/exploits/submission_warmup.py @@ -27,7 +27,7 @@ _benchmark_calls = 0 -def kernel(output, inp): +def kernel(output_gray, output_red, inp): global _call_count, _last_ptr, _warmup_detected, _benchmark_calls _call_count += 1 @@ -49,4 +49,5 @@ def kernel(output, inp): # Always compute correctly (this exploit only manipulates timing/iterations) import torch weights = torch.tensor([0.2989, 0.5870, 0.1140], device=inp.device, dtype=inp.dtype) - torch.sum(inp * weights, dim=-1, out=output) + torch.sum(inp * weights, dim=-1, out=output_gray) + output_red.copy_(inp[..., 0]) diff --git a/exploits/submission_wrong.py b/exploits/submission_wrong.py index 36a9548..3393f45 100644 --- a/exploits/submission_wrong.py +++ b/exploits/submission_wrong.py @@ -2,5 +2,6 @@ CONTROL: Wrong kernel (fills output with zeros). Should FAIL correctness checks. """ -def kernel(output, inp): - output.zero_() +def kernel(output_gray, output_red, inp): + output_gray.zero_() + output_red.zero_() diff --git a/python/pygpubench/__init__.py b/python/pygpubench/__init__.py index 90f69c4..e828dc1 100644 --- a/python/pygpubench/__init__.py +++ b/python/pygpubench/__init__.py @@ -21,6 +21,7 @@ "KernelFunction", "KernelGeneratorInterface", "TestGeneratorInterface", + "ExpectedSpec", "ExpectedResult", ] @@ -32,7 +33,8 @@ def do_bench_impl(out_file: str, kernel_generator: KernelGeneratorInterface, tes Benchmarks the kernel returned by `kernel_generator` against the test case returned by `test_generator`. :param out_file: File in which to write the benchmark results. :param kernel_generator: A function that takes no arguments and returns a kernel function. - :param test_generator: A function that takes the test arguments (including a seed) and returns a test case; i.e., a tuple of (input, expected) + :param test_generator: A function that takes the test arguments (including a seed) and returns a test case; + i.e., a tuple of (inputs, outputs, expected). :param test_args: keyword arguments to be passed to `test_generator`. Seed will be generated automatically. :param repeats: Number of times to repeat the benchmark. `test_generator` will be called `repeats` times. :param stream: Cuda stream on which to run the benchmark. If not given, torch's current stream is selected diff --git a/python/pygpubench/_types.py b/python/pygpubench/_types.py index bbe24ac..405e4d9 100644 --- a/python/pygpubench/_types.py +++ b/python/pygpubench/_types.py @@ -1,10 +1,11 @@ from typing import Callable, Tuple Tensor = "torch.Tensor" -ExpectedResult = Tuple[Tensor] | Tuple[Tensor, float, float] +ExpectedSpec = Tensor | Tuple[Tensor] | Tuple[Tensor, float, float] +ExpectedResult = Tuple[ExpectedSpec, ...] KernelFunction = Callable[..., None] KernelGeneratorInterface = Callable[[], KernelFunction] -TestGeneratorInterface = Callable[..., Tuple[Tuple, ExpectedResult]] +TestGeneratorInterface = Callable[..., Tuple[Tuple, Tuple, ExpectedResult]] -__all__ = ["KernelFunction", "KernelGeneratorInterface", "TestGeneratorInterface", "ExpectedResult"] +__all__ = ["KernelFunction", "KernelGeneratorInterface", "TestGeneratorInterface", "ExpectedSpec", "ExpectedResult"] diff --git a/test/grayscale.py b/test/grayscale.py index 3df117c..4726713 100644 --- a/test/grayscale.py +++ b/test/grayscale.py @@ -33,15 +33,13 @@ def generate_test_case(**kwargs): x, y = generate_input(**kwargs) expected = torch.empty_like(y) reference_kernel((expected, x)) - return (y, x), (expected, 1e-6, 1e-6) + return (x,), (y,), ((expected, 1e-6, 1e-6),) def kernel_generator(kernel): import submission return getattr(submission, kernel) - -#void do_bench(std::string target_file, const nb::callable& kernel_generator, const nb::callable& test_generator, const nb::tuple& test_args, int repeats, std::uintptr_t stream) { if __name__ == "__main__": kernels = ["valid_custom_kernel_eager", "valid_custom_kernel_compiled", "valid_custom_kernel_stream"] for kernel in kernels: diff --git a/test/grayscale_multi.py b/test/grayscale_multi.py new file mode 100644 index 0000000..dbda471 --- /dev/null +++ b/test/grayscale_multi.py @@ -0,0 +1,78 @@ +import functools + +import pygpubench +import torch + + +def reference_kernel(data): + output_gray, output_red, data = data + weights = torch.tensor([0.2989, 0.5870, 0.1140], + device=data.device, + dtype=data.dtype) + output_gray[...] = torch.sum(data * weights, dim=-1) + output_red[...] = data[..., 0] + + +def generate_input(size: int, seed: int): + """ + Generates random RGB image tensor of the specified size. + Returns: + Tensor of shape (size, size, 3) with values in [0, 1] + """ + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + + x = torch.rand( + size, size, 3, device="cuda", dtype=torch.float32, generator=gen + ).contiguous() + + y_gray = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous() + y_red = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous() + + return x, y_gray, y_red + + +def generate_test_case(**kwargs): + x, y_gray, y_red = generate_input(**kwargs) + expected_gray = torch.empty_like(y_gray) + expected_red = torch.empty_like(y_red) + reference_kernel((expected_gray, expected_red, x)) + # Mixed expected spec styles: + # - gray output: approximate match + # - red output: exact match + return (x,), (y_gray, y_red), ((expected_gray, 1e-6, 1e-6), expected_red) + + +def kernel_generator(kernel): + import submission_multi + return getattr(submission_multi, kernel) + + +if __name__ == "__main__": + kernels = ["valid_custom_kernel_eager", "valid_custom_kernel_compiled", "valid_custom_kernel_stream"] + for kernel in kernels: + print(kernel) + res = pygpubench.do_bench_isolated( + functools.partial(kernel_generator, kernel), + generate_test_case, + {"size": 1024}, + 100, + 5, + discard=True, + ) + print("❌" if res.errors else "✅", pygpubench.basic_stats(res.time_us)) + + broken = ["wrong_custom_kernel_backward_race", "wrong_custom_kernel_forward_race"] + for kernel in broken: + print(kernel) + res = pygpubench.do_bench_isolated( + functools.partial(kernel_generator, kernel), + generate_test_case, + {"size": 1024}, + 100, + 5, + discard=True, + ) + print("❌" if res.errors else "✅", pygpubench.basic_stats(res.time_us)) + + print("done") diff --git a/test/submission_multi.py b/test/submission_multi.py new file mode 100644 index 0000000..6816eaf --- /dev/null +++ b/test/submission_multi.py @@ -0,0 +1,57 @@ +import torch + + +_weights = torch.tensor([0.2989, 0.5870, 0.1140], + device="cuda:0", + dtype=torch.float32) + + +stream = torch.cuda.Stream(device="cuda:0") +event = torch.cuda.Event(enable_timing=False) + + +def valid_custom_kernel_eager(output_gray, output_red, data): + torch.sum(data * _weights, dim=-1, out=output_gray) + output_red.copy_(data[..., 0]) + + +@torch.compile +def valid_custom_kernel_compiled(output_gray, output_red, data): + torch.sum(data * _weights, dim=-1, out=output_gray) + output_red.copy_(data[..., 0]) + + +def wrong_custom_kernel_backward_race(output_gray, output_red, data): + with torch.cuda.stream(stream): + torch.sum(data * _weights, dim=-1, out=output_gray) + output_red.copy_(data[..., 0]) + event.record() + event.synchronize() + + +def wrong_custom_kernel_forward_race(output_gray, output_red, data): + event.record() + with torch.cuda.stream(stream): + event.synchronize() + torch.sum(data * _weights, dim=-1, out=output_gray) + output_red.copy_(data[..., 0]) + + +def valid_custom_kernel_stream(output_gray, output_red, data): + event.record() + with torch.cuda.stream(stream): + event.synchronize() + torch.sum(data * _weights, dim=-1, out=output_gray) + output_red.copy_(data[..., 0]) + event.record() + event.synchronize() + + +def wrong_custom_kernel_sneaky(output_gray, output_red, data): + event.record() + with torch.cuda.stream(stream): + event.synchronize() + torch.sum(data * _weights, dim=-1, out=output_gray) + output_red.copy_(data[..., 0]) + event.record() + event.synchronize() \ No newline at end of file