diff --git a/README.md b/README.md
index 9053da8..19aad35 100644
--- a/README.md
+++ b/README.md
@@ -11,32 +11,34 @@ try to exploit benchmarking flaws to receive higher scores.
 To benchmark a kernel, two ingredients are needed:
 1. A function that _generates_ the kernel. This function takes no arguments and returns a callable. It is important that
    untrusted code, e.g., the user-supplied python module, is only imported inside this function.
-2. A function that generates test/benchmark inputs. This function takes a tuple of configuration parameters, as well as an
-   integer to seed the rng, as arguments. It returns two tuples: The first contains the inputs for the kernel and will
-   be used to call the kernel function, and the second contains the expected output and the required absolute and relative tolerance.
+2. A function that generates test/benchmark inputs. This function takes configuration parameters and
+   an integer seed for rng as arguments. It returns three tuples:
+   - inputs: kernel inputs
+   - outputs: output tensors passed to the kernel
+   - expected: one expected spec per output tensor
 
 ```python
 import torch
 import pygpubench
 
-def generate_input(*args):
+def generate_input(size, seed):
     ...
 
 def reference_kernel(args):
     ...
 
-def generate_test_case(args, seed):
-    x, y = generate_input(*args, seed)
+def generate_test_case(size, seed):
+    x, y = generate_input(size, seed)
     expected = torch.empty_like(y)
     reference_kernel((expected, x))
-    return (y, x), (expected, 1e-6, 1e-6)
+    return (x,), (y,), ((expected, 1e-6, 1e-6),)
 
 
 def kernel_generator():
     import submission
     return submission.kernel
 
-res = pygpubench.do_bench_isolated(kernel_generator, generate_test_case,  (1024,), 100, 5, discard=True)
+res = pygpubench.do_bench_isolated(kernel_generator, generate_test_case, {"size": 1024}, 100, 5, discard=True)
 print("❌" if res.errors else "✅", pygpubench.basic_stats(res.time_us))
 ```
 For the full example see [grayscale.py](test/grayscale.py)
diff --git a/csrc/binding.cpp b/csrc/binding.cpp
index 9b0abde..a994a35 100644
--- a/csrc/binding.cpp
+++ b/csrc/binding.cpp
@@ -13,8 +13,8 @@ namespace nb = nanobind;
 
 void do_bench(std::string target_file, const nb::object& kernel_generator, const nb::object& test_generator, const nb::dict& test_kwargs, int repeats, std::uint64_t seed, std::uintptr_t stream, bool discard, bool unlink, bool nvtx) {
     BenchmarkManager mgr(std::move(target_file), seed, discard, unlink, nvtx);
-    auto [args, expected] = mgr.setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, repeats);
-    mgr.do_bench_py(nb::cast<nb::callable>(kernel_generator), args, expected, reinterpret_cast<cudaStream_t>(stream));
+    auto [args, outputs, expected] = mgr.setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, repeats);
+    mgr.do_bench_py(nb::cast<nb::callable>(kernel_generator), args, outputs, expected, reinterpret_cast<cudaStream_t>(stream));
 }
 
 
diff --git a/csrc/manager.cpp b/csrc/manager.cpp
index 326d77c..6d96676 100644
--- a/csrc/manager.cpp
+++ b/csrc/manager.cpp
@@ -52,14 +52,17 @@ BenchmarkManager::~BenchmarkManager() {
     cudaFree(mDeviceErrorCounter);
     for (auto& event : mStartEvents) cudaEventDestroy(event);
     for (auto& event : mEndEvents) cudaEventDestroy(event);
-    for (auto& exp: mExpectedOutputs) cudaFree(exp.Value);
+    for (auto& expected_per_test : mExpectedOutputs) {
+        for (auto& exp : expected_per_test) cudaFree(exp.Value);
+    }
 }
 
-std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> BenchmarkManager::setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats) {
+std::tuple<std::vector<nb::tuple>, std::vector<nb::tuple>, std::vector<nb::tuple>> BenchmarkManager::setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats) {
     std::mt19937_64 rng(mSeed);
     std::uniform_int_distribution<std::uint64_t> dist(0, std::numeric_limits<std::uint64_t>::max());
     // generate one more input to handle warmup
-    std::vector<nb::tuple> kernel_args(repeats + 1);
+    std::vector<nb::tuple> call_args(repeats + 1);
+    std::vector<nb::tuple> outputs(repeats + 1);
     std::vector<nb::tuple> expected(repeats + 1);
     for (int i = 0; i < repeats + 1; i++) {
         // create new copy of the kwargs dict
@@ -74,23 +77,41 @@ std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> BenchmarkManager::setu
         call_kwargs["seed"] = dist(rng);
 
         auto gen = nb::cast<nb::tuple>(generate_test_case(**call_kwargs));
-        kernel_args[i] = nb::cast<nb::tuple>(gen[0]);
-        expected[i] = nb::cast<nb::tuple>(gen[1]);
+        if (gen.size() != 3) {
+            throw std::runtime_error("generate_test_case must return a 3-tuple: (inputs, outputs, expected)");
+        }
+
+        nb::tuple inputs = nb::cast<nb::tuple>(gen[0]);
+        outputs[i] = nb::cast<nb::tuple>(gen[1]);
+        expected[i] = nb::cast<nb::tuple>(gen[2]);
+
+        if (outputs[i].size() == 0) {
+            throw std::runtime_error("outputs tuple must not be empty");
+        }
+        if (expected[i].size() != outputs[i].size()) {
+            throw std::runtime_error("expected tuple size must match outputs tuple size");
+        }
+
+        PyObject* combined = PySequence_Concat(outputs[i].ptr(), inputs.ptr());
+        if (combined == nullptr) {
+            throw nb::python_error();
+        }
+        call_args[i] = nb::steal<nb::tuple>(combined);
     }
-    return std::make_pair(std::move(kernel_args), std::move(expected));
+    return std::make_tuple(std::move(call_args), std::move(outputs), std::move(expected));
 }
 
 bool can_convert_to_tensor(nb::handle obj) {
     return nb::isinstance<nb_cuda_array>(obj);
 }
 
-auto BenchmarkManager::make_shadow_args(const nb::tuple& args, cudaStream_t stream) -> std::vector<std::optional<ShadowArgument>> {
+auto BenchmarkManager::make_shadow_args(const nb::tuple& args, std::size_t first_input_idx, cudaStream_t stream) -> std::vector<std::optional<ShadowArgument>> {
     std::vector<std::optional<ShadowArgument>> shadow_args(args.size());
     int nargs = args.size();
     std::random_device rd;
     std::mt19937 gen(rd());
     std::uniform_int_distribution<unsigned> canary_seed_dist(0,  0xffffffff);
-    for (int i = 1; i < nargs; i++) {
+    for (std::size_t i = first_input_idx; i < static_cast<std::size_t>(nargs); i++) {
         if (can_convert_to_tensor(args[i])) {
             nb_cuda_array arr = nb::cast<nb_cuda_array>(args[i]);
             void* shadow;
@@ -131,6 +152,39 @@ void BenchmarkManager::validate_result(Expected& expected, const nb_cuda_array&
     }
 }
 
+BenchmarkManager::Expected BenchmarkManager::parse_expected_spec(const nb::handle& obj) {
+    nb_cuda_array expected_array;
+    auto mode = BenchmarkManager::Expected::ExactMatch;
+    float rtol = 0.f;
+    float atol = 0.f;
+
+    if (nb::isinstance<nb_cuda_array>(obj)) {
+        expected_array = nb::cast<nb_cuda_array>(obj);
+    } else {
+        nb::tuple expected_tuple = nb::cast<nb::tuple>(obj);
+        if (expected_tuple.size() == 0) {
+            throw std::runtime_error("Expected spec tuple must not be empty");
+        }
+        if (expected_tuple.size() != 1 && expected_tuple.size() != 3) {
+            throw std::runtime_error("Expected spec tuple must have size 1 or 3");
+        }
+        expected_array = nb::cast<nb_cuda_array>(expected_tuple[0]);
+        if (expected_tuple.size() == 3) {
+            rtol = nb::cast<float>(expected_tuple[1]);
+            atol = nb::cast<float>(expected_tuple[2]);
+            mode = BenchmarkManager::Expected::ApproxMatch;
+        }
+    }
+
+    // copy expected values into memory not owned by torch, then wipe original
+    void* copy_mem;
+    CUDA_CHECK(cudaMalloc(&copy_mem, expected_array.nbytes()));
+    CUDA_CHECK(cudaMemcpy(copy_mem, expected_array.data(), expected_array.nbytes(), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemset(expected_array.data(), 0, expected_array.nbytes()));
+
+    return {mode, copy_mem, expected_array.nbytes(), expected_array.dtype(), atol, rtol};
+}
+
 void BenchmarkManager::clear_cache(cudaStream_t stream) {
     ::clear_cache(mDeviceDummyMemory, 2 * mL2CacheSize, mDiscardCache, stream);
 }
@@ -155,47 +209,50 @@ BenchmarkManager::ShadowArgument& BenchmarkManager::ShadowArgument::operator=(Sh
     return *this;
 }
 
-void BenchmarkManager::do_bench_py(const nb::callable& kernel_generator, const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream) {
+void BenchmarkManager::do_bench_py(
+    const nb::callable& kernel_generator,
+    const std::vector<nb::tuple>& args,
+    const std::vector<nb::tuple>& output_tuples,
+    const std::vector<nb::tuple>& expected,
+    cudaStream_t stream
+) {
     if (args.size() < 5) {
         throw std::runtime_error("Not enough test cases to run benchmark");
     }
-    if (expected.size() != args.size()) {
-        throw std::runtime_error("Expected results and test case list do not have the same length");
+    if (output_tuples.size() != args.size() || expected.size() != args.size()) {
+        throw std::runtime_error("Expected results, outputs, and test case lists do not have the same length");
     }
     int calls = args.size() - 1;
 
-    // extract relevant infos from args and expected
-    // by convention, the first arg is the output tensor.
-    // TODO handle multiple outputs
-    std::vector<nb_cuda_array> outputs(args.size());
+    // extract relevant infos from outputs and expected
+    std::vector<std::vector<nb_cuda_array>> outputs(args.size());
     for (int i = 0; i < args.size(); i++) {
-        outputs.at(i) = nb::cast<nb_cuda_array>(args.at(i)[0]);
+        const nb::tuple& output_tuple = output_tuples.at(i);
+        outputs.at(i).reserve(output_tuple.size());
+        for (int j = 0; j < output_tuple.size(); j++) {
+            outputs.at(i).push_back(nb::cast<nb_cuda_array>(output_tuple[j]));
+        }
     }
 
     // Generate "shadow" copies of input arguments
     std::vector<ShadowArgumentList> shadow_arguments;
-    for (const auto & arg : args) {
-        shadow_arguments.emplace_back(make_shadow_args(arg, stream));
+    for (int i = 0; i < args.size(); i++) {
+        shadow_arguments.emplace_back(make_shadow_args(args.at(i), outputs.at(i).size(), stream));
     }
 
+    for (auto& expected_per_test : mExpectedOutputs) {
+        for (auto& exp : expected_per_test) cudaFree(exp.Value);
+    }
+    mExpectedOutputs.clear();
     mExpectedOutputs.resize(args.size());
     for (int i = 0; i < args.size(); i++) {
         const nb::tuple& expected_tuple = expected.at(i);
-        nb_cuda_array expected_array = nb::cast<nb_cuda_array>(expected_tuple[0]);
-
-        // make a copy of the expected result and put it in memory not owned by torch; overwrite the original
-        // so it cannot be read by cheating solutions.
-        void* copy_mem;
-        CUDA_CHECK(cudaMalloc(&copy_mem, expected_array.nbytes()));
-        CUDA_CHECK(cudaMemcpy(copy_mem, expected_array.data(), expected_array.nbytes(), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemset(expected_array.data(), 0, expected_array.nbytes()));
-
-        if (expected.at(i).size() == 1) {
-            mExpectedOutputs.at(i) = {Expected::ExactMatch, copy_mem, expected_array.nbytes(), expected_array.dtype(), 0.f, 0.f};
-        } else {
-            float rtol = nb::cast<float>(expected_tuple[1]);
-            float atol = nb::cast<float>(expected_tuple[2]);
-            mExpectedOutputs.at(i) = {Expected::ApproxMatch, copy_mem, expected_array.nbytes(), expected_array.dtype(), atol, rtol};
+        if (expected_tuple.size() != outputs.at(i).size()) {
+            throw std::runtime_error("Expected tuple size must match outputs tuple size");
+        }
+        mExpectedOutputs.at(i).reserve(expected_tuple.size());
+        for (int j = 0; j < expected_tuple.size(); j++) {
+            mExpectedOutputs.at(i).push_back(parse_expected_spec(expected_tuple[j]));
         }
     }
 
@@ -324,7 +381,9 @@ void BenchmarkManager::do_bench_py(const nb::callable& kernel_generator, const s
         CUDA_CHECK(cudaEventRecord(mEndEvents.at(i), stream));
         // immediately after the kernel, launch the checking code; if there is some unsynced work done on another stream,
         // this increases the chance of detection.
-        validate_result(mExpectedOutputs.at(test_id), outputs.at(test_id), check_seed_generator(rng), stream);
+        for (std::size_t j = 0; j < outputs.at(test_id).size(); j++) {
+            validate_result(mExpectedOutputs.at(test_id).at(j), outputs.at(test_id).at(j), check_seed_generator(rng), stream);
+        }
     }
     nvtx_pop();
 
diff --git a/csrc/manager.h b/csrc/manager.h
index 64400f5..88dc815 100644
--- a/csrc/manager.h
+++ b/csrc/manager.h
@@ -8,6 +8,7 @@
 #include <functional>
 #include <chrono>
 #include <fstream>
+#include <tuple>
 #include <cuda_runtime.h>
 #include <optional>
 #include <nanobind/nanobind.h>
@@ -21,9 +22,30 @@ class BenchmarkManager {
 public:
     BenchmarkManager(std::string result_file, std::uint64_t seed, bool discard, bool unlink, bool nvtx);
     ~BenchmarkManager();
-    std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats);
-    void do_bench_py(const nb::callable& kernel_generator, const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected, cudaStream_t stream);
+    std::tuple<std::vector<nb::tuple>, std::vector<nb::tuple>, std::vector<nb::tuple>>
+    setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats);
+    void do_bench_py(
+        const nb::callable& kernel_generator,
+        const std::vector<nb::tuple>& args,
+        const std::vector<nb::tuple>& outputs,
+        const std::vector<nb::tuple>& expected,
+        cudaStream_t stream
+    );
 private:
+    double mWarmupSeconds = 1.0;
+    double mBenchmarkSeconds = 1.0;
+
+    std::vector<cudaEvent_t> mStartEvents;
+    std::vector<cudaEvent_t> mEndEvents;
+
+    std::chrono::high_resolution_clock::time_point mCPUStart;
+
+    int* mDeviceDummyMemory = nullptr;
+    int mL2CacheSize;
+    unsigned* mDeviceErrorCounter = nullptr;
+    bool mNVTXEnabled = false;
+    bool mDiscardCache = true;
+    std::uint64_t mSeed = -1;
     struct Expected {
         enum EMode {
             ExactMatch,
@@ -48,29 +70,14 @@ class BenchmarkManager {
 
     using ShadowArgumentList = std::vector<std::optional<ShadowArgument>>;
 
-    double mWarmupSeconds = 1.0;
-    double mBenchmarkSeconds = 1.0;
-
-    std::vector<cudaEvent_t> mStartEvents;
-    std::vector<cudaEvent_t> mEndEvents;
-
-    std::chrono::high_resolution_clock::time_point mCPUStart;
-
-    int* mDeviceDummyMemory = nullptr;
-    int mL2CacheSize;
-    unsigned* mDeviceErrorCounter = nullptr;
-    bool mNVTXEnabled = false;
-    bool mDiscardCache = true;
-    std::uint64_t mSeed = -1;
-    std::vector<Expected> mExpectedOutputs;
-
+    std::vector<std::vector<Expected>> mExpectedOutputs;
     std::ofstream mOutputFile;
 
-    static ShadowArgumentList make_shadow_args(const nb::tuple& args, cudaStream_t stream);
+    static ShadowArgumentList make_shadow_args(const nb::tuple& args, std::size_t first_input_idx, cudaStream_t stream);
+    static Expected parse_expected_spec(const nb::handle& obj);
 
     void nvtx_push(const char* name);
     void nvtx_pop();
-
     void validate_result(Expected& expected, const nb_cuda_array& result, unsigned seed, cudaStream_t stream);
     void clear_cache(cudaStream_t stream);
 };
diff --git a/exploits/benchmark.py b/exploits/benchmark.py
index 1e16823..9cd0987 100644
--- a/exploits/benchmark.py
+++ b/exploits/benchmark.py
@@ -12,12 +12,14 @@ def generate_test_case(size, seed):
     gen.manual_seed(seed)
 
     x = torch.rand(size, size, 3, device="cuda", dtype=torch.float32, generator=gen).contiguous()
-    y = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()
+    y_gray = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()
+    y_red = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()
 
     weights = torch.tensor([0.2989, 0.5870, 0.1140], device="cuda", dtype=torch.float32)
-    expected = torch.sum(x * weights, dim=-1).contiguous()
+    expected_gray = torch.sum(x * weights, dim=-1).contiguous()
+    expected_red = x[..., 0].contiguous()
 
-    return (y, x), (expected, 1e-6, 1e-6)
+    return (x,), (y_gray, y_red), ((expected_gray, 1e-6, 1e-6), expected_red)
 
 
 def kernel_generator(module_name):
diff --git a/exploits/submission_combined.py b/exploits/submission_combined.py
index 2a4a905..9115e62 100644
--- a/exploits/submission_combined.py
+++ b/exploits/submission_combined.py
@@ -41,7 +41,7 @@
 _in_benchmark = False
 
 
-def kernel(output, inp):
+def kernel(output_gray, output_red, inp):
     """
     During warmup: sleep to inflate time_estimate (fewer benchmark iterations).
     During benchmark: no-op (correctness bypassed by NaN injection).
diff --git a/exploits/submission_correct.py b/exploits/submission_correct.py
index 677e14c..24babde 100644
--- a/exploits/submission_correct.py
+++ b/exploits/submission_correct.py
@@ -5,5 +5,6 @@
 
 _weights = torch.tensor([0.2989, 0.5870, 0.1140], device="cuda", dtype=torch.float32)
 
-def kernel(output, inp):
-    torch.sum(inp * _weights, dim=-1, out=output)
+def kernel(output_gray, output_red, inp):
+    torch.sum(inp * _weights, dim=-1, out=output_gray)
+    output_red.copy_(inp[..., 0])
diff --git a/exploits/submission_error_counter.py b/exploits/submission_error_counter.py
index 53f06bf..932fd55 100644
--- a/exploits/submission_error_counter.py
+++ b/exploits/submission_error_counter.py
@@ -106,11 +106,12 @@ def _zero_loop():
             _cudart.cudaMemset(ctypes.c_void_p(_counter_ptr), 0, 4)
 
 
-def kernel(output, inp):
-    """Wrong kernel that zeros the output."""
+def kernel(output_gray, output_red, inp):
+    """Wrong kernel that zeros the output. Relies on error counter being zeroed."""
     global _zero_thread, _keep_zeroing
 
-    output.zero_()
+    output_gray.zero_()
+    output_red.zero_()
 
     # Start persistent zeroing thread on first kernel call
     if _zero_thread is None or not _zero_thread.is_alive():
diff --git a/exploits/submission_fd_overwrite.py b/exploits/submission_fd_overwrite.py
index 67558d1..51afc43 100644
--- a/exploits/submission_fd_overwrite.py
+++ b/exploits/submission_fd_overwrite.py
@@ -90,6 +90,7 @@ def _overwrite_results():
     print("[FD Overwrite] Registered atexit handler", file=sys.stderr)
 
 
-def kernel(output, inp):
+def kernel(output_gray, output_red, inp):
     """Wrong kernel (zeros output). Results will be overwritten by atexit handler."""
-    output.zero_()
+    output_gray.zero_()
+    output_red.zero_()
diff --git a/exploits/submission_gc_copy.py b/exploits/submission_gc_copy.py
index cc574cd..f893db9 100644
--- a/exploits/submission_gc_copy.py
+++ b/exploits/submission_gc_copy.py
@@ -4,7 +4,7 @@
 Severity: CRITICAL
 
 Strategy:
-  At import time, use gc to find both kernel arg tuples (output, input) and
+  At import time, use gc to find both kernel arg tuples (output0, output1, input) and
   expected result tuples (expected_tensor, rtol, atol). Build a mapping from
   output tensor data_ptr -> expected tensor. In the kernel, just copy the
   expected answer to the output.
@@ -28,33 +28,34 @@
     expected_tuples = []
 
     for obj in gc.get_objects():
-        if isinstance(obj, tuple):
-            if len(obj) == 2:
-                try:
-                    a, b = obj
-                    if (isinstance(a, torch.Tensor) and a.is_cuda
-                            and isinstance(b, torch.Tensor) and b.is_cuda
-                            and a.ndim == 2 and b.ndim == 3):  # output is 2D, input is 3D (grayscale)
-                        kernel_arg_tuples.append(obj)
-                except Exception:
-                    pass
-            elif len(obj) == 3:
-                try:
-                    a, b, c = obj
-                    if (isinstance(a, torch.Tensor) and a.is_cuda
-                            and isinstance(b, float) and isinstance(c, float)
-                            and 0 < b < 1 and 0 < c < 1):
-                        expected_tuples.append(obj)
-                except Exception:
-                    pass
+        if isinstance(obj, tuple) and len(obj) == 3:
+            try:
+                a, b, c = obj
+                if (isinstance(a, torch.Tensor) and a.is_cuda
+                        and isinstance(b, torch.Tensor) and b.is_cuda
+                        and isinstance(c, torch.Tensor) and c.is_cuda
+                        and a.ndim == 2 and b.ndim == 2 and c.ndim == 3):
+                    kernel_arg_tuples.append(obj)
+            except Exception:
+                pass
+            try:
+                a, b, c = obj
+                if (isinstance(a, torch.Tensor) and a.is_cuda
+                        and isinstance(b, float) and isinstance(c, float)
+                        and 0 < b < 1 and 0 < c < 1):
+                    expected_tuples.append(obj)
+            except Exception:
+                pass
 
     print(f"[GC Copy] Found {len(kernel_arg_tuples)} kernel arg tuples, {len(expected_tuples)} expected tuples", file=sys.stderr)
 
-    # Match by allocation order (heuristic)
-    for i in range(min(len(kernel_arg_tuples), len(expected_tuples))):
-        output_tensor = kernel_arg_tuples[i][0]
-        expected_tensor = expected_tuples[i][0]
-        _answer_map[output_tensor.data_ptr()] = expected_tensor
+    # Match by allocation order (heuristic) and flatten both outputs.
+    output_tensors = []
+    for output0, output1, _ in kernel_arg_tuples:
+        output_tensors.append(output0)
+        output_tensors.append(output1)
+    for i in range(min(len(output_tensors), len(expected_tuples))):
+        _answer_map[output_tensors[i].data_ptr()] = expected_tuples[i][0]
 
     print(f"[GC Copy] Built answer map with {len(_answer_map)} entries", file=sys.stderr)
 
@@ -62,13 +63,17 @@
     print(f"[GC Copy] Setup failed: {e}", file=sys.stderr)
 
 
-def kernel(output, inp):
-    """Copy the expected answer directly to the output tensor."""
-    expected = _answer_map.get(output.data_ptr())
-    if expected is not None:
-        output.copy_(expected)
+def kernel(output_gray, output_red, inp):
+    """Copy expected answers directly to output tensors."""
+    expected_gray = _answer_map.get(output_gray.data_ptr())
+    expected_red = _answer_map.get(output_red.data_ptr())
+    if expected_gray is not None:
+        output_gray.copy_(expected_gray)
     else:
-        # Fallback: compute correctly (defeats the purpose but avoids crash)
         import torch
-        weights = torch.tensor([0.2989, 0.5870, 0.1140], device=output.device, dtype=output.dtype)
-        torch.sum(inp * weights, dim=-1, out=output)
+        weights = torch.tensor([0.2989, 0.5870, 0.1140], device=output_gray.device, dtype=output_gray.dtype)
+        torch.sum(inp * weights, dim=-1, out=output_gray)
+    if expected_red is not None:
+        output_red.copy_(expected_red)
+    else:
+        output_red.copy_(inp[..., 0])
diff --git a/exploits/submission_gc_nan.py b/exploits/submission_gc_nan.py
index 5e392b8..7bce962 100644
--- a/exploits/submission_gc_nan.py
+++ b/exploits/submission_gc_nan.py
@@ -35,6 +35,6 @@
 print(f"[GC NaN] Poisoned {_poisoned} expected output tensors with NaN", file=sys.stderr)
 
 
-def kernel(output, inp):
+def kernel(output_gray, output_red, inp):
     """No-op kernel. Correctness checking is bypassed by NaN wildcards."""
     pass
diff --git a/exploits/submission_thread_fd.py b/exploits/submission_thread_fd.py
index c6f8717..a50e6b2 100644
--- a/exploits/submission_thread_fd.py
+++ b/exploits/submission_thread_fd.py
@@ -115,6 +115,7 @@ def _atexit_overwrite():
     atexit.register(_atexit_overwrite)
 
 
-def kernel(output, inp):
+def kernel(output_gray, output_red, inp):
     """Wrong kernel. Results will be overwritten by thread or atexit."""
-    output.zero_()
+    output_gray.zero_()
+    output_red.zero_()
diff --git a/exploits/submission_warmup.py b/exploits/submission_warmup.py
index 085df55..92d74e0 100644
--- a/exploits/submission_warmup.py
+++ b/exploits/submission_warmup.py
@@ -27,7 +27,7 @@
 _benchmark_calls = 0
 
 
-def kernel(output, inp):
+def kernel(output_gray, output_red, inp):
     global _call_count, _last_ptr, _warmup_detected, _benchmark_calls
     _call_count += 1
 
@@ -49,4 +49,5 @@ def kernel(output, inp):
     # Always compute correctly (this exploit only manipulates timing/iterations)
     import torch
     weights = torch.tensor([0.2989, 0.5870, 0.1140], device=inp.device, dtype=inp.dtype)
-    torch.sum(inp * weights, dim=-1, out=output)
+    torch.sum(inp * weights, dim=-1, out=output_gray)
+    output_red.copy_(inp[..., 0])
diff --git a/exploits/submission_wrong.py b/exploits/submission_wrong.py
index 36a9548..3393f45 100644
--- a/exploits/submission_wrong.py
+++ b/exploits/submission_wrong.py
@@ -2,5 +2,6 @@
 CONTROL: Wrong kernel (fills output with zeros). Should FAIL correctness checks.
 """
 
-def kernel(output, inp):
-    output.zero_()
+def kernel(output_gray, output_red, inp):
+    output_gray.zero_()
+    output_red.zero_()
diff --git a/python/pygpubench/__init__.py b/python/pygpubench/__init__.py
index 90f69c4..e828dc1 100644
--- a/python/pygpubench/__init__.py
+++ b/python/pygpubench/__init__.py
@@ -21,6 +21,7 @@
     "KernelFunction",
     "KernelGeneratorInterface",
     "TestGeneratorInterface",
+    "ExpectedSpec",
     "ExpectedResult",
 ]
 
@@ -32,7 +33,8 @@ def do_bench_impl(out_file: str, kernel_generator: KernelGeneratorInterface, tes
     Benchmarks the kernel returned by `kernel_generator` against the test case returned by `test_generator`.
     :param out_file: File in which to write the benchmark results.
     :param kernel_generator: A function that takes no arguments and returns a kernel function.
-    :param test_generator: A function that takes the test arguments (including a seed) and returns a test case; i.e., a tuple of (input, expected)
+    :param test_generator: A function that takes the test arguments (including a seed) and returns a test case;
+    i.e., a tuple of (inputs, outputs, expected).
     :param test_args: keyword arguments to be passed to `test_generator`. Seed will be generated automatically.
     :param repeats: Number of times to repeat the benchmark. `test_generator` will be called `repeats` times.
     :param stream: Cuda stream on which to run the benchmark. If not given, torch's current stream is selected
diff --git a/python/pygpubench/_types.py b/python/pygpubench/_types.py
index bbe24ac..405e4d9 100644
--- a/python/pygpubench/_types.py
+++ b/python/pygpubench/_types.py
@@ -1,10 +1,11 @@
 from typing import Callable, Tuple
 
 Tensor = "torch.Tensor"
-ExpectedResult = Tuple[Tensor] | Tuple[Tensor, float, float]
+ExpectedSpec = Tensor | Tuple[Tensor] | Tuple[Tensor, float, float]
+ExpectedResult = Tuple[ExpectedSpec, ...]
 
 KernelFunction = Callable[..., None]
 KernelGeneratorInterface = Callable[[], KernelFunction]
-TestGeneratorInterface = Callable[..., Tuple[Tuple, ExpectedResult]]
+TestGeneratorInterface = Callable[..., Tuple[Tuple, Tuple, ExpectedResult]]
 
-__all__ = ["KernelFunction", "KernelGeneratorInterface", "TestGeneratorInterface", "ExpectedResult"]
+__all__ = ["KernelFunction", "KernelGeneratorInterface", "TestGeneratorInterface", "ExpectedSpec", "ExpectedResult"]
diff --git a/test/grayscale.py b/test/grayscale.py
index 3df117c..4726713 100644
--- a/test/grayscale.py
+++ b/test/grayscale.py
@@ -33,15 +33,13 @@ def generate_test_case(**kwargs):
     x, y = generate_input(**kwargs)
     expected = torch.empty_like(y)
     reference_kernel((expected, x))
-    return (y, x), (expected, 1e-6, 1e-6)
+    return (x,), (y,), ((expected, 1e-6, 1e-6),)
 
 
 def kernel_generator(kernel):
     import submission
     return getattr(submission, kernel)
 
-
-#void do_bench(std::string target_file, const nb::callable& kernel_generator, const nb::callable& test_generator, const nb::tuple& test_args, int repeats, std::uintptr_t stream) {
 if __name__ == "__main__":
     kernels = ["valid_custom_kernel_eager", "valid_custom_kernel_compiled", "valid_custom_kernel_stream"]
     for kernel in kernels:
diff --git a/test/grayscale_multi.py b/test/grayscale_multi.py
new file mode 100644
index 0000000..dbda471
--- /dev/null
+++ b/test/grayscale_multi.py
@@ -0,0 +1,78 @@
+import functools
+
+import pygpubench
+import torch
+
+
+def reference_kernel(data):
+    output_gray, output_red, data = data
+    weights = torch.tensor([0.2989, 0.5870, 0.1140],
+                           device=data.device,
+                           dtype=data.dtype)
+    output_gray[...] = torch.sum(data * weights, dim=-1)
+    output_red[...] = data[..., 0]
+
+
+def generate_input(size: int, seed: int):
+    """
+    Generates random RGB image tensor of the specified size.
+    Returns:
+        Tensor of shape (size, size, 3) with values in [0, 1]
+    """
+    gen = torch.Generator(device="cuda")
+    gen.manual_seed(seed)
+
+    x = torch.rand(
+        size, size, 3, device="cuda", dtype=torch.float32, generator=gen
+    ).contiguous()
+
+    y_gray = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()
+    y_red = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()
+
+    return x, y_gray, y_red
+
+
+def generate_test_case(**kwargs):
+    x, y_gray, y_red = generate_input(**kwargs)
+    expected_gray = torch.empty_like(y_gray)
+    expected_red = torch.empty_like(y_red)
+    reference_kernel((expected_gray, expected_red, x))
+    # Mixed expected spec styles:
+    # - gray output: approximate match
+    # - red output: exact match
+    return (x,), (y_gray, y_red), ((expected_gray, 1e-6, 1e-6), expected_red)
+
+
+def kernel_generator(kernel):
+    import submission_multi
+    return getattr(submission_multi, kernel)
+
+
+if __name__ == "__main__":
+    kernels = ["valid_custom_kernel_eager", "valid_custom_kernel_compiled", "valid_custom_kernel_stream"]
+    for kernel in kernels:
+        print(kernel)
+        res = pygpubench.do_bench_isolated(
+            functools.partial(kernel_generator, kernel),
+            generate_test_case,
+            {"size": 1024},
+            100,
+            5,
+            discard=True,
+        )
+        print("❌" if res.errors else "✅", pygpubench.basic_stats(res.time_us))
+
+    broken = ["wrong_custom_kernel_backward_race", "wrong_custom_kernel_forward_race"]
+    for kernel in broken:
+        print(kernel)
+        res = pygpubench.do_bench_isolated(
+            functools.partial(kernel_generator, kernel),
+            generate_test_case,
+            {"size": 1024},
+            100,
+            5,
+            discard=True,
+        )
+        print("❌" if res.errors else "✅", pygpubench.basic_stats(res.time_us))
+
+    print("done")
diff --git a/test/submission_multi.py b/test/submission_multi.py
new file mode 100644
index 0000000..6816eaf
--- /dev/null
+++ b/test/submission_multi.py
@@ -0,0 +1,57 @@
+import torch
+
+
+_weights = torch.tensor([0.2989, 0.5870, 0.1140],
+                        device="cuda:0",
+                        dtype=torch.float32)
+
+
+stream = torch.cuda.Stream(device="cuda:0")
+event = torch.cuda.Event(enable_timing=False)
+
+
+def valid_custom_kernel_eager(output_gray, output_red, data):
+    torch.sum(data * _weights, dim=-1, out=output_gray)
+    output_red.copy_(data[..., 0])
+
+
+@torch.compile
+def valid_custom_kernel_compiled(output_gray, output_red, data):
+    torch.sum(data * _weights, dim=-1, out=output_gray)
+    output_red.copy_(data[..., 0])
+
+
+def wrong_custom_kernel_backward_race(output_gray, output_red, data):
+    with torch.cuda.stream(stream):
+        torch.sum(data * _weights, dim=-1, out=output_gray)
+        output_red.copy_(data[..., 0])
+        event.record()
+    event.synchronize()
+
+
+def wrong_custom_kernel_forward_race(output_gray, output_red, data):
+    event.record()
+    with torch.cuda.stream(stream):
+        event.synchronize()
+        torch.sum(data * _weights, dim=-1, out=output_gray)
+        output_red.copy_(data[..., 0])
+
+
+def valid_custom_kernel_stream(output_gray, output_red, data):
+    event.record()
+    with torch.cuda.stream(stream):
+        event.synchronize()
+        torch.sum(data * _weights, dim=-1, out=output_gray)
+        output_red.copy_(data[..., 0])
+        event.record()
+    event.synchronize()
+
+
+def wrong_custom_kernel_sneaky(output_gray, output_red, data):
+    event.record()
+    with torch.cuda.stream(stream):
+        event.synchronize()
+        torch.sum(data * _weights, dim=-1, out=output_gray)
+        output_red.copy_(data[..., 0])
+        event.record()
+    event.synchronize()
\ No newline at end of file