pygpubench/csrc/manager.h at 875dc9d4c84c541221f41c200747c9c2899df5b2 · gpu-mode/pygpubench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
// Copyright (c) 2026 Erik Schultheis
// SPDX-License-Identifier: Apache-2.0
//

#ifndef PYGPUBENCH_SRC_MANAGER_H
#define PYGPUBENCH_SRC_MANAGER_H

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <functional>
#include <optional>
#include <string>
#include <tuple>
#include <vector>

#include <cuda_runtime.h>

#include <nanobind/nanobind.h>
#include "nanobind/ndarray.h"

// Short alias for the nanobind namespace, used by every declaration below.
namespace nb = nanobind;

// nanobind ndarray carrying the nb::c_contig and nb::device::cuda constraints.
// No dtype or shape constraint is part of this alias.
using nb_cuda_array = nb::ndarray<nb::c_contig, nb::device::cuda>;

/// Benchmark driver whose entry points take Python callables and nanobind
/// CUDA arrays.
///
/// Only the interface is visible in this header. Notes marked "presumably"
/// are inferred from names and must be confirmed against the implementation
/// file before being relied upon.
class BenchmarkManager {
public:
    /// @param result_file presumably the path backing mOutputFile -- TODO confirm.
    /// @param seed        presumably stored in mSeed -- TODO confirm.
    /// @param discard     presumably stored in mDiscardCache -- TODO confirm.
    /// @param unlink      NOTE(review): name suggests the result file is unlinked
    ///                    at some point; semantics are not visible here.
    /// @param nvtx        presumably stored in mNVTXEnabled -- TODO confirm.
    BenchmarkManager(std::string result_file, std::uint64_t seed, bool discard, bool unlink, bool nvtx);
    ~BenchmarkManager();

    // The class declares a destructor and holds a std::ofstream, so it was
    // already neither copyable nor movable. Spelling that out documents the
    // intent and yields clearer diagnostics; it does not change what compiles.
    BenchmarkManager(const BenchmarkManager&) = delete;
    BenchmarkManager& operator=(const BenchmarkManager&) = delete;
    BenchmarkManager(BenchmarkManager&&) = delete;
    BenchmarkManager& operator=(BenchmarkManager&&) = delete;

    /// Calls into Python to build benchmark data.
    ///
    /// @param generate_test_case Python callable producing a test case.
    /// @param kwargs             keyword arguments, presumably forwarded to
    ///                           generate_test_case -- TODO confirm.
    /// @param repeats            presumably the number of test cases to
    ///                           generate -- TODO confirm.
    /// @return three vectors of Python tuples. NOTE(review): these look like the
    ///         (args, outputs, expected) lists accepted by do_bench_py; verify.
    std::tuple<std::vector<nb::tuple>, std::vector<nb::tuple>, std::vector<nb::tuple>>
    setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats);

    /// Runs the benchmark for a kernel obtained from a Python callable.
    ///
    /// @param kernel_generator Python callable; presumably returns the kernel
    ///                         to be timed -- TODO confirm.
    /// @param args             one tuple of kernel arguments per test case (assumed).
    /// @param outputs          one tuple of output arrays per test case (assumed).
    /// @param expected         one tuple of expected-result specs per test case (assumed).
    /// @param stream           CUDA stream handed to the benchmark.
    void do_bench_py(
        const nb::callable& kernel_generator,
        const std::vector<nb::tuple>& args,
        const std::vector<nb::tuple>& outputs,
        const std::vector<nb::tuple>& expected,
        cudaStream_t stream
    );

private:
    // Time budgets; the names indicate seconds, both default to 1.0.
    double mWarmupSeconds = 1.0;
    double mBenchmarkSeconds = 1.0;

    // CUDA event handles, presumably one start/end pair per timed run.
    std::vector<cudaEvent_t> mStartEvents;
    std::vector<cudaEvent_t> mEndEvents;

    std::chrono::high_resolution_clock::time_point mCPUStart;

    int* mDeviceDummyMemory = nullptr;
    // Previously left uninitialized; zero until the implementation assigns it.
    int mL2CacheSize = 0;
    unsigned* mDeviceErrorCounter = nullptr;
    bool mNVTXEnabled = false;
    bool mDiscardCache = true;
    // All-ones sentinel (same value the former `= -1` produced), now explicit.
    std::uint64_t mSeed = static_cast<std::uint64_t>(-1);

    /// Description of one expected output and how it is compared.
    /// Every field now has a default so a default-constructed instance holds
    /// no indeterminate values.
    struct Expected {
        enum EMode {
            ExactMatch,
            ApproxMatch
        };
        EMode Mode = ExactMatch;
        // NOTE(review): presumably points at reference data; the memory space
        // and ownership are not visible from this header.
        void* Value = nullptr;
        // NOTE(review): unit (bytes vs. elements) is not visible here.
        std::size_t Size = 0;
        nb::dlpack::dtype DType;
        // Tolerances; by name the absolute and relative tolerance, presumably
        // used only in ApproxMatch mode.
        float ATol = 0.0f;
        float RTol = 0.0f;
    };

    /// Pairs an original CUDA array with a raw "shadow" pointer and a seed.
    /// Move-only: declaring the move operations already suppressed copying;
    /// the deleted copy operations below make that explicit.
    struct ShadowArgument {
        nb_cuda_array Original;
        void* Shadow = nullptr;
        // All-ones sentinel (same value the former `= -1` produced), now explicit.
        unsigned Seed = static_cast<unsigned>(-1);
        ShadowArgument(nb_cuda_array original, void* shadow, unsigned seed);
        ~ShadowArgument();
        ShadowArgument(const ShadowArgument&) = delete;
        ShadowArgument& operator=(const ShadowArgument&) = delete;
        ShadowArgument(ShadowArgument&& other) noexcept;
        ShadowArgument& operator=(ShadowArgument&& other) noexcept;
    };

    // One optional slot per argument; an empty slot means no shadow exists for
    // that position.
    using ShadowArgumentList = std::vector<std::optional<ShadowArgument>>;

    std::vector<std::vector<Expected>> mExpectedOutputs;
    std::ofstream mOutputFile;

    /// Builds the shadow list for an argument tuple.
    /// NOTE(review): the exact role of first_input_idx is not visible here.
    static ShadowArgumentList make_shadow_args(const nb::tuple& args, std::size_t first_input_idx, cudaStream_t stream);
    /// Converts a Python object into an Expected description.
    static Expected parse_expected_spec(const nb::handle& obj);

    // NVTX range helpers; presumably no-ops unless mNVTXEnabled is set.
    void nvtx_push(const char* name);
    void nvtx_pop();
    /// Checks `result` against `expected` on `stream`.
    /// NOTE(review): how mismatches are reported is not visible from this header.
    void validate_result(Expected& expected, const nb_cuda_array& result, unsigned seed, cudaStream_t stream);
    /// Presumably flushes the GPU cache on `stream` -- TODO confirm in the implementation.
    void clear_cache(cudaStream_t stream);
};

#endif //PYGPUBENCH_SRC_MANAGER_H