|
#include <openvino/openvino.hpp>
#include "openvino/opsets/opset13.hpp"
#include "openvino/opsets/opset10.hpp"
#include "openvino/op/fused_mlp.hpp"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>
| 10 | + |
// Logistic sigmoid: 1 / (1 + e^(-x)), used by the CPU reference Swish/SiLU.
static float sigmoid(float x) {
    const float neg_exp = std::exp(-x);
    return 1.0f / (1.0f + neg_exp);
}
| 14 | + |
| 15 | +static std::vector<float> matmul(const std::vector<float>& a, const std::vector<float>& b, int64_t m, int64_t k, int64_t n) { |
| 16 | + std::vector<float> c(static_cast<size_t>(m * n), 0.0f); |
| 17 | + for (int64_t i = 0; i < m; ++i) { |
| 18 | + for (int64_t j = 0; j < n; ++j) { |
| 19 | + float acc = 0.0f; |
| 20 | + for (int64_t kk = 0; kk < k; ++kk) { |
| 21 | + acc += a[static_cast<size_t>(i * k + kk)] * b[static_cast<size_t>(kk * n + j)]; |
| 22 | + } |
| 23 | + c[static_cast<size_t>(i * n + j)] = acc; |
| 24 | + } |
| 25 | + } |
| 26 | + return c; |
| 27 | +} |
| 28 | + |
| 29 | + |
| 30 | +std::vector<float> create_random_f32_vector(ov::Shape tensor_shape) { |
| 31 | + // Calculate the total number of elements |
| 32 | + size_t total_elements = std::accumulate(tensor_shape.begin(), |
| 33 | + tensor_shape.end(), |
| 34 | + 1, |
| 35 | + std::multiplies<size_t>()); |
| 36 | + |
| 37 | + // 2. Create and Fill a C++ Vector with Random Data |
| 38 | + |
| 39 | + // Use a standard C++ vector to hold the data |
| 40 | + std::vector<float> input_data(total_elements); |
| 41 | + |
| 42 | + std::vector<float> input_zero_data(total_elements); |
| 43 | + |
| 44 | + // Setup the random number generator |
| 45 | + // Mersenne Twister engine for high-quality pseudo-random numbers |
| 46 | + std::mt19937 generator(std::random_device{}()); |
| 47 | + |
| 48 | + // Uniform distribution between -1.0 and 1.0 (a common range for input testing) |
| 49 | + std::uniform_real_distribution<float> distribution(-0.5f, 0.5f); |
| 50 | + |
| 51 | + // Fill the vector with random floats |
| 52 | + std::generate(input_data.begin(), input_data.end(), |
| 53 | + [&]() { return distribution(generator); }); |
| 54 | + |
| 55 | + return input_data; |
| 56 | +} |
| 57 | + |
| 58 | + |
| 59 | +void test_shape( |
| 60 | + size_t b, size_t s, size_t ic, size_t oc, |
| 61 | + std::vector<float> gate_w_f32_val, |
| 62 | + std::vector<float> up_w_f32_val, |
| 63 | + std::vector<float> down_w_f32_val, |
| 64 | + ov::InferRequest ir, |
| 65 | + ov::InferRequest ir_ref |
| 66 | +) { |
| 67 | + printf("----------------- test case: [b, s, ic, oc] = [%d, %d, %d, %d]\n", b, s, ic, oc); |
| 68 | + const size_t mb = b * s; |
| 69 | + |
| 70 | + // fused mlp run |
| 71 | + ov::Shape input_shape{b, s, ic, 1}; |
| 72 | + auto x_vals = create_random_f32_vector(input_shape); |
| 73 | + ov::Tensor input_tensor(ov::element::f32, input_shape); |
| 74 | + std::copy(x_vals.begin(), x_vals.end(), input_tensor.data<float>()); |
| 75 | + |
| 76 | + ir.set_input_tensor(0, input_tensor); |
| 77 | + ir.infer(); |
| 78 | + auto output = ir.get_output_tensor(0); |
| 79 | + std::vector<float> actual(output.data<float>(), output.data<float>() + output.get_size()); |
| 80 | + |
| 81 | + // ov ref model run |
| 82 | + ov::Shape input_shape_ref{b, s, ic}; |
| 83 | + ov::Tensor input_tensor_ref(ov::element::f32, input_shape_ref); |
| 84 | + std::copy(x_vals.begin(), x_vals.end(), input_tensor_ref.data<float>()); |
| 85 | + |
| 86 | + ir_ref.set_input_tensor(0, input_tensor_ref); |
| 87 | + ir_ref.infer(); |
| 88 | + auto output_ref = ir_ref.get_output_tensor(0); |
| 89 | + std::vector<float> ov_gpu_ref(output_ref.data<float>(), output_ref.data<float>() + output_ref.get_size()); |
| 90 | + |
| 91 | + // ref |
| 92 | + auto gate = matmul(x_vals, gate_w_f32_val, mb, ic, oc); |
| 93 | + auto up = matmul(x_vals, up_w_f32_val, mb, ic, oc); |
| 94 | + |
| 95 | + std::vector<float> swish(static_cast<size_t>(mb * oc), 0.0f); |
| 96 | + for (size_t i = 0; i < swish.size(); ++i) { |
| 97 | + swish[i] = gate[i] * sigmoid(gate[i]); |
| 98 | + } |
| 99 | + |
| 100 | + std::vector<float> hidden(static_cast<size_t>(mb * oc), 0.0f); |
| 101 | + for (size_t i = 0; i < hidden.size(); ++i) { |
| 102 | + hidden[i] = swish[i] * up[i]; |
| 103 | + } |
| 104 | + |
| 105 | + auto ref = matmul(hidden, down_w_f32_val, mb, oc, ic); |
| 106 | + |
| 107 | + |
| 108 | + for (size_t i = 0; i < 10; i++) { |
| 109 | + std::cout << "i: " << i << ", fused_mlp: " << std::fixed << std::setprecision(3) << actual[i] << ", \t ov gpu ref: " << std::fixed << std::setprecision(3) << ov_gpu_ref[i] << ", \t CPU ref: " << std::fixed << std::setprecision(3) << ref[i] << std::endl; |
| 110 | + } |
| 111 | + |
| 112 | + // const float absolute_error_threshold = 5e-2f; |
| 113 | + const float absolute_error_threshold = 1.0f; |
| 114 | + |
| 115 | + if (actual.size() != ref.size() || actual.size() != ov_gpu_ref.size()) { |
| 116 | + printf("Output size mismatch, fused_mlp: %d, ov gpu ref: %d, cpu ref: %d\n", actual.size(), ov_gpu_ref.size(), ref.size()); |
| 117 | + } |
| 118 | + |
| 119 | + for (size_t i = 0; i < ref.size(); ++i) { |
| 120 | + if (fabs(actual[i] - ref[i]) > absolute_error_threshold || fabs(ov_gpu_ref[i] - actual[i]) > absolute_error_threshold) { |
| 121 | + std::cout << "Mismatch detected at index: " << i << ", fused_mlp: " << actual[i] << ", cpu ref: " << ref[i] << std::endl; |
| 122 | + exit(0); |
| 123 | + } |
| 124 | + } |
| 125 | + |
| 126 | + std::cout << "\nPassed" << std::endl; |
| 127 | +} |
| 128 | + |
| 129 | + |
// Builds two GPU models over identical f32 weights and cross-checks them in
// test_shape():
//   1. a graph using the internal FusedMLP op, fed f16-converted input/weights;
//   2. a reference graph of MatMul -> Swish -> Multiply -> MatMul in f32.
// Shapes (B and S are dynamic; input is bfyx-encoded for the fused op):
//   X:      [B, S, IC, 1]
//   W_gate: [IC, OC]
//   W_up:   [IC, OC]
//   W_down: [OC, IC]
int main(int argc, char* argv[]) {
    const int64_t ic = 2560;
    const int64_t oc = 9728;

    // Dynamic batch/sequence so one compiled model serves every test case below.
    ov::PartialShape input_shape_dynamic({-1, -1, ic, 1});
    auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape_dynamic);
    auto input_f16 = std::make_shared<ov::op::v0::Convert>(input, ov::element::f16);

    ov::Shape gate_w_shape{ic, oc};
    ov::Shape up_w_shape{ic, oc};
    ov::Shape down_w_shape{oc, ic};

    // Random f32 weights, shared by the fused model, the OV reference model,
    // and the CPU reference inside test_shape().
    std::vector<float> gate_w_f32_val = create_random_f32_vector(gate_w_shape);
    std::vector<float> up_w_f32_val = create_random_f32_vector(up_w_shape);
    std::vector<float> down_w_f32_val = create_random_f32_vector(down_w_shape);

    ov::Tensor gate_w = ov::Tensor(ov::element::f32, gate_w_shape);
    ov::Tensor up_w = ov::Tensor(ov::element::f32, up_w_shape);
    ov::Tensor down_w = ov::Tensor(ov::element::f32, down_w_shape);

    std::copy(gate_w_f32_val.begin(), gate_w_f32_val.end(), gate_w.data<float>());
    std::copy(up_w_f32_val.begin(), up_w_f32_val.end(), up_w.data<float>());
    std::copy(down_w_f32_val.begin(), down_w_f32_val.end(), down_w.data<float>());

    auto gate_w_node = std::make_shared<ov::op::v0::Constant>(gate_w);
    auto up_w_node = std::make_shared<ov::op::v0::Constant>(up_w);
    auto down_w_node = std::make_shared<ov::op::v0::Constant>(down_w);

    // The fused op is fed f16 weights; the f32 constants above are reused
    // unconverted by the reference model further down.
    auto gate_w_f16 = std::make_shared<ov::op::v0::Convert>(gate_w_node, ov::element::f16);
    auto up_w_f16 = std::make_shared<ov::op::v0::Convert>(up_w_node, ov::element::f16);
    auto down_w_f16 = std::make_shared<ov::op::v0::Convert>(down_w_node, ov::element::f16);

    // Fused model: Convert(f16) -> FusedMLP -> Convert(f32) -> Result.
    auto mlp_node = std::make_shared<ov::op::internal::FusedMLP>(input_f16, gate_w_f16, up_w_f16, down_w_f16);
    auto mlp_node_f32 = std::make_shared<ov::op::v0::Convert>(mlp_node, ov::element::f32);
    auto result = std::make_shared<ov::op::v0::Result>(mlp_node_f32);

    auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{input}, "fused_mlp_model");

    ov::Core core;
    auto compiled_model = core.compile_model(model, "GPU");
    auto ireq = compiled_model.create_infer_request();

    // Reference model built from decomposed ops: gate/up MatMuls, Swish on the
    // gate branch, elementwise Multiply, then the down MatMul. Input is 3-D
    // [B, S, IC] (no trailing bfyx axis).
    ov::PartialShape input_shape_ref{-1, -1, ic};
    auto input_ref = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape_ref);

    std::shared_ptr<ov::Node> gate_proj = std::make_shared<ov::op::v0::MatMul>(input_ref, gate_w_node, false, false);
    auto silu = std::make_shared<ov::op::v4::Swish>(gate_proj);
    auto up_proj = std::make_shared<ov::op::v0::MatMul>(input_ref, up_w_node, false, false);
    auto mul = std::make_shared<ov::op::v1::Multiply>(
        silu, up_proj, ov::op::AutoBroadcastType::NUMPY);
    auto down_proj = std::make_shared<ov::op::v0::MatMul>(mul, down_w_node, false, false);
    auto result_ref = std::make_shared<ov::op::v0::Result>(down_proj);

    auto model_ref = std::make_shared<ov::Model>(ov::ResultVector{result_ref}, ov::ParameterVector{input_ref}, "mlp_model_ref");

    auto compiled_model_ref = core.compile_model(model_ref, "GPU");
    auto ireq_ref = compiled_model_ref.create_infer_request();

    // Exercise several [b, s] combinations against both compiled models.
    test_shape(1, 2, ic, oc, gate_w_f32_val, up_w_f32_val, down_w_f32_val, ireq, ireq_ref);
    test_shape(1, 1, ic, oc, gate_w_f32_val, up_w_f32_val, down_w_f32_val, ireq, ireq_ref);
    test_shape(2, 2, ic, oc, gate_w_f32_val, up_w_f32_val, down_w_f32_val, ireq, ireq_ref);
    test_shape(2, 1, ic, oc, gate_w_f32_val, up_w_f32_val, down_w_f32_val, ireq, ireq_ref);
}