// Copyright (C) 2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "../utils/load_image.hpp"
#include "../utils/model_yaml.hpp"
#include "../utils/ut_modules_base.hpp"
#include "../utils/utils.hpp"

#include "modeling/models/qwen3_5/processing_qwen3_5.hpp"

// Parameters for test:
//   string: mode ("text" or "vl")
//   string: device
using test_params = std::tuple<std::string, std::string>;
using namespace ov::genai::module;
| 17 | +// ============================================================================ |
| 18 | +// Test fixture |
| 19 | +// ============================================================================ |
| 20 | + |
| 21 | +class LLMInferenceSDPAModuleTest : public ModuleTestBase, |
| 22 | + public ::testing::TestWithParam<test_params> { |
| 23 | +private: |
| 24 | + std::string _mode; |
| 25 | + std::string _device; |
| 26 | + |
| 27 | + std::string _module_name = "sdpa_llm_infer"; |
| 28 | + |
| 29 | +public: |
| 30 | + static std::string get_test_case_name(const testing::TestParamInfo<test_params>& obj) { |
| 31 | + const auto& mode = std::get<0>(obj.param); |
| 32 | + const auto& device = std::get<1>(obj.param); |
| 33 | + std::string result; |
| 34 | + result += "Device_" + device; |
| 35 | + result += "_Mode_" + mode; |
| 36 | + return result; |
| 37 | + } |
| 38 | + |
| 39 | + void SetUp() override { |
| 40 | + REGISTER_TEST_NAME(); |
| 41 | + std::tie(_mode, _device) = GetParam(); |
| 42 | + } |
| 43 | + |
| 44 | + void TearDown() override {} |
| 45 | + |
| 46 | +protected: |
| 47 | + // ------------------------------------------------------------------ |
| 48 | + // YAML generation — configure the module for text or VL mode |
| 49 | + // ------------------------------------------------------------------ |
| 50 | + std::string get_yaml_content() override { |
| 51 | + YAML::Node config; |
| 52 | + config["global_context"]["model_type"] = "qwen3_5"; |
| 53 | + |
| 54 | + YAML::Node pipeline_modules = config["pipeline_modules"]; |
| 55 | + |
| 56 | + YAML::Node llm_sdpa; |
| 57 | + llm_sdpa["type"] = "LLMInferenceSDPAModule"; |
| 58 | + llm_sdpa["device"] = _device; |
| 59 | + llm_sdpa["description"] = "LLM Inference SDPA Module test."; |
| 60 | + |
| 61 | + // ---- Inputs ---- |
| 62 | + YAML::Node inputs; |
| 63 | + // input_ids is always required |
| 64 | + inputs.push_back(input_node("input_ids", "OVTensor")); |
| 65 | + |
| 66 | + if (_mode == "vl") { |
| 67 | + inputs.push_back(input_node("visual_embeds", "OVTensor")); |
| 68 | + inputs.push_back(input_node("visual_pos_mask", "OVTensor")); |
| 69 | + inputs.push_back(input_node("grid_thw", "OVTensor")); |
| 70 | + } |
| 71 | + llm_sdpa["inputs"] = inputs; |
| 72 | + |
| 73 | + // ---- Outputs ---- |
| 74 | + YAML::Node outputs; |
| 75 | + outputs.push_back(output_node("generated_text", "String")); |
| 76 | + llm_sdpa["outputs"] = outputs; |
| 77 | + |
| 78 | + // ---- Params ---- |
| 79 | + YAML::Node params; |
| 80 | + params["model_path"] = TEST_MODEL::Qwen3_5_0_8B(); |
| 81 | + params["max_new_tokens"] = "16"; |
| 82 | + llm_sdpa["params"] = params; |
| 83 | + |
| 84 | + pipeline_modules[_module_name] = llm_sdpa; |
| 85 | + return YAML::Dump(config); |
| 86 | + } |
| 87 | + |
| 88 | + // ------------------------------------------------------------------ |
| 89 | + // Input preparation |
| 90 | + // ------------------------------------------------------------------ |
| 91 | + ov::AnyMap prepare_inputs() override { |
| 92 | + ov::AnyMap inputs; |
| 93 | + |
| 94 | + if (_mode == "text") { |
| 95 | + // Text mode: provide tokenized input_ids directly. |
| 96 | + // Token ids for a simple prompt (placeholder ids). |
| 97 | + std::vector<int64_t> token_values = {9707}; // e.g. "Hello" |
| 98 | + ov::Tensor input_ids(ov::element::i64, {1, token_values.size()}); |
| 99 | + std::copy(token_values.begin(), token_values.end(), input_ids.data<int64_t>()); |
| 100 | + inputs["input_ids"] = input_ids; |
| 101 | + |
| 102 | + } else { |
| 103 | + // VL mode: provide input_ids with vision placeholder tokens, |
| 104 | + // plus synthetic visual embeddings. |
| 105 | + // |
| 106 | + // Sequence layout: [vision_start] [img_tok × N_vis] [vision_end] [text_tok] |
| 107 | + // |
| 108 | + // Read model config to get hidden_size and special token ids dynamically |
| 109 | + auto model_cfg = ov::genai::modeling::models::Qwen3_5Config::from_json_file( |
| 110 | + TEST_MODEL::Qwen3_5_0_8B()); |
| 111 | + const int64_t vision_start_id = model_cfg.vision_start_token_id; |
| 112 | + const int64_t image_token_id = model_cfg.image_token_id; |
| 113 | + const int64_t vision_end_id = model_cfg.vision_end_token_id; |
| 114 | + constexpr int64_t text_token_id = 9707; |
| 115 | + |
| 116 | + // grid_thw = [1, 4, 4] with spatial_merge_size=2 gives: |
| 117 | + // n_vis = T * (H/merge) * (W/merge) = 1 * 2 * 2 = 4 |
| 118 | + constexpr size_t n_vis = 4; // visual tokens count |
| 119 | + constexpr size_t seq_len = 2 + n_vis + 1; // start + vis×N + end + text |
| 120 | + const size_t hidden = static_cast<size_t>(model_cfg.text.hidden_size); |
| 121 | + |
| 122 | + // input_ids: [vision_start, img, img, img, img, vision_end, text] |
| 123 | + ov::Tensor input_ids(ov::element::i64, {1, seq_len}); |
| 124 | + auto* ids = input_ids.data<int64_t>(); |
| 125 | + ids[0] = vision_start_id; |
| 126 | + for (size_t i = 0; i < n_vis; ++i) |
| 127 | + ids[1 + i] = image_token_id; |
| 128 | + ids[1 + n_vis] = vision_end_id; |
| 129 | + ids[2 + n_vis] = text_token_id; |
| 130 | + inputs["input_ids"] = input_ids; |
| 131 | + |
| 132 | + // visual_embeds: compact visual embeddings [n_vis, hidden] |
| 133 | + // scatter_visual_embeds expects 2D [V, H] (flat across all images) |
| 134 | + ov::Tensor visual_embeds(ov::element::f32, {n_vis, hidden}); |
| 135 | + std::fill_n(visual_embeds.data<float>(), n_vis * hidden, 0.01f); |
| 136 | + inputs["visual_embeds"] = visual_embeds; |
| 137 | + |
| 138 | + // visual_pos_mask: boolean [1, seq_len] — true at visual token positions |
| 139 | + ov::Tensor visual_pos_mask(ov::element::boolean, {1, seq_len}); |
| 140 | + auto* mask = visual_pos_mask.data<bool>(); |
| 141 | + mask[0] = false; // vision_start |
| 142 | + for (size_t i = 0; i < n_vis; ++i) |
| 143 | + mask[1 + i] = true; // image tokens |
| 144 | + mask[1 + n_vis] = false; // vision_end |
| 145 | + mask[2 + n_vis] = false; // text |
| 146 | + inputs["visual_pos_mask"] = visual_pos_mask; |
| 147 | + |
| 148 | + // grid_thw: [N_images, 3] — (T, H, W) for 3D MRoPE |
| 149 | + // Single image: T=1, H=4, W=4, spatial_merge_size=2 → n_vis = 1×2×2 = 4 |
| 150 | + ov::Tensor grid_thw(ov::element::i64, {1, 3}); |
| 151 | + auto* thw = grid_thw.data<int64_t>(); |
| 152 | + thw[0] = 1; // T |
| 153 | + thw[1] = 4; // H |
| 154 | + thw[2] = 4; // W |
| 155 | + inputs["grid_thw"] = grid_thw; |
| 156 | + } |
| 157 | + return inputs; |
| 158 | + } |
| 159 | + |
| 160 | + // ------------------------------------------------------------------ |
| 161 | + // Output verification — check generated text against known output |
| 162 | + // ------------------------------------------------------------------ |
| 163 | + void check_outputs(ov::genai::module::ModulePipeline& pipe) override { |
| 164 | + auto generated_text = pipe.get_output("generated_text").as<std::string>(); |
| 165 | + |
| 166 | + if (std::getenv("VERBOSE")) |
| 167 | + std::cout << "[TEST:" << _mode << "] generated_text = [" << generated_text << "]" << std::endl; |
| 168 | + |
| 169 | + EXPECT_FALSE(generated_text.empty()) << "Generated text should not be empty"; |
| 170 | + |
| 171 | + // Greedy decoding is deterministic — verify expected substrings. |
| 172 | + if (_mode == "text") { |
| 173 | + EXPECT_NE(generated_text.find(". 2020"), std::string::npos) |
| 174 | + << "Text-mode output should contain '. 2020', got: " << generated_text; |
| 175 | + } else { |
| 176 | + EXPECT_NE(generated_text.find("uality of the"), std::string::npos) |
| 177 | + << "VL-mode output should contain 'uality of the', got: " << generated_text; |
| 178 | + } |
| 179 | + } |
| 180 | +}; |
| 181 | + |
// ============================================================================
// Test cases
// ============================================================================

// Single entry point per parameter set. run() is inherited from
// ModuleTestBase — presumably it builds the pipeline from get_yaml_content(),
// feeds prepare_inputs(), and invokes check_outputs(); confirm in
// ut_modules_base.hpp.
TEST_P(LLMInferenceSDPAModuleTest, ModuleTest) {
    run();
}
| 189 | + |
| 190 | +// ============================================================================ |
| 191 | +// Parameterised instantiation |
| 192 | +// ============================================================================ |
| 193 | + |
| 194 | +static std::vector<test_params> g_test_params = { |
| 195 | + // Text mode |
| 196 | + {"text", TEST_MODEL::get_device()}, |
| 197 | + // VL mode |
| 198 | + {"vl", TEST_MODEL::get_device()}, |
| 199 | +}; |
| 200 | + |
| 201 | +INSTANTIATE_TEST_SUITE_P(ModuleTestSuite, |
| 202 | + LLMInferenceSDPAModuleTest, |
| 203 | + ::testing::ValuesIn(g_test_params), |
| 204 | + LLMInferenceSDPAModuleTest::get_test_case_name); |