Commit 63693d5

Merge branch '2025.4-new-arch' into master_tr_module_genai
Signed-off-by: Zhang, Xiaolin <xiaolin.zhang@intel.com>
2 parents f8e400e + 9114927

273 files changed: 78,572 additions & 193 deletions

Note: large commits have some content hidden by default, so only a subset of the 273 changed files is shown below.

CMakeLists.txt

Lines changed: 3 additions & 0 deletions

@@ -95,6 +95,9 @@ add_subdirectory(src)
 if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/samples")
     add_subdirectory(samples)
 endif()
+if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/ov_ops_tests" AND ENABLE_NEW_ARCH_OPS)
+    add_subdirectory(ov_ops_tests)
+endif()
 if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/tools/continuous_batching" AND ENABLE_TOOLS)
     add_subdirectory(tools/continuous_batching)
 endif()

cmake/features.cmake

Lines changed: 2 additions & 0 deletions

@@ -9,6 +9,8 @@ option(ENABLE_SAMPLES "Enable samples build" ON)
 option(ENABLE_TESTS "Enable tests build" ON)
 option(ENABLE_TOOLS "Enable tools build" ON)
 option(ENABLE_GGUF "Enable support for GGUF format" ON)
+option(ENABLE_SAFETENSORS "Enable support for Safetensors format" ON)
+option(ENABLE_NEW_ARCH_OPS "Enable new-arch specific OpenVINO operations (LinearAttention, MOE, FusedMLP)" ON)
 option(ENABLE_XGRAMMAR "Enable support for structured output generation with xgrammar backend" ON)

 # Disable building samples for NPM package

ov_ops_tests/CMakeLists.txt

Lines changed: 40 additions & 0 deletions

# Copyright (C) 2018-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

# Expected OpenVINO runtime library locations:
# - Windows:     `<openvino_dir>\runtime\bin\intel64\Release\`
# - macOS x86:   `<openvino_dir>/runtime/lib/intel64/Release`
# - macOS arm64: `<openvino_dir>/runtime/lib/arm64/Release/`
# - Linux x86:   `<openvino_dir>/runtime/lib/intel64/`
# - Linux arm64: `<openvino_dir>/runtime/lib/aarch64/`
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH_DIR)
if(ARCH_DIR MATCHES "amd64.*|x86_64.*|AMD64.*")
    set(ARCH_DIR intel64)
elseif(ARCH_DIR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)")
    if(APPLE)
        set(ARCH_DIR "arm64")
    else()
        set(ARCH_DIR "aarch64")
    endif()
elseif(ARCH_DIR STREQUAL "x86_64" OR ARCH_DIR STREQUAL "amd64" # Windows detects Intel's 64-bit CPU as AMD64
       OR CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64")
    set(ARCH_DIR intel64)
endif()

if(WIN32 OR APPLE)
    if(GENERATOR_IS_MULTI_CONFIG_VAR)
        set(ARCH_DIR ${ARCH_DIR}/$<CONFIG>)
    else()
        set(ARCH_DIR ${ARCH_DIR}/${CMAKE_BUILD_TYPE})
    endif()
endif()

# Extract the last two digits from OpenVINOGenAI_VERSION_MAJOR (e.g. 2025 -> 25),
# because SOVERSION can only contain up to 4 symbols.
string(REGEX MATCH [=[[0-9][0-9]$]=] MAJOR_SUFFIX ${OpenVINOGenAI_VERSION_MAJOR})

#
# Include source dirs
#

add_subdirectory(cpp)

ov_ops_tests/cpp/CMakeLists.txt

Lines changed: 55 additions & 0 deletions

# Copyright (C) 2018-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.c")
list(APPEND SOURCE_FILES "${CMAKE_CURRENT_BINARY_DIR}/version.cpp")

# Dependencies

include(FetchContent)

set(XGRAMMAR_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/sampling/structured_output/xgrammar_backend.hpp
                     ${CMAKE_CURRENT_SOURCE_DIR}/src/sampling/structured_output/xgrammar_backend.cpp)
list(REMOVE_ITEM SOURCE_FILES ${XGRAMMAR_SOURCES})

# Generate version files
configure_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/version.hpp.in"
               "${CMAKE_CURRENT_BINARY_DIR}/openvino/genai/version.hpp" @ONLY)
configure_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/version.cpp.in"
               "${CMAKE_CURRENT_BINARY_DIR}/version.cpp" @ONLY)

# Each test is a single translation unit named <target>.cpp in this directory.
function(add_test_executable target_name)
    add_executable(${target_name} ${target_name}.cpp)
    target_link_libraries(${target_name} PRIVATE openvino::runtime openvino::core::dev openvino::runtime::dev openvino::threading)
    set_target_properties(${target_name} PROPERTIES
        # Ensure out-of-box LC_RPATH on macOS with SIP
        INSTALL_RPATH_USE_LINK_PATH ON)
    install(TARGETS ${target_name}
            RUNTIME DESTINATION test_bin/
            COMPONENT samples_bin
            EXCLUDE_FROM_ALL)
endfunction()

set(TEST_LIST
    fused_mlp_test
    moe_q41_moe3gemm_test
    test_moe_layer)

foreach(testcase IN LISTS TEST_LIST)
    add_test_executable(${testcase})
endforeach()

# test_moe_layer needs access to gguf_utils headers and gguflib.
# It also needs to compile building_blocks.cpp and gguf.cpp, which contain required functions.
target_sources(test_moe_layer PRIVATE
    "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/gguf_utils/building_blocks.cpp"
    "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/gguf_utils/gguf.cpp"
    "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/gguf_utils/gguf_quants.cpp")
target_include_directories(test_moe_layer PRIVATE "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src")
if(ENABLE_GGUF AND TARGET gguflib)
    target_link_libraries(test_moe_layer PRIVATE gguflib)
    target_compile_definitions(test_moe_layer PRIVATE ENABLE_GGUF)
endif()
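
The ENABLE_GGUF compile definition set above lets test_moe_layer guard its GGUF-dependent code paths at compile time. A minimal sketch of how such a definition is typically consumed on the C++ side (this snippet is illustrative and not taken from the commit):

#include <iostream>

// Hypothetical sketch: code guarded by the ENABLE_GGUF compile definition
// (set via target_compile_definitions above) is compiled only when GGUF
// support is enabled in the build.
int main() {
#ifdef ENABLE_GGUF
    std::cout << "GGUF support compiled in: gguflib loader available\n";
#else
    std::cout << "GGUF support disabled: GGUF-specific test paths are skipped\n";
#endif
}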

ov_ops_tests/cpp/fused_mlp_test.cpp

Lines changed: 198 additions & 0 deletions

#include <openvino/openvino.hpp>
#include "openvino/opsets/opset13.hpp"
#include "openvino/opsets/opset10.hpp"
#include "openvino/op/fused_mlp.hpp"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <random>

static float sigmoid(float x) {
    return 1.0f / (1.0f + std::exp(-x));
}

// Naive row-major matrix multiplication: C[m, n] = A[m, k] * B[k, n].
static std::vector<float> matmul(const std::vector<float>& a, const std::vector<float>& b, int64_t m, int64_t k, int64_t n) {
    std::vector<float> c(static_cast<size_t>(m * n), 0.0f);
    for (int64_t i = 0; i < m; ++i) {
        for (int64_t j = 0; j < n; ++j) {
            float acc = 0.0f;
            for (int64_t kk = 0; kk < k; ++kk) {
                acc += a[static_cast<size_t>(i * k + kk)] * b[static_cast<size_t>(kk * n + j)];
            }
            c[static_cast<size_t>(i * n + j)] = acc;
        }
    }
    return c;
}

std::vector<float> create_random_f32_vector(ov::Shape tensor_shape) {
    // Calculate the total number of elements
    size_t total_elements = std::accumulate(tensor_shape.begin(),
                                            tensor_shape.end(),
                                            size_t{1},
                                            std::multiplies<size_t>());

    // Create and fill a vector with random data
    std::vector<float> input_data(total_elements);

    // Mersenne Twister engine for high-quality pseudo-random numbers
    std::mt19937 generator(std::random_device{}());

    // Uniform distribution between -0.5 and 0.5 (a common range for input testing)
    std::uniform_real_distribution<float> distribution(-0.5f, 0.5f);

    // Fill the vector with random floats
    std::generate(input_data.begin(), input_data.end(),
                  [&]() { return distribution(generator); });

    return input_data;
}

void test_shape(
    size_t b, size_t s, size_t ic, size_t oc,
    std::vector<float> gate_w_f32_val,
    std::vector<float> up_w_f32_val,
    std::vector<float> down_w_f32_val,
    ov::InferRequest ir,
    ov::InferRequest ir_ref
) {
    printf("----------------- test case: [b, s, ic, oc] = [%zu, %zu, %zu, %zu]\n", b, s, ic, oc);
    const size_t mb = b * s;

    // FusedMLP run
    ov::Shape input_shape{b, s, ic, 1};
    auto x_vals = create_random_f32_vector(input_shape);
    ov::Tensor input_tensor(ov::element::f32, input_shape);
    std::copy(x_vals.begin(), x_vals.end(), input_tensor.data<float>());

    ir.set_input_tensor(0, input_tensor);
    ir.infer();
    auto output = ir.get_output_tensor(0);
    std::vector<float> actual(output.data<float>(), output.data<float>() + output.get_size());

    // OpenVINO reference model run
    ov::Shape input_shape_ref{b, s, ic};
    ov::Tensor input_tensor_ref(ov::element::f32, input_shape_ref);
    std::copy(x_vals.begin(), x_vals.end(), input_tensor_ref.data<float>());

    ir_ref.set_input_tensor(0, input_tensor_ref);
    ir_ref.infer();
    auto output_ref = ir_ref.get_output_tensor(0);
    std::vector<float> ov_gpu_ref(output_ref.data<float>(), output_ref.data<float>() + output_ref.get_size());

    // CPU reference: SwiGLU-style MLP computed with naive matmuls
    auto gate = matmul(x_vals, gate_w_f32_val, mb, ic, oc);
    auto up = matmul(x_vals, up_w_f32_val, mb, ic, oc);

    std::vector<float> swish(mb * oc, 0.0f);
    for (size_t i = 0; i < swish.size(); ++i) {
        swish[i] = gate[i] * sigmoid(gate[i]);
    }

    std::vector<float> hidden(mb * oc, 0.0f);
    for (size_t i = 0; i < hidden.size(); ++i) {
        hidden[i] = swish[i] * up[i];
    }

    auto ref = matmul(hidden, down_w_f32_val, mb, oc, ic);

    for (size_t i = 0; i < 10; i++) {
        std::cout << "i: " << i << ", fused_mlp: " << std::fixed << std::setprecision(3) << actual[i]
                  << ", \t ov gpu ref: " << std::fixed << std::setprecision(3) << ov_gpu_ref[i]
                  << ", \t CPU ref: " << std::fixed << std::setprecision(3) << ref[i] << std::endl;
    }

    // const float absolute_error_threshold = 5e-2f;
    const float absolute_error_threshold = 1.0f;

    if (actual.size() != ref.size() || actual.size() != ov_gpu_ref.size()) {
        printf("Output size mismatch, fused_mlp: %zu, ov gpu ref: %zu, cpu ref: %zu\n", actual.size(), ov_gpu_ref.size(), ref.size());
    }

    for (size_t i = 0; i < ref.size(); ++i) {
        if (std::fabs(actual[i] - ref[i]) > absolute_error_threshold || std::fabs(ov_gpu_ref[i] - actual[i]) > absolute_error_threshold) {
            std::cout << "Mismatch detected at index: " << i << ", fused_mlp: " << actual[i] << ", cpu ref: " << ref[i] << std::endl;
            exit(1);  // fail with a nonzero status instead of exiting as if the test passed
        }
    }

    std::cout << "\nPassed" << std::endl;
}

int main(int argc, char* argv[]) {
    // Shapes (bfyx encoded):
    // X:      [B, S, IC, 1]
    // W_gate: [IC, OC, 1, 1]
    // W_up:   [IC, OC, 1, 1]
    // W_down: [OC, IC, 1, 1]
    const int64_t ic = 2560;
    const int64_t oc = 9728;

    ov::PartialShape input_shape_dynamic({-1, -1, ic, 1});
    auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape_dynamic);
    auto input_f16 = std::make_shared<ov::op::v0::Convert>(input, ov::element::f16);

    ov::Shape gate_w_shape{ic, oc};
    ov::Shape up_w_shape{ic, oc};
    ov::Shape down_w_shape{oc, ic};

    std::vector<float> gate_w_f32_val = create_random_f32_vector(gate_w_shape);
    std::vector<float> up_w_f32_val = create_random_f32_vector(up_w_shape);
    std::vector<float> down_w_f32_val = create_random_f32_vector(down_w_shape);

    ov::Tensor gate_w = ov::Tensor(ov::element::f32, gate_w_shape);
    ov::Tensor up_w = ov::Tensor(ov::element::f32, up_w_shape);
    ov::Tensor down_w = ov::Tensor(ov::element::f32, down_w_shape);

    std::copy(gate_w_f32_val.begin(), gate_w_f32_val.end(), gate_w.data<float>());
    std::copy(up_w_f32_val.begin(), up_w_f32_val.end(), up_w.data<float>());
    std::copy(down_w_f32_val.begin(), down_w_f32_val.end(), down_w.data<float>());

    auto gate_w_node = std::make_shared<ov::op::v0::Constant>(gate_w);
    auto up_w_node = std::make_shared<ov::op::v0::Constant>(up_w);
    auto down_w_node = std::make_shared<ov::op::v0::Constant>(down_w);

    auto gate_w_f16 = std::make_shared<ov::op::v0::Convert>(gate_w_node, ov::element::f16);
    auto up_w_f16 = std::make_shared<ov::op::v0::Convert>(up_w_node, ov::element::f16);
    auto down_w_f16 = std::make_shared<ov::op::v0::Convert>(down_w_node, ov::element::f16);

    auto mlp_node = std::make_shared<ov::op::internal::FusedMLP>(input_f16, gate_w_f16, up_w_f16, down_w_f16);
    auto mlp_node_f32 = std::make_shared<ov::op::v0::Convert>(mlp_node, ov::element::f32);
    auto result = std::make_shared<ov::op::v0::Result>(mlp_node_f32);

    auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{input}, "fused_mlp_model");

    ov::Core core;
    auto compiled_model = core.compile_model(model, "GPU");
    auto ireq = compiled_model.create_infer_request();

    // Reference model uses MatMul & Swish
    ov::PartialShape input_shape_ref{-1, -1, ic};
    auto input_ref = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape_ref);

    std::shared_ptr<ov::Node> gate_proj = std::make_shared<ov::op::v0::MatMul>(input_ref, gate_w_node, false, false);
    auto silu = std::make_shared<ov::op::v4::Swish>(gate_proj);
    auto up_proj = std::make_shared<ov::op::v0::MatMul>(input_ref, up_w_node, false, false);
    auto mul = std::make_shared<ov::op::v1::Multiply>(
        silu, up_proj, ov::op::AutoBroadcastType::NUMPY);
    auto down_proj = std::make_shared<ov::op::v0::MatMul>(mul, down_w_node, false, false);
    auto result_ref = std::make_shared<ov::op::v0::Result>(down_proj);

    auto model_ref = std::make_shared<ov::Model>(ov::ResultVector{result_ref}, ov::ParameterVector{input_ref}, "mlp_model_ref");

    auto compiled_model_ref = core.compile_model(model_ref, "GPU");
    auto ireq_ref = compiled_model_ref.create_infer_request();

    test_shape(1, 2, ic, oc, gate_w_f32_val, up_w_f32_val, down_w_f32_val, ireq, ireq_ref);
    test_shape(1, 1, ic, oc, gate_w_f32_val, up_w_f32_val, down_w_f32_val, ireq, ireq_ref);
    test_shape(2, 2, ic, oc, gate_w_f32_val, up_w_f32_val, down_w_f32_val, ireq, ireq_ref);
    test_shape(2, 1, ic, oc, gate_w_f32_val, up_w_f32_val, down_w_f32_val, ireq, ireq_ref);
}
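
For reference, the fused op, the MatMul/Swish GPU model, and the naive CPU loop above all compute the same SwiGLU-style MLP (OpenVINO's Swish with the default beta of 1 is SiLU). In matrix form:

$$
\mathrm{FusedMLP}(X) = \big(\mathrm{SiLU}(X W_{\text{gate}}) \odot (X W_{\text{up}})\big)\, W_{\text{down}},
\qquad
\mathrm{SiLU}(x) = x \cdot \sigma(x) = \frac{x}{1 + e^{-x}},
$$

where $X \in \mathbb{R}^{(B \cdot S) \times IC}$, $W_{\text{gate}}, W_{\text{up}} \in \mathbb{R}^{IC \times OC}$, $W_{\text{down}} \in \mathbb{R}^{OC \times IC}$, and $\odot$ is elementwise multiplication, so the output is again $(B \cdot S) \times IC$.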
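
The pass/fail check uses a flat absolute threshold of 1.0f (a stricter 5e-2f is left commented out), which is loose: the FusedMLP path runs in f16 while both references accumulate in f32, and errors grow with the IC = 2560 reduction length. A common alternative is a combined absolute/relative criterion; below is a minimal sketch of such a check, where the helper name all_close and the tolerance values are illustrative assumptions, not part of the commit:

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical helper: passes if every element is within atol + rtol * |ref|,
// a numpy.allclose-style criterion that scales with the magnitude of the data.
static bool all_close(const std::vector<float>& actual, const std::vector<float>& ref,
                      float atol = 1e-2f, float rtol = 5e-2f) {
    if (actual.size() != ref.size()) {
        return false;
    }
    for (size_t i = 0; i < actual.size(); ++i) {
        const float tol = atol + rtol * std::fabs(ref[i]);
        if (std::fabs(actual[i] - ref[i]) > tol) {
            std::printf("Mismatch at %zu: actual=%f, ref=%f, tol=%f\n", i, actual[i], ref[i], tol);
            return false;
        }
    }
    return true;
}

Scaling the tolerance with |ref| keeps the check meaningful for both near-zero and large-magnitude outputs, instead of one fixed bound for all elements.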
