[Test] Moe/test2 #30198

Draft · wants to merge 33 commits into base: master

33 commits
aa10cbe
add perf
luo-cheng2021 Feb 11, 2025
3a80081
perf for sdpa/pa
luo-cheng2021 Feb 13, 2025
d209f03
add git ignore
luo-cheng2021 Mar 26, 2025
45bf16e
insert if for moe expert
luo-cheng2021 Mar 31, 2025
795e323
add moeexpert support
luo-cheng2021 Apr 2, 2025
d7f2602
fix moexpert precision is always f32
luo-cheng2021 Apr 2, 2025
7f4b901
add moeexpert support for gpu
luo-cheng2021 Apr 2, 2025
76a7d5b
opt: 1, simplify subgraph inside moeexpert; 2, only compute skip flag…
luo-cheng2021 Apr 4, 2025
6079269
opt: remove nonzero->split from subgraph into moeexpert for gpu
luo-cheng2021 Apr 5, 2025
c385c8f
Support Qwen3 rms kernel for input with dynamic padding
riverlijunjie Mar 31, 2025
ada754a
Add test case
riverlijunjie Mar 31, 2025
38ded44
WA: moe_expert wait all inputs ready
luo-cheng2021 Apr 7, 2025
f84303e
fix incorrect output shape computation
luo-cheng2021 Apr 8, 2025
df0ca20
add fast path for expert mask computation if no padding
luo-cheng2021 Apr 8, 2025
00e7d9a
qwen3 moe compile model opt, from 150s to 70s in LNL (#66)
riverlijunjie Apr 9, 2025
019262b
Move FuseMoeExpert2 ahead of CommonOptimizations to decrease compilin…
ceciliapeng2011 Apr 9, 2025
994c094
not use subgraph for moeexpert
luo-cheng2021 Apr 10, 2025
22c93ee
fix scale/zp layout; first expert should not be inplace
luo-cheng2021 Apr 11, 2025
7c872a0
merge all experts into one op
luo-cheng2021 Apr 12, 2025
75b9683
Optimize gather and index_add performance
riverlijunjie Apr 13, 2025
2d8eb4e
fix out_of_resource error on lunarlake
luo-cheng2021 Apr 14, 2025
06e436c
Move weigts from usm_host to usm_device memory
riverlijunjie Apr 16, 2025
02f2331
Add ITT for MoE
riverlijunjie Apr 17, 2025
b6b5f1d
Optimize BMG first token due to index_add kernel
riverlijunjie Apr 17, 2025
9383141
opt: merge all experts into one for batch1
luo-cheng2021 Apr 18, 2025
c7ef4ea
opt: cl code for mlp_*
luo-cheng2021 Apr 18, 2025
48b64a3
Set CMAKE_COMPILE_WARNING_AS_ERROR OFF
peterchen-intel Apr 11, 2025
76e6ed7
change weight back to ba
luo-cheng2021 Apr 19, 2025
8471f6b
small tune for lunarlake
luo-cheng2021 Apr 21, 2025
8ee0ed3
Merge remote-tracking branch 'luochen/luocheng/qwen3_moe2' into HEAD
peterchen-intel Apr 21, 2025
818ba1b
fuse onehot into moe
luo-cheng2021 Apr 21, 2025
eed40eb
not wait gpu for batch1
luo-cheng2021 Apr 21, 2025
774cff2
Merge remote-tracking branch 'luochen/luocheng/qwen3_moe2' into HEAD
peterchen-intel Apr 21, 2025
3 changes: 2 additions & 1 deletion .gitignore
@@ -2,7 +2,7 @@
_*
[Bb]uild*/
cmake-build*

.cache
# but ensure we don't skip Python files
!__init__.py
!__main__.py
@@ -15,6 +15,7 @@ cmake-build*
*.idea
.vscode
.vs/
out/
.vsconan/
.DS_Store
**/tags
7 changes: 2 additions & 5 deletions cmake/developer_package/features.cmake
@@ -22,11 +22,8 @@ else()
ov_option(USE_BUILD_TYPE_SUBFOLDER "Create dedicated sub-folder per build type for output binaries" ON)
endif()

if(DEFINED ENV{CI_BUILD_NUMBER} AND NOT (CMAKE_CROSSCOMPILING AND CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.4))
set(CMAKE_COMPILE_WARNING_AS_ERROR_DEFAULT ON)
else()
set(CMAKE_COMPILE_WARNING_AS_ERROR_DEFAULT OFF)
endif()
set(CMAKE_COMPILE_WARNING_AS_ERROR_DEFAULT OFF)
set(CMAKE_COMPILE_WARNING_AS_ERROR OFF)

ov_option (CMAKE_COMPILE_WARNING_AS_ERROR "Enable warnings as errors" ${CMAKE_COMPILE_WARNING_AS_ERROR_DEFAULT})

79 changes: 79 additions & 0 deletions src/common/transformations/include/ov_ops/moe_expert.hpp
@@ -0,0 +1,79 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"
#include "openvino/op/util/sub_graph_base.hpp"
#include "transformations_visibility.hpp"

namespace ov {
namespace op {
namespace internal {
/// \brief MOE expert operation.
///
/// \note Wraps a single expert's body as a sub-graph of the MOE block.
class TRANSFORMATIONS_API MOEExpert : public ov::op::util::SubGraphOp {
public:
OPENVINO_OP("MOEExpert", "ie_internal_opset", ov::op::util::SubGraphOp);

MOEExpert() = default;

struct Config {
size_t topk = 0;
size_t expert_num = 0;
size_t hidden_size = 0;
size_t expert_no = 0;
bool has_non_zero = true;
};

MOEExpert(const OutputVector& args, const Config& config, const std::shared_ptr<ov::Model>& body);

const Config& get_config() const;
void set_config(const Config& config);

bool visit_attributes(AttributeVisitor& visitor) override;
void validate_and_infer_types() override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;

private:
Config m_config{};
};

class TRANSFORMATIONS_API MOEExpert2 : public ov::op::Op {
public:
OPENVINO_OP("MOEExpert2", "ie_internal_opset", ov::op::Op);

MOEExpert2() = default;

struct Config {
size_t topk = 0;
size_t expert_num = 0;
size_t hidden_size = 0;
size_t expert_no = 0;
};

MOEExpert2(const OutputVector& args, const Config& config, const std::vector<std::shared_ptr<ov::Model>>& body);

const Config& get_config() const;
void set_config(const Config& config);
const std::vector<std::shared_ptr<ov::Model>>& get_body() const { return m_body; }
std::vector<std::shared_ptr<ov::Model>>& get_body() { return m_body; }
void add_body(int expert_no, std::shared_ptr<ov::Model> model) {
OPENVINO_ASSERT(expert_no == static_cast<int>(m_body.size()));
m_body.push_back(model);
}

bool visit_attributes(AttributeVisitor& visitor) override;
void validate_and_infer_types() override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;

private:
Config m_config{};
std::vector<std::shared_ptr<ov::Model>> m_body;
};

} // namespace internal
} // namespace op
} // namespace ov
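
For orientation, here is a minimal sketch (not part of the PR) of how one expert's body could be wrapped into a `MOEExpert` node; the `wrap_expert` helper and all `Config` values are illustrative assumptions:

```cpp
#include <memory>

#include "openvino/core/model.hpp"
#include "ov_ops/moe_expert.hpp"

// Hypothetical helper: wrap a single expert body into a MOEExpert node.
// validate_and_infer_types() expects exactly 4 op inputs in `args`.
std::shared_ptr<ov::Node> wrap_expert(const ov::OutputVector& args,
                                      const std::shared_ptr<ov::Model>& expert_body,
                                      size_t expert_no) {
    ov::op::internal::MOEExpert::Config cfg;
    cfg.topk = 8;            // router top-k (assumed value)
    cfg.expert_num = 128;    // total number of experts (assumed value)
    cfg.hidden_size = 2048;  // hidden dimension (assumed value)
    cfg.expert_no = expert_no;
    cfg.has_non_zero = true;  // body still contains the NonZero selection path
    return std::make_shared<ov::op::internal::MOEExpert>(args, cfg, expert_body);
}
```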
@@ -0,0 +1,50 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/pass/matcher_pass.hpp"
#include "transformations_visibility.hpp"

namespace ov {
namespace pass {

class TRANSFORMATIONS_API MoeExpert2If;
class TRANSFORMATIONS_API FuseMoeExpert;
class TRANSFORMATIONS_API FuseMoeExpert2;
class TRANSFORMATIONS_API FuseMoeExpertPlain;
class TRANSFORMATIONS_API FuseMoeExpertOneHot;

} // namespace pass
} // namespace ov

class ov::pass::MoeExpert2If : public ov::pass::MatcherPass {
public:
OPENVINO_MATCHER_PASS_RTTI("MoeExpert2If");
MoeExpert2If();
};

class ov::pass::FuseMoeExpert : public ov::pass::MatcherPass {
public:
OPENVINO_MATCHER_PASS_RTTI("FuseMoeExpert");
FuseMoeExpert();
};

class ov::pass::FuseMoeExpert2 : public ov::pass::MatcherPass {
public:
OPENVINO_MATCHER_PASS_RTTI("FuseMoeExpert2");
FuseMoeExpert2();
};

class ov::pass::FuseMoeExpertPlain : public ov::pass::MatcherPass {
public:
OPENVINO_MATCHER_PASS_RTTI("FuseMoeExpertPlain");
FuseMoeExpertPlain();
};

class ov::pass::FuseMoeExpertOneHot : public ov::pass::MatcherPass {
public:
OPENVINO_MATCHER_PASS_RTTI("FuseMoeExpertOneHot");
FuseMoeExpertOneHot();
};
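
As a usage sketch (assumed, not shown in the PR), these passes would be registered through the standard `ov::pass::Manager`; the ordering follows commit 019262b, which moves FuseMoeExpert2 ahead of CommonOptimizations, and the include path is a placeholder since this hunk's filename is not shown above:

```cpp
#include "openvino/pass/manager.hpp"

#include "transformations/fuse_moe_expert.hpp"  // placeholder path for the header above

// Hypothetical driver: run the MoE fusion passes on a model.
void run_moe_fusion(const std::shared_ptr<ov::Model>& model) {
    ov::pass::Manager manager;
    manager.register_pass<ov::pass::FuseMoeExpert2>();  // ahead of CommonOptimizations (019262b)
    manager.register_pass<ov::pass::MoeExpert2If>();
    manager.register_pass<ov::pass::FuseMoeExpertOneHot>();
    manager.run_passes(model);
}
```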
121 changes: 121 additions & 0 deletions src/common/transformations/src/ov_ops/moe_expert.cpp
@@ -0,0 +1,121 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ov_ops/moe_expert.hpp"

#include "itt.hpp"

namespace ov {
namespace op {
namespace internal {

MOEExpert::MOEExpert(const OutputVector& args, const Config& cfg, const std::shared_ptr<ov::Model>& body) : SubGraphOp(args), m_config(cfg) {
SubGraphOp::set_function(body);
if (cfg.has_non_zero) {
// The body keeps the NonZero token-selection path: map every op input 1:1 onto a body parameter.
for (size_t i = 0; i < body->get_parameters().size(); ++i)
m_input_descriptions[0].push_back(std::make_shared<InvariantInputDescription>(i, i));
} else {
// NonZero->Split was hoisted out of the body (commit 6079269), so op input 1 has no
// body parameter; inputs 0, 2 and 3 map onto body parameters 0, 1 and 2.
m_input_descriptions[0].push_back(std::make_shared<InvariantInputDescription>(0, 0));
m_input_descriptions[0].push_back(std::make_shared<InvariantInputDescription>(2, 1));
m_input_descriptions[0].push_back(std::make_shared<InvariantInputDescription>(3, 2));
}

for (size_t i = 0; i < body->get_output_size(); ++i)
m_output_descriptions[0].push_back(std::make_shared<BodyOutputDescription>(i, i));
constructor_validate_and_infer_types();
}

const MOEExpert::Config& MOEExpert::get_config() const {
return m_config;
}

void MOEExpert::set_config(const Config& config) {
m_config = config;
}

std::shared_ptr<ov::Node> MOEExpert::clone_with_new_inputs(const ov::OutputVector& new_args) const {
INTERNAL_OP_SCOPE(internal_MOEExpert_clone_with_new_inputs);
check_new_args_count(this, new_args);
return std::make_shared<MOEExpert>(new_args, m_config, get_function()->clone());
}

void MOEExpert::validate_and_infer_types() {
INTERNAL_OP_SCOPE(internal_MOEExpert_validate_and_infer_types);
OPENVINO_ASSERT(get_input_size() == 4, "MOEExpert must have 4 inputs whereas it has ", get_input_size());
OPENVINO_ASSERT(get_output_size() == 1, "MOEExpert must have 1 output whereas it has ", get_output_size());
const auto& body = get_function();
OPENVINO_ASSERT(body, "MOEExpert must have initialized body");
// With the NonZero path present in the body, the subgraph types/shapes can be inferred directly.
if (m_config.has_non_zero)
validate_and_infer_type_body(body, m_input_descriptions[0]);

set_output_type(0, get_input_element_type(0), get_input_partial_shape(0));
}

bool MOEExpert::visit_attributes(ov::AttributeVisitor& visitor) {
INTERNAL_OP_SCOPE(internal_MOEExpert_visit_attributes);
visitor.start_structure("config");

visitor.on_attribute("topk", m_config.topk);
visitor.on_attribute("expert_num", m_config.expert_num);
visitor.on_attribute("hidden_size", m_config.hidden_size);
visitor.on_attribute("expert_no", m_config.expert_no);
visitor.on_attribute("has_non_zero", m_config.has_non_zero);
visitor.finish_structure();

visitor.on_attribute("body", m_bodies[0]);
visitor.on_attribute("input_descriptions", m_input_descriptions[0]);
return true;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
MOEExpert2::MOEExpert2(const OutputVector& args, const Config& cfg, const std::vector<std::shared_ptr<ov::Model>>& body) : Op(args), m_config(cfg), m_body(body) {
constructor_validate_and_infer_types();
}

const MOEExpert2::Config& MOEExpert2::get_config() const {
return m_config;
}

void MOEExpert2::set_config(const Config& config) {
m_config = config;
}

std::shared_ptr<ov::Node> MOEExpert2::clone_with_new_inputs(const ov::OutputVector& new_args) const {
INTERNAL_OP_SCOPE(internal_MOEExpert2_clone_with_new_inputs);
check_new_args_count(this, new_args);
std::vector<std::shared_ptr<ov::Model>> models(m_body.size());
for (size_t i = 0; i < m_body.size(); i++) {
models[i] = m_body[i]->clone();
}
return std::make_shared<MOEExpert2>(new_args, m_config, models);
}

void MOEExpert2::validate_and_infer_types() {
INTERNAL_OP_SCOPE(internal_MOEExpert2_validate_and_infer_types);
OPENVINO_ASSERT(get_input_size() == 4, "MOEExpert2 must have 4 inputs whereas it has ", get_input_size());
OPENVINO_ASSERT(get_output_size() == 1, "MOEExpert2 must have 1 output whereas it has ", get_output_size());

set_output_type(0, get_input_element_type(0), get_input_partial_shape(0));
}

bool MOEExpert2::visit_attributes(ov::AttributeVisitor& visitor) {
INTERNAL_OP_SCOPE(internal_MOEExpert2_visit_attributes);
visitor.start_structure("config");

visitor.on_attribute("topk", m_config.topk);
visitor.on_attribute("expert_num", m_config.expert_num);
visitor.on_attribute("hidden_size", m_config.hidden_size);
visitor.on_attribute("expert_no", m_config.expert_no);
visitor.finish_structure();
// On serialization m_body is already populated; on deserialization it starts
// empty and is resized so each expert body can be read back below.
OPENVINO_ASSERT(m_body.empty() || m_config.expert_num == m_body.size());
m_body.resize(m_config.expert_num);
for (size_t i = 0; i < m_config.expert_num; i++)
visitor.on_attribute("body" + std::to_string(i), m_body[i]);
return true;
}

} // namespace internal
} // namespace op
} // namespace ov
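
Tying the two ops together, a hedged sketch of building the merged `MOEExpert2` from per-expert bodies, in the spirit of commit 7c872a0 ("merge all experts into one op"); the helper name and sizes are assumptions:

```cpp
#include <memory>
#include <vector>

#include "ov_ops/moe_expert.hpp"

// Hypothetical assembly: one MOEExpert2 op owning every expert body.
// validate_and_infer_types() expects exactly 4 op inputs in `args`.
std::shared_ptr<ov::Node> merge_experts(const ov::OutputVector& args,
                                        const std::vector<std::shared_ptr<ov::Model>>& bodies) {
    ov::op::internal::MOEExpert2::Config cfg;
    cfg.topk = 8;                    // router top-k (assumed value)
    cfg.expert_num = bodies.size();  // one body per expert
    cfg.hidden_size = 2048;          // hidden dimension (assumed value)
    return std::make_shared<ov::op::internal::MOEExpert2>(args, cfg, bodies);
}
```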