Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -35,37 +35,48 @@ struct moe_mask_gen : public primitive_base<moe_mask_gen> {
moe_mask_gen(const primitive_id& id,
const input_info& router_idx,
const int32_t num_total_experts,
const int32_t num_experts_per_token)
const int32_t num_experts_per_token,
const bool onednn_grouped_gemm_used = false)
: primitive_base(id, {router_idx}, 5),
num_total_experts(num_total_experts),
num_experts_per_token(num_experts_per_token) {}
num_experts_per_token(num_experts_per_token),
onednn_grouped_gemm_used(onednn_grouped_gemm_used) {}

int32_t num_total_experts = 0;
int32_t num_experts_per_token = 0;
bool onednn_grouped_gemm_used = false;

size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_combine(seed, num_total_experts);
seed = hash_combine(seed, num_experts_per_token);
return primitive::hash();
seed = hash_combine(seed, onednn_grouped_gemm_used);
return seed;
}

bool operator==(const primitive& rhs) const override {
if (!compare_common_params(rhs))
return false;
return true;
if (auto rhs_casted = dynamic_cast<const moe_mask_gen*>(&rhs)) {
return num_total_experts == rhs_casted->num_total_experts &&
num_experts_per_token == rhs_casted->num_experts_per_token &&
onednn_grouped_gemm_used == rhs_casted->onednn_grouped_gemm_used;
}
return false;
}

void save(BinaryOutputBuffer& ob) const override {
primitive_base<moe_mask_gen>::save(ob);
ob << num_total_experts;
ob << num_experts_per_token;
ob << onednn_grouped_gemm_used;
}

void load(BinaryInputBuffer& ib) override {
primitive_base<moe_mask_gen>::load(ib);
ib >> num_total_experts;
ib >> num_experts_per_token;
ib >> onednn_grouped_gemm_used;
}
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,22 @@ struct moe_scatter_reduction : public primitive_base<moe_scatter_reduction> {
const input_info& experts_info_offsets,
const input_info& tokens_len_per_expert,
const input_info& experts_ids,
const ov::intel_gpu::op::MOECompressed::Config& moe_config)
const ov::intel_gpu::op::MOECompressed::Config& moe_config,
const bool onednn_grouped_gemm_used = false)
: primitive_base(id, {data, experts_per_token, expert_weights_per_token, tokens_per_expert, experts_info_offsets, tokens_len_per_expert, experts_ids}),
num_active_experts_per_token(static_cast<int32_t>(moe_config.top_k)),
has_batch_dim(moe_config.has_batch_dim) {}
has_batch_dim(moe_config.has_batch_dim),
onednn_grouped_gemm_used(onednn_grouped_gemm_used) {}

int32_t num_active_experts_per_token = 0;
bool has_batch_dim = true;
bool onednn_grouped_gemm_used = false;

size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_combine(seed, num_active_experts_per_token);
seed = hash_combine(seed, has_batch_dim);
seed = hash_combine(seed, onednn_grouped_gemm_used);
return seed;
}

Expand All @@ -54,19 +58,23 @@ struct moe_scatter_reduction : public primitive_base<moe_scatter_reduction> {

auto rhs_casted = downcast<const moe_scatter_reduction>(rhs);

return num_active_experts_per_token == rhs_casted.num_active_experts_per_token;
return num_active_experts_per_token == rhs_casted.num_active_experts_per_token &&
has_batch_dim == rhs_casted.has_batch_dim &&
onednn_grouped_gemm_used == rhs_casted.onednn_grouped_gemm_used;
}

void save(BinaryOutputBuffer& ob) const override {
primitive_base<moe_scatter_reduction>::save(ob);
ob << num_active_experts_per_token;
ob << has_batch_dim;
ob << onednn_grouped_gemm_used;
}

void load(BinaryInputBuffer& ib) override {
primitive_base<moe_scatter_reduction>::load(ib);
ib >> num_active_experts_per_token;
ib >> has_batch_dim;
ib >> onednn_grouped_gemm_used;
}
};
}
Expand Down
5 changes: 2 additions & 3 deletions src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include "gpu/intel/microkernels/fuser.hpp"
#include "gpu/intel/gemm/jit/include/gemmstone/microkernel/fuser.hpp"
#endif

namespace {
Expand Down Expand Up @@ -350,8 +350,7 @@ void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& co
std::vector<uint8_t> binary = kernels[0]->get_binary();
kernels.clear();
// Update binary and rebuild kernel
using namespace dnnl::impl::gpu::intel;
micro::fuseMicrokernels(binary, combined_source.c_str());
gemmstone::microkernel::fuse(binary, combined_source.c_str());
_builder->build_kernels(binary.data(), binary.size(), KernelFormat::NATIVE_BIN, "", kernels);
#else // ENABLE_ONEDNN_FOR_GPU
OPENVINO_THROW("[GPU] Can't compile kernel w/ microkernels as onednn is not available");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ void MoE3GemmMicroGenerator::init_microkernels(const kernel_impl_params& params,
GPU_DEBUG_TRACE_DETAIL << "\t weight group size: " << group_size << "\n";

micro::GEMMProblem problem_moe;
micro::GEMMProtocol::Options opts_moe;
micro::GEMMOptions opts_moe;
opts_moe.slmPtr = true;
opts_moe.kParallelLocal = !is_prefill;
enum class MICRO_DIMENSIONALITY { NONE = -1, SCALAR = 0, VECTOR = 1, MATRIX = 2 };
Expand Down
104 changes: 0 additions & 104 deletions src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_gemm.cpp

This file was deleted.

104 changes: 0 additions & 104 deletions src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_gemm.hpp

This file was deleted.

Loading
Loading