Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "third_party/oneDNN"]
path = third_party/oneDNN
url = https://github.com/uxlfoundation/oneDNN.git
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)

list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)

# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")

Expand Down Expand Up @@ -66,6 +68,7 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
# Import torch cmake configuration.
find_package(Torch REQUIRED)

find_package(oneDNN REQUIRED)

#
# Forward the non-CUDA device extensions to external CMake scripts.
Expand Down Expand Up @@ -191,8 +194,10 @@ if(VLLM_GPU_LANG STREQUAL "SYCL")
endif()

if(ONEDNN_FOUND)
  # Collect the oneDNN-backed kernel sources. Only .cpp files are globbed:
  # headers are made visible via include_directories() below and must not
  # be compiled as translation units.
  set(_ONEDNN_SRC)
  file(GLOB _ONEDNN_SRC csrc/xpu/onednn/*.cpp)
  # Append only the expanded file list. Appending the literal string
  # "csrc/xpu/onednn/*.cpp" as well (as before) makes CMake treat the glob
  # pattern itself as a source file name and fail at generate time.
  list(APPEND VLLM_EXT_XPU_SRC
    ${_ONEDNN_SRC}
  )
  include_directories(${ONEDNN_INCLUDE_DIR})
  link_libraries(${ONEDNN_LIBRARY})
Expand Down
26 changes: 23 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ python3 setup.py develop - will be local install if we use develop build or syst

On vllm side, we will `import vllm_xpu_kernels._C` at start time which should register all custom ops so we can directly use.

### prepare
### Prepare

Install oneapi 2025.1 deep learning essential [dependency](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html).

Expand All @@ -22,7 +22,7 @@ Create a new virtual env, install build dependency and torch dependency
pip install -r requirements.txt
```

### build & install
### Build & Install
Build development installation to current directory:

```
Expand All @@ -41,5 +41,25 @@ or build wheel (generated .whl in dist folder)
VLLM_TARGET_DEVICE=xpu python3 setup.py bdist_wheel
```

### how to use in vLLM
### How to use in vLLM
Please refer to temporary branch https://github.com/jikunshang/vllm/tree/xpu_kernel to install & test vllm which replaces `rms_norm` kernel from IPEX to vllm-xpu-kernels.

### Why Static Linking DNNL Instead of Shared Linking?

We chose to **statically link oneDNN (DNNL)** rather than using it as a shared library for the following reasons:

#### 1. **Version Compatibility**

Static linking ensures our application always uses the exact version of DNNL. With shared libraries, there's a risk that system-installed versions might be incompatible or introduce subtle bugs due to API/ABI changes.

#### 2. **Performance Consistency**

By linking statically, we avoid potential performance variability introduced by different builds or configurations of DNNL that might be present on the host system.

#### 3. **Avoiding Runtime Errors**
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe one more reason: torch-xpu also use static link. cc @rogerxfeng8

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated.


Using shared libraries requires correct paths and environment setup (`LD_LIBRARY_PATH` on Linux). Static linking avoids issues where DNNL cannot be found or loaded at runtime.

#### 4. **Aligning with PyTorch**

One key reason to use static linking is to maintain consistency with the PyTorch ecosystem. PyTorch itself statically links libraries like DNNL to ensure deterministic and reliable behavior across different environments.
71 changes: 71 additions & 0 deletions cmake/Modules/FindoneDNN.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# - Try to find oneDNN
#
# The following are set after configuration is done:
# ONEDNN_FOUND : set to true if oneDNN is found.
# ONEDNN_INCLUDE_DIR : path to oneDNN include dir.
# ONEDNN_LIBRARY : list of libraries for oneDNN
#

IF (NOT ONEDNN_FOUND)
SET(ONEDNN_FOUND OFF)

SET(ONEDNN_LIBRARY)
SET(ONEDNN_INCLUDE_DIR)
SET(DNNL_INCLUDES)

# oneDNN is vendored as a git submodule under third_party/.
SET(THIRD_PARTY_DIR "${PROJECT_SOURCE_DIR}/third_party")
SET(ONEDNN_DIR "oneDNN")
SET(ONEDNN_ROOT "${THIRD_PARTY_DIR}/${ONEDNN_DIR}")

# Look for the oneDNN headers in the submodule. The NAMES keyword is
# required to search for more than one header: in the short signature
# `FIND_PATH(<var> name1 name2 ...)` every argument after the first name
# is treated as a search *path*, so dnnl.h was silently ignored as a name.
FIND_PATH(ONEDNN_INCLUDE_DIR NAMES dnnl.hpp dnnl.h PATHS ${ONEDNN_ROOT} PATH_SUFFIXES include NO_DEFAULT_PATH)
IF(NOT ONEDNN_INCLUDE_DIR)
# Headers not found: the submodule has not been checked out yet; fetch it.
FIND_PACKAGE(Git)
IF(NOT Git_FOUND)
MESSAGE(FATAL_ERROR "Can not find Git executable!")
ENDIF()
EXECUTE_PROCESS(
COMMAND ${GIT_EXECUTABLE} submodule update --init ${ONEDNN_DIR}
WORKING_DIRECTORY ${THIRD_PARTY_DIR} COMMAND_ERROR_IS_FATAL ANY)
FIND_PATH(ONEDNN_INCLUDE_DIR NAMES dnnl.hpp dnnl.h PATHS ${ONEDNN_ROOT} PATH_SUFFIXES include NO_DEFAULT_PATH)
ENDIF(NOT ONEDNN_INCLUDE_DIR)

IF(NOT ONEDNN_INCLUDE_DIR)
MESSAGE(FATAL_ERROR "oneDNN source files not found!")
ENDIF(NOT ONEDNN_INCLUDE_DIR)

# Build configuration for the vendored oneDNN.
SET(DNNL_ENABLE_PRIMITIVE_CACHE TRUE CACHE BOOL "oneDNN sycl primitive cache" FORCE)

# Static linking keeps the extension self-contained and matches how
# PyTorch links DNNL (see README rationale).
SET(DNNL_LIBRARY_TYPE STATIC CACHE STRING "" FORCE)

# THREADPOOL instead of the default OMP runtime: the OMP runtime would add
# a runtime dependency on libiomp5.so.
SET(DNNL_CPU_RUNTIME "THREADPOOL" CACHE STRING "oneDNN cpu backend" FORCE)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming this is a copy from onednn makefile. cpu runtime can be cleaned up.

Copy link
Copy Markdown
Collaborator Author

@zufangzhu zufangzhu Aug 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to set this; otherwise oneDNN will default the CPU runtime to OMP, which adds a runtime dependency on libiomp5.so.

# GPU primitives are built for the SYCL runtime; tests/examples are skipped
# since only the library target is consumed.
SET(DNNL_GPU_RUNTIME "SYCL" CACHE STRING "oneDNN gpu backend" FORCE)
SET(DNNL_BUILD_TESTS FALSE CACHE BOOL "build with oneDNN tests" FORCE)
SET(DNNL_BUILD_EXAMPLES FALSE CACHE BOOL "build with oneDNN examples" FORCE)
SET(DNNL_ENABLE_CONCURRENT_EXEC TRUE CACHE BOOL "multi-thread primitive execution" FORCE)
SET(DNNL_EXPERIMENTAL TRUE CACHE BOOL "use one pass for oneDNN BatchNorm" FORCE)

# Bring the submodule into the build; EXCLUDE_FROM_ALL so oneDNN targets
# build only when something links against them.
ADD_SUBDIRECTORY(${ONEDNN_ROOT} oneDNN EXCLUDE_FROM_ALL)
# DNNL_LIBRARY_NAME is defined by oneDNN's own CMake after the
# add_subdirectory() above.
SET(ONEDNN_LIBRARY ${DNNL_LIBRARY_NAME})
IF(NOT TARGET ${ONEDNN_LIBRARY})
MESSAGE(FATAL_ERROR "Failed to include oneDNN target")
ENDIF(NOT TARGET ${ONEDNN_LIBRARY})

# Silence warnings that oneDNN sources are known to trigger under GCC.
IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)
TARGET_COMPILE_OPTIONS(${ONEDNN_LIBRARY} PRIVATE -Wno-uninitialized)
TARGET_COMPILE_OPTIONS(${ONEDNN_LIBRARY} PRIVATE -Wno-strict-overflow)
TARGET_COMPILE_OPTIONS(${ONEDNN_LIBRARY} PRIVATE -Wno-error=strict-overflow)
ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC)

TARGET_COMPILE_OPTIONS(${ONEDNN_LIBRARY} PRIVATE -Wno-tautological-compare)
# Re-export oneDNN's include directories to consumers of this module.
GET_TARGET_PROPERTY(DNNL_INCLUDES ${ONEDNN_LIBRARY} INCLUDE_DIRECTORIES)
# The SYCL GPU runtime needs the Level Zero loader at link time.
TARGET_LINK_LIBRARIES(${ONEDNN_LIBRARY} PRIVATE ze_loader)
list(APPEND ONEDNN_INCLUDE_DIR ${DNNL_INCLUDES})

# Upper level targets should not load header files from oneDNN's third party.
LIST(FILTER ONEDNN_INCLUDE_DIR EXCLUDE REGEX
".*third_party/oneDNN/third_party.*")

SET(ONEDNN_FOUND ON)
MESSAGE(STATUS "Found oneDNN: TRUE")

ENDIF(NOT ONEDNN_FOUND)
54 changes: 54 additions & 0 deletions csrc/xpu/onednn/fp8_gemm_w8a16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#include <vector>
#include "fp8_gemm_w8a16.h"

// w8a16 FP8 GEMM entry point. A is a 2-D [m, k] or 3-D [b, m, k]
// activation; B is a 2-D fp8 weight of shape [k, n] (or [n, k] when
// trans_B is true). Returns A @ B with B dequantized via B_scale,
// preserving A's (possibly non-contiguous) batch layout in the output.
torch::Tensor fp8_gemm_w8a16(const torch::Tensor& A, const torch::Tensor& B,
                             bool trans_B,
                             const std::optional<torch::Tensor>& B_scale_,
                             const std::optional<torch::Tensor>& bias_) {
  TORCH_CHECK(A.dim() == 2 || A.dim() == 3,
              "fp8_gemm_w8a16 only support 2D and 3D inputs!\n");
  TORCH_CHECK(B.dim() == 2, "fp8_gemm_w8a16 only support 2D weights!\n");

  // Output keeps A's leading dims; the last dim comes from B:
  // src{m, k}, wei{k, n}, bias{n}, dst{m, n}
  // src{b, m, k}, wei{k, n}, bias{n}, dst{b, m, n}
  std::vector<int64_t> result_shape(A.sizes().begin(), A.sizes().end());
  result_shape.back() = trans_B ? B.size(0) : B.size(1);

  // Mirror A's stride layout in the result with k replaced by n, so e.g.
  // input shape [m, b, k] stride [k, m * k, 1] keeps its layout.
  // Assumes every non-innermost stride of A is a multiple of k.
  // Loop index is size_t: the old `int i < res_stride.size() - 1` compared
  // signed against unsigned.
  const auto k = A.size(A.dim() - 1);
  const auto n = result_shape.back();
  auto res_stride = A.strides().vec();
  for (size_t i = 0; i + 1 < res_stride.size(); ++i) {
    res_stride[i] = res_stride[i] / k * n;
  }

  torch::Tensor result =
      at::empty_strided(result_shape, res_stride, A.options());

  // Detect "nt" storage: the k dimension of B is the contiguous one
  // (stride 1), whichever logical position it occupies.
  bool is_nt = true;
  if (trans_B) {
    is_nt = B.strides()[B.dim() - 1] == 1;
  } else {
    is_nt = B.strides()[B.dim() - 2] == 1;
  }

  // Default to a scale of 1.0 (in A's dtype) when no weight scale given.
  torch::Tensor B_scale = B_scale_.has_value()
                              ? B_scale_.value()
                              : at::ones({1}, B.options().dtype(A.dtype()));

  oneDNN::dnnl_matmul_w8a16_fp8(result, A, B, is_nt, bias_, B_scale);
  return result;
}
133 changes: 133 additions & 0 deletions csrc/xpu/onednn/fp8_gemm_w8a16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#pragma once

#include <cstdint>
#include <functional>
#include <numeric>
#include <optional>
#include <utility>
#include <vector>

#include <c10/xpu/XPUStream.h>
#include <dnnl.hpp>
#include <torch/torch.h>

#include "onednn_ext.h"

namespace oneDNN {

using bias_type_t = at::native::onednn::bias_type_t;
using trans_type_t = at::native::onednn::trans_type_t;
using GpuStreamManager = at::native::onednn::GpuStreamManager;
using GpuEngineManager = at::native::onednn::GpuEngineManager;

// Executes result = mat1 x mat2 (+ bias) through a cached oneDNN matmul
// primitive. mat1 is an f16/bf16 activation; mat2 is an fp8 (e5m2 or
// e4m3fn) weight whose scale tensor m2_sc is bound as
// DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS. `trans_b` marks mat2 as stored
// transposed ({n, k} instead of {k, n}); `group_size` is forwarded to the
// primitive factory (0 presumably selects per-tensor scaling — see
// matmul_primitive_create_and_cache). `result` must be pre-allocated by
// the caller with the correct shape and strides.
static inline void dnnl_matmul_w8a16_fp8(
    torch::Tensor& result, const torch::Tensor& mat1, const torch::Tensor& mat2,
    bool trans_b, const std::optional<torch::Tensor>& bias,
    const torch::Tensor& m2_sc, const int64_t group_size = 0) {
  TORCH_CHECK(mat2.scalar_type() == at::ScalarType::Float8_e5m2 ||
                  mat2.scalar_type() == at::ScalarType::Float8_e4m3fn,
              "weight must be f8_e5m2 or f8_e4m3fn for fp8 matmul");
  auto src_sz = mat1.sizes();
  auto o_sz = result.sizes();

  // Collapse all leading dims of the source into the GEMM "m" dimension.
  // The init value must be int64_t: with a plain `1`, std::reduce
  // accumulates in int and silently truncates large batch*m products.
  const int64_t m = std::reduce(src_sz.begin(), src_sz.end() - 1, int64_t{1},
                                std::multiplies<int64_t>());
  const int64_t n = o_sz.back();  // presume channel-last format
  const int64_t k = src_sz.back();

  // Map (activation dtype, weight dtype) onto the joint-dtype enum that
  // forms part of the primitive cache key.
  joint_dtypes_t jd;
  auto in_dtype = mat1.scalar_type();
  auto wei_dtype = mat2.scalar_type();
  if (in_dtype == at::ScalarType::Half) {
    jd = wei_dtype == at::ScalarType::Float8_e5m2 ? joint_dtypes_t::f16_f8_e5m2
                                                  : joint_dtypes_t::f16_f8_e4m3;
  } else if (in_dtype == at::ScalarType::BFloat16) {
    jd = wei_dtype == at::ScalarType::Float8_e5m2
             ? joint_dtypes_t::bf16_f8_e5m2
             : joint_dtypes_t::bf16_f8_e4m3;
  } else {
    TORCH_INTERNAL_ASSERT(
        false, "Unsupported data type for fp8 matmul: ", mat1.scalar_type());
  }

  // Classify the bias layout (scalar / {m,n} / per-n / per-m / none) from
  // its element count and trailing dimension.
  bias_type_t b_type;
  if (bias.has_value() && bias.value().defined()) {
    auto& b = bias.value();
    const auto n_elems = b.numel();  // was misspelled `nuelm`
    if (n_elems == 1) {
      b_type = bias_type_t::scalar;
    } else if (n_elems == m * n) {
      b_type = bias_type_t::mn;
    } else if (b.size(b.dim() - 1) == n && n_elems == n) {
      b_type = bias_type_t::n;
    } else if (b.size(b.dim() - 1) == 1 && n_elems == m) {
      b_type = bias_type_t::m;
    } else if (n_elems == 0) {
      b_type = bias_type_t::none;
    } else {
      TORCH_CHECK(0, "unsupported bias dim in matmul ...", b.sizes());
    }
  } else {
    b_type = bias_type_t::none;
  }

  // "nn" = both operands plain; "nt" = mat2 stored transposed.
  const trans_type_t tt = trans_b ? trans_type_t::nt : trans_type_t::nn;

  // Derive leading dimensions (lda/ldb/ldc) from tensor strides. For 3-D
  // inputs the batch dim may be either of the two outer dims; the one with
  // the larger stride is the leading dimension.
  auto mat1_strides = mat1.strides();
  int64_t leading_dim = -1;
  if (mat1.dim() == 2) {
    leading_dim = 0;
  } else if (mat1.dim() == 3) {
    leading_dim = mat1_strides[0] < mat1_strides[1] ? 0 : 1;
  } else {
    TORCH_CHECK(false,
                "Unsupported input dimension for fp8 matmul: ", mat1.dim());
  }
  int64_t lda = mat1_strides[leading_dim];
  int64_t ldb = mat2.strides()[mat2.dim() - 1] == 1
                    ? mat2.strides()[mat2.dim() - 2]
                    : mat2.strides()[mat2.dim() - 1];
  int64_t ldc = result.strides()[leading_dim];

  // User-managed scratchpad: allocated below from the PyTorch allocator
  // instead of letting oneDNN allocate internally.
  auto f_attr = [&](dnnl::primitive_attr& pattr) {
    pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
  };

  int arg_off = 0;

  // ************************************************************
  // get device, engine, stream
  const int dev_id = c10::xpu::getCurrentXPUStream().device_index();
  at::Device curDevice = at::Device(at::kXPU, dev_id);
  auto engine = GpuEngineManager::Instance().get_engine(curDevice);

  auto& matmul_ext = matmul_primitive_create_and_cache(
      jd, tt, b_type, m, n, k, lda, ldb, ldc, dev_id, f_attr, group_size);

  // Bind the weight scales; the oneDNN memory object is created lazily.
  matmul_ext.set_attribute(arg_off++, DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS,
                           m2_sc.data_ptr(), [&]() {
                             return at::native::onednn::make_onednn_memory(
                                 get_onednn_md(m2_sc), engine,
                                 m2_sc.data_ptr());
                           });

  std::vector<std::pair<int, void*>> arg_handles;
  arg_handles.reserve(8);

  arg_handles.emplace_back(DNNL_ARG_SRC, mat1.data_ptr());
  arg_handles.emplace_back(DNNL_ARG_WEIGHTS, mat2.data_ptr());
  arg_handles.emplace_back(DNNL_ARG_DST, result.data_ptr());
  if (b_type != bias_type_t::none) {
    arg_handles.emplace_back(DNNL_ARG_BIAS, bias.value().data_ptr());
  }

  const int64_t scratchpad_size = matmul_ext.get_scratchpad_size();
  torch::Tensor scratchpad_tensor =
      at::empty({scratchpad_size}, mat1.options().dtype(at::kByte));
  arg_handles.emplace_back(DNNL_ARG_SCRATCHPAD, scratchpad_tensor.data_ptr());

  auto& strm = GpuStreamManager::Instance().get_stream();
  // The returned event is intentionally discarded (it was previously bound
  // to an unused variable): execution is ordered on the XPU stream.
  matmul_ext.execute(strm, engine, std::move(arg_handles), arg_off);
}
}  // namespace oneDNN
Loading