Skip to content

Commit 03eccdc

Browse files
Merge pull request #470 from janhq/update-dev-from-master-2026-03-30-00-54
Sync master with upstream release b8580
2 parents 6152bd1 + 7c20367 commit 03eccdc

10 files changed

Lines changed: 454 additions & 79 deletions

File tree

.devops/intel.Dockerfile

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,23 @@ RUN mkdir -p /app/full \
3333

3434
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
3535

36+
ARG IGC_VERSION=v2.30.1
37+
ARG IGC_VERSION_FULL=2_2.30.1+20950
38+
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
39+
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
40+
ARG IGDGMM_VERSION=22.9.0
41+
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
42+
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
43+
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
44+
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
45+
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
46+
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
47+
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
48+
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
49+
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
50+
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
51+
&& dpkg --install *.deb
52+
3653
RUN apt-get update \
3754
&& apt-get install -y libgomp1 curl\
3855
&& apt autoremove -y \

examples/sycl/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA
2020
#cmake --build . --config Release --target llama-bench
2121

2222
#build all binaries
23-
cmake --build . --config Release -j -v
23+
cmake --build . --config Release -j$((($(nproc)+1)/2)) -v

examples/sycl/run-llama2.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ if [ $# -gt 0 ]; then
2323
GGML_SYCL_DEVICE=$1
2424
echo "use $GGML_SYCL_DEVICE as main GPU"
2525
#use single GPU only
26-
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}
26+
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}
2727

2828
else
2929
#use multiple GPUs with same max compute units
30-
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
30+
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
3131
fi

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2343,7 +2343,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
23432343
static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
23442344
if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
23452345
if (ggml_is_quantized(src0->type)) {
2346-
if (ne2 <= MMVQ_MMID_MAX_BATCH_SIZE) {
2346+
const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc);
2347+
if (ne2 <= mmvq_mmid_max) {
23472348
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
23482349
return;
23492350
}
@@ -2946,14 +2947,18 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
29462947
}
29472948

29482949
// [TAG_MUL_MAT_ID_CUDA_GRAPHS]
2949-
if (node->op == GGML_OP_MUL_MAT_ID && (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > MMVQ_MMID_MAX_BATCH_SIZE)) {
2950-
// under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
2951-
// TODO: figure out a way to enable for larger batch sizes, without hurting performance
2952-
// ref: https://github.com/ggml-org/llama.cpp/pull/18958
2953-
use_cuda_graph = false;
2950+
if (node->op == GGML_OP_MUL_MAT_ID) {
2951+
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
2952+
const int mmvq_mmid_max = get_mmvq_mmid_max_batch(node->src[0]->type, cc);
2953+
if (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > mmvq_mmid_max) {
2954+
// under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
2955+
// TODO: figure out a way to enable for larger batch sizes, without hurting performance
2956+
// ref: https://github.com/ggml-org/llama.cpp/pull/18958
2957+
use_cuda_graph = false;
29542958
#ifndef NDEBUG
2955-
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
2959+
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
29562960
#endif
2961+
}
29572962
}
29582963

29592964
if (!use_cuda_graph) {

0 commit comments

Comments
 (0)