
Commit 042ff32

Copilot and titaiwangms committed
Fix code review issues: use v_head_size and parameters.softcap
Co-authored-by: titaiwangms <18010845+titaiwangms@users.noreply.github.com>
1 parent 53c333f commit 042ff32


1 file changed: +2 −2 lines changed


onnxruntime/core/providers/cuda/llm/attention.cc

Lines changed: 2 additions & 2 deletions
@@ -205,7 +205,7 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
   gqa_parameters.num_heads = parameters.q_num_heads;
   gqa_parameters.head_size = parameters.head_size;
   gqa_parameters.v_head_size = parameters.v_head_size;
-  gqa_parameters.kv_hidden_size = parameters.kv_num_heads * parameters.head_size;
+  gqa_parameters.kv_hidden_size = parameters.kv_num_heads * parameters.v_head_size;
   gqa_parameters.kv_num_heads = parameters.kv_num_heads;
   gqa_parameters.rotary_dim = 0;  // New Attention op doesn't use rotary embeddings directly
   gqa_parameters.is_unidirectional = parameters.is_causal;
@@ -218,7 +218,7 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
   gqa_parameters.rotary_interleaved = false;
   gqa_parameters.use_smooth_softmax = false;
   gqa_parameters.scale = parameters.scale;
-  gqa_parameters.softcap = 0.0f;
+  gqa_parameters.softcap = parameters.softcap;
   gqa_parameters.mask_type = onnxruntime::contrib::AttentionMaskType::MASK_NONE;
   gqa_parameters.qkv_format = contribop_parameters.qkv_format;
   gqa_parameters.past_kv_format = onnxruntime::contrib::AttentionQkvFormat::Q_K_V_BNSH;
